{'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': 'simple', 'log_file': 'chkpt/en-ja/train.log', 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': 'wmt23', 'azureml_logging': False, 'seed': 0, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': False, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 8, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppress_crashes': False, 'use_plasma_view': False, 'plasma_path': '/tmp/plasma'}, 'common_eval': {'_name': None, 'path': None, 'post_process': None, 'quiet': False, 'model_overrides': '{}', 'results_path': None}, 'distributed_training': {'_name': None, 'distributed_world_size': 8, 'distributed_num_procs': 8, 'distributed_rank': 0, 'distributed_backend': 'nccl', 'distributed_init_method': 'tcp://localhost:33095', 'distributed_port': 33095, 'device_id': 0, 'distributed_no_spawn': False, 'ddp_backend': 'pytorch_ddp', 'ddp_comm_hook': 'none', 'bucket_cap_mb': 25, 'fix_batches_to_gpus': False, 'find_unused_parameters': False, 'gradient_as_bucket_view': False, 'fast_stat_sync': False, 'heartbeat_timeout': -1, 'broadcast_buffers': False, 'slowmo_momentum': None, 'slowmo_base_algorithm': 'localsgd', 'localsgd_frequency': 3, 'nprocs_per_node': 8, 'pipeline_model_parallel': False, 'pipeline_balance': None, 'pipeline_devices': None, 'pipeline_chunks': 0, 'pipeline_encoder_balance': None, 'pipeline_encoder_devices': None, 'pipeline_decoder_balance': None, 'pipeline_decoder_devices': None, 'pipeline_checkpoint': 'never', 'zero_sharding': 'none', 'fp16': True, 'memory_efficient_fp16': False, 'tpu': False, 'no_reshard_after_forward': False, 'fp32_reduce_scatter': False, 'cpu_offload': False, 'use_sharded_state': False, 'not_fsdp_flatten_parameters': False}, 'dataset': {'_name': None, 'num_workers': 0, 'skip_invalid_size_inputs_valid_test': False, 'max_tokens': 16384, 'batch_size': None, 'required_batch_size_multiple': 8, 'required_seq_len_multiple': 1, 'dataset_impl': None, 'data_buffer_size': 10, 'train_subset': 'train', 'valid_subset': 'valid', 'combine_valid_subsets': None, 'ignore_unused_valid_subsets': False, 'validate_interval': 100000, 'validate_interval_updates': 0, 'validate_after_updates': 0, 'fixed_validation_seed': None, 'disable_validation': False, 'max_tokens_valid': 16384, 'batch_size_valid': None, 'max_valid_steps': None, 'curriculum': 0, 'gen_subset': 'test', 'num_shards': 1, 'shard_id': 0, 'grouped_shuffling': False, 'update_epoch_batch_itr': False, 'update_ordered_indices_seed': False}, 'optimization': {'_name': None, 'max_epoch': 0, 'max_update': 60000, 'stop_time_hours': 0.0, 'clip_norm': 1.0, 'sentence_avg': False, 'update_freq': [4], 'lr': [0.001], 'stop_min_lr': -1.0, 'use_bmuf': False, 'skip_remainder_batch': False, 'debug_param_names': False}, 'checkpoint': {'_name': None, 'save_dir': 'chkpt/en-ja', 'restore_file': 'checkpoint_last.pt', 'continue_once': None, 'finetune_from_model': None, 'reset_dataloader': False, 'reset_lr_scheduler': False, 'reset_meters': False, 'reset_optimizer': False, 'optimizer_overrides': '{}', 'save_interval': 100000, 'save_interval_updates': 1000, 'keep_interval_updates': 10, 'keep_interval_updates_pattern': -1, 'keep_last_epochs': -1, 'keep_best_checkpoints': -1, 'no_save': False, 'no_epoch_checkpoints': True, 'no_last_checkpoints': False, 'no_save_optimizer_state': False, 'best_checkpoint_metric': 'loss', 'maximize_best_checkpoint_metric': False, 'patience': -1, 'checkpoint_suffix': '', 'checkpoint_shard_count': 1, 'load_checkpoint_on_all_dp_ranks': False, 'write_checkpoints_asynchronously': False, 'model_parallel_size': 1}, 'bmuf': {'_name': None, 'block_lr': 1.0, 'block_momentum': 0.875, 'global_sync_iter': 50, 'warmup_iterations': 500, 'use_nbm': False, 'average_sync': False, 'distributed_world_size': 8}, 'generation': {'_name': None, 'beam': 5, 'beam_mt': 0, 'nbest': 1, 'max_len_a': 0.0, 'max_len_b': 200, 'max_len_a_mt': 0.0, 'max_len_b_mt': 200, 'min_len': 1, 'match_source_len': False, 'unnormalized': False, 'no_early_stop': False, 'no_beamable_mm': False, 'lenpen': 1.0, 'lenpen_mt': 1.0, 'unkpen': 0.0, 'replace_unk': None, 'sacrebleu': False, 'score_reference': False, 'prefix_size': 0, 'no_repeat_ngram_size': 0, 'sampling': False, 'sampling_topk': -1, 'sampling_topp': -1.0, 'constraints': None, 'temperature': 1.0, 'diverse_beam_groups': -1, 'diverse_beam_strength': 0.5, 'diversity_rate': -1.0, 'print_alignment': None, 'print_step': False, 'lm_path': None, 'lm_weight': 0.0, 'iter_decode_eos_penalty': 0.0, 'iter_decode_max_iter': 10, 'iter_decode_force_max_iter': False, 'iter_decode_with_beam': 1, 'iter_decode_with_external_reranker': False, 'retain_iter_history': False, 'retain_dropout': False, 'retain_dropout_modules': None, 'decoding_format': None, 'no_seed_provided': False, 'eos_token': None}, 'eval_lm': {'_name': None, 'output_word_probs': False, 'output_word_stats': False, 'context_window': 0, 'softmax_batch': 9223372036854775807}, 'interactive': {'_name': None, 'buffer_size': 0, 'input': '-'}, 'model': Namespace(no_progress_bar=False, log_interval=100, log_format='simple', log_file='chkpt/en-ja/train.log', aim_repo=None, aim_run_hash=None, tensorboard_logdir=None, wandb_project='wmt23', azureml_logging=False, seed=0, cpu=False, tpu=False, bf16=False, memory_efficient_bf16=False, fp16=True, memory_efficient_fp16=False, fp16_no_flatten_grads=False, fp16_init_scale=8, fp16_scale_window=None, fp16_scale_tolerance=0.0, on_cpu_convert_precision=False, min_loss_scale=0.0001, threshold_loss_scale=None, amp=False, amp_batch_retries=2, amp_init_scale=128, amp_scale_window=None, user_dir=None, empty_cache_freq=0, all_gather_list_size=16384, model_parallel_size=1, quantization_config_path=None, profile=False, reset_logging=False, suppress_crashes=False, use_plasma_view=False, plasma_path='/tmp/plasma', criterion='label_smoothed_cross_entropy', tokenizer=None, bpe=None, optimizer='adam', lr_scheduler='inverse_sqrt', scoring='bleu', task='translation', num_workers=0, skip_invalid_size_inputs_valid_test=False, max_tokens=16384, batch_size=None, required_batch_size_multiple=8, required_seq_len_multiple=1, dataset_impl=None, data_buffer_size=10, train_subset='train', valid_subset='valid', combine_valid_subsets=None, ignore_unused_valid_subsets=False, validate_interval=100000, validate_interval_updates=0, validate_after_updates=0, fixed_validation_seed=None, disable_validation=False, max_tokens_valid=16384, batch_size_valid=None, max_valid_steps=None, curriculum=0, gen_subset='test', num_shards=1, shard_id=0, grouped_shuffling=False, update_epoch_batch_itr=False, update_ordered_indices_seed=False, distributed_world_size=8, distributed_num_procs=8, distributed_rank=0, distributed_backend='nccl', distributed_init_method=None, distributed_port=-1, device_id=0, distributed_no_spawn=False, ddp_backend='pytorch_ddp', ddp_comm_hook='none', bucket_cap_mb=25, fix_batches_to_gpus=False, find_unused_parameters=False, gradient_as_bucket_view=False, fast_stat_sync=False, heartbeat_timeout=-1, broadcast_buffers=False, slowmo_momentum=None, slowmo_base_algorithm='localsgd', localsgd_frequency=3, nprocs_per_node=8, pipeline_model_parallel=False, pipeline_balance=None, pipeline_devices=None, pipeline_chunks=0, pipeline_encoder_balance=None, pipeline_encoder_devices=None, pipeline_decoder_balance=None, pipeline_decoder_devices=None, pipeline_checkpoint='never', zero_sharding='none', no_reshard_after_forward=False, fp32_reduce_scatter=False, cpu_offload=False, use_sharded_state=False, not_fsdp_flatten_parameters=False, arch='transformer_vaswani_wmt_en_de_big', max_epoch=0, max_update=60000, stop_time_hours=0, clip_norm=1.0, sentence_avg=False, update_freq=[4], lr=[0.001], stop_min_lr=-1.0, use_bmuf=False, skip_remainder_batch=False, debug_param_names=False, save_dir='chkpt/en-ja', restore_file='checkpoint_last.pt', continue_once=None, finetune_from_model=None, reset_dataloader=False, reset_lr_scheduler=False, reset_meters=False, reset_optimizer=False, optimizer_overrides='{}', save_interval=100000, save_interval_updates=1000, keep_interval_updates=10, keep_interval_updates_pattern=-1, keep_last_epochs=-1, keep_best_checkpoints=-1, no_save=False, no_epoch_checkpoints=True, no_last_checkpoints=False, no_save_optimizer_state=False, best_checkpoint_metric='loss', maximize_best_checkpoint_metric=False, patience=-1, checkpoint_suffix='', checkpoint_shard_count=1, load_checkpoint_on_all_dp_ranks=False, write_checkpoints_asynchronously=False, store_ema=False, ema_decay=0.9999, ema_start_update=0, ema_seed_model=None, ema_update_freq=1, ema_fp32=False, data='binarized/en-ja/', source_lang=None, target_lang=None, load_alignments=False, left_pad_source=True, left_pad_target=False, upsample_primary=-1, truncate_source=False, num_batch_buckets=0, eval_bleu=False, eval_bleu_args='{}', eval_bleu_detok='space', eval_bleu_detok_args='{}', eval_tokenized_bleu=False, eval_bleu_remove_bpe=None, eval_bleu_print_samples=False, label_smoothing=0.1, report_accuracy=False, ignore_prefix_size=0, adam_betas='(0.9, 0.98)', adam_eps=1e-08, weight_decay=0.0, use_old_adam=False, fp16_adam_stats=False, warmup_updates=4000, warmup_init_lr=-1, pad=1, eos=2, unk=3, encoder_ffn_embed_dim=8192, decoder_ffn_embed_dim=8192, dropout=0.1, share_decoder_input_output_embed=True, no_seed_provided=False, encoder_embed_dim=1024, encoder_attention_heads=16, encoder_normalize_before=False, decoder_embed_dim=1024, decoder_attention_heads=16, encoder_embed_path=None, encoder_layers=6, encoder_learned_pos=False, decoder_embed_path=None, decoder_layers=6, decoder_normalize_before=False, decoder_learned_pos=False, attention_dropout=0.0, activation_dropout=0.0, activation_fn='relu', adaptive_softmax_cutoff=None, adaptive_softmax_dropout=0, share_all_embeddings=False, merge_src_tgt_embed=False, no_token_positional_embeddings=False, adaptive_input=False, no_cross_attention=False, cross_self_attention=False, decoder_output_dim=1024, decoder_input_dim=1024, no_scale_embedding=False, layernorm_embedding=False, tie_adaptive_weights=False, checkpoint_activations=False, offload_activations=False, encoder_layers_to_keep=None, decoder_layers_to_keep=None, encoder_layerdrop=0, decoder_layerdrop=0, quant_noise_pq=0, quant_noise_pq_block_size=8, quant_noise_scalar=0, _name='transformer_vaswani_wmt_en_de_big'), 'task': {'_name': 'translation', 'data': 'binarized/en-ja/', 'source_lang': None, 'target_lang': None, 'load_alignments': False, 'left_pad_source': True, 'left_pad_target': False, 'max_source_positions': 1024, 'max_target_positions': 1024, 'upsample_primary': -1, 'truncate_source': False, 'num_batch_buckets': 0, 'train_subset': 'train', 'dataset_impl': None, 'required_seq_len_multiple': 1, 'eval_bleu': False, 'eval_bleu_args': '{}', 'eval_bleu_detok': 'space', 'eval_bleu_detok_args': '{}', 'eval_tokenized_bleu': False, 'eval_bleu_remove_bpe': None, 'eval_bleu_print_samples': False}, 'criterion': {'_name': 'label_smoothed_cross_entropy', 'label_smoothing': 0.1, 'report_accuracy': False, 'ignore_prefix_size': 0, 'sentence_avg': False}, 'optimizer': {'_name': 'adam', 'adam_betas': '(0.9, 0.98)', 'adam_eps': 1e-08, 'weight_decay': 0.0, 'use_old_adam': False, 'fp16_adam_stats': False, 'tpu': False, 'lr': [0.001]}, 'lr_scheduler': {'_name': 'inverse_sqrt', 'warmup_updates': 4000, 'warmup_init_lr': -1.0, 'lr': [0.001]}, 'scoring': {'_name': 'bleu', 'pad': 1, 'eos': 2, 'unk': 3}, 'bpe': None, 'tokenizer': None, 'ema': {'_name': None, 'store_ema': False, 'ema_decay': 0.9999, 'ema_start_update': 0, 'ema_seed_model': None, 'ema_update_freq': 1, 'ema_fp32': False}} TransformerModel( (encoder): TransformerEncoderBase( (dropout_module): FairseqDropout() (embed_tokens): Embedding(16000, 1024, padding_idx=1) (embed_positions): SinusoidalPositionalEmbedding() (layers): ModuleList( (0-5): 6 x TransformerEncoderLayerBase( (self_attn): MultiheadAttention( (dropout_module): FairseqDropout() (k_proj): Linear(in_features=1024, out_features=1024, bias=True) (v_proj): Linear(in_features=1024, out_features=1024, bias=True) (q_proj): Linear(in_features=1024, out_features=1024, bias=True) (out_proj): Linear(in_features=1024, out_features=1024, bias=True) ) (self_attn_layer_norm): FusedLayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) (dropout_module): FairseqDropout() (activation_dropout_module): FairseqDropout() (fc1): Linear(in_features=1024, out_features=8192, bias=True) (fc2): Linear(in_features=8192, out_features=1024, bias=True) (final_layer_norm): FusedLayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) ) ) ) (decoder): TransformerDecoderBase( (dropout_module): FairseqDropout() (embed_tokens): Embedding(32000, 1024, padding_idx=1) (embed_positions): SinusoidalPositionalEmbedding() (layers): ModuleList( (0-5): 6 x TransformerDecoderLayerBase( (dropout_module): FairseqDropout() (self_attn): MultiheadAttention( (dropout_module): FairseqDropout() (k_proj): Linear(in_features=1024, out_features=1024, bias=True) (v_proj): Linear(in_features=1024, out_features=1024, bias=True) (q_proj): Linear(in_features=1024, out_features=1024, bias=True) (out_proj): Linear(in_features=1024, out_features=1024, bias=True) ) (activation_dropout_module): FairseqDropout() (self_attn_layer_norm): FusedLayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) (encoder_attn): MultiheadAttention( (dropout_module): FairseqDropout() (k_proj): Linear(in_features=1024, out_features=1024, bias=True) (v_proj): Linear(in_features=1024, out_features=1024, bias=True) (q_proj): Linear(in_features=1024, out_features=1024, bias=True) (out_proj): Linear(in_features=1024, out_features=1024, bias=True) ) (encoder_attn_layer_norm): FusedLayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) (fc1): Linear(in_features=1024, out_features=8192, bias=True) (fc2): Linear(in_features=8192, out_features=1024, bias=True) (final_layer_norm): FusedLayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) ) ) (output_projection): Linear(in_features=1024, out_features=32000, bias=False) ) ) task: TranslationTask model: TransformerModel criterion: LabelSmoothedCrossEntropyCriterion num. shared model params: 326,221,824 (num. trained: 326,221,824) num. expert model params: 0 (num. trained: 0) training on 8 devices (GPUs/TPUs) max tokens per device = 16384 and max sentences per device = None begin dry-run validation on "valid" subset Start iterating over samples epoch 001: 102 / 1689 loss=13.05, nll_loss=12.736, ppl=6820.16, wps=452324, ups=1.04, wpb=434942, bsz=16577.2, num_updates=100, lr=2.5e-05, gnorm=3.521, clip=95, loss_scale=2, train_wall=100, gb_free=21.7, wall=118 epoch 001: 202 / 1689 loss=11.627, nll_loss=11.122, ppl=2228.38, wps=460671, ups=1.06, wpb=435200, bsz=16988, num_updates=200, lr=5e-05, gnorm=2.021, clip=98, loss_scale=2, train_wall=93, gb_free=19.7, wall=213 epoch 001: 302 / 1689 loss=11.032, nll_loss=10.418, ppl=1368.51, wps=457881, ups=1.05, wpb=434886, bsz=16709.5, num_updates=300, lr=7.5e-05, gnorm=1.735, clip=98, loss_scale=2, train_wall=94, gb_free=18.9, wall=308 epoch 001: 402 / 1689 loss=10.31, nll_loss=9.562, ppl=755.74, wps=459108, ups=1.06, wpb=432640, bsz=16420.4, num_updates=400, lr=0.0001, gnorm=1.699, clip=99, loss_scale=2, train_wall=93, gb_free=16.6, wall=402 epoch 001: 502 / 1689 loss=9.734, nll_loss=8.876, ppl=469.8, wps=454833, ups=1.05, wpb=433247, bsz=16495.6, num_updates=500, lr=0.000125, gnorm=1.667, clip=99, loss_scale=2, train_wall=94, gb_free=19.3, wall=497 epoch 001: 602 / 1689 loss=9.253, nll_loss=8.308, ppl=316.96, wps=458754, ups=1.06, wpb=433091, bsz=16520.5, num_updates=600, lr=0.00015, gnorm=1.566, clip=99, loss_scale=4, train_wall=93, gb_free=18.7, wall=592 epoch 001: 702 / 1689 loss=8.825, nll_loss=7.804, ppl=223.53, wps=459774, ups=1.06, wpb=434503, bsz=16405.3, num_updates=700, lr=0.000175, gnorm=1.424, clip=99, loss_scale=4, train_wall=93, gb_free=19.4, wall=686 epoch 001: 802 / 1689 loss=8.42, nll_loss=7.332, ppl=161.07, wps=457694, ups=1.05, wpb=434941, bsz=16333.4, num_updates=800, lr=0.0002, gnorm=1.261, clip=91, loss_scale=4, train_wall=94, gb_free=21.3, wall=781 epoch 001: 902 / 1689 loss=8.039, nll_loss=6.888, ppl=118.45, wps=455480, ups=1.05, wpb=434996, bsz=16571.2, num_updates=900, lr=0.000225, gnorm=1.201, clip=91, loss_scale=4, train_wall=94, gb_free=18.8, wall=877 epoch 001: 1002 / 1689 loss=7.694, nll_loss=6.488, ppl=89.76, wps=451446, ups=1.05, wpb=431938, bsz=16755.5, num_updates=1000, lr=0.00025, gnorm=1.159, clip=78, loss_scale=4, train_wall=94, gb_free=19.7, wall=972 begin validation on "valid" subset epoch 001 | valid on 'valid' subset | loss 7.442 | nll_loss 6.174 | ppl 72.21 | wps 0 | wpb 42662 | bsz 2032 | num_updates 1000 epoch 001: 1102 / 1689 loss=7.326, nll_loss=6.062, ppl=66.82, wps=386358, ups=0.9, wpb=430204, bsz=16536.8, num_updates=1100, lr=0.000275, gnorm=1.103, clip=69, loss_scale=8, train_wall=93, gb_free=18.6, wall=1084 epoch 001: 1202 / 1689 loss=6.997, nll_loss=5.682, ppl=51.35, wps=458097, ups=1.06, wpb=433198, bsz=16274.6, num_updates=1200, lr=0.0003, gnorm=1.053, clip=63, loss_scale=8, train_wall=93, gb_free=20.5, wall=1178 epoch 001: 1302 / 1689 loss=6.656, nll_loss=5.291, ppl=39.16, wps=458212, ups=1.06, wpb=432602, bsz=16372.2, num_updates=1300, lr=0.000325, gnorm=0.991, clip=41, loss_scale=8, train_wall=93, gb_free=20.6, wall=1273 epoch 001: 1402 / 1689 loss=6.39, nll_loss=4.989, ppl=31.77, wps=457072, ups=1.05, wpb=434746, bsz=16342.9, num_updates=1400, lr=0.00035, gnorm=0.875, clip=22, loss_scale=8, train_wall=94, gb_free=19.1, wall=1368 epoch 001: 1502 / 1689 loss=6.168, nll_loss=4.738, ppl=26.68, wps=456810, ups=1.05, wpb=435243, bsz=16512.3, num_updates=1500, lr=0.000375, gnorm=0.807, clip=13, loss_scale=8, train_wall=93, gb_free=18.6, wall=1463 epoch 001: 1602 / 1689 loss=5.985, nll_loss=4.533, ppl=23.15, wps=458294, ups=1.06, wpb=433941, bsz=16442.2, num_updates=1600, lr=0.0004, gnorm=0.692, clip=11, loss_scale=16, train_wall=93, gb_free=19.2, wall=1558 end of epoch 1 (average epoch stats below) epoch 001 | loss 8.458 | nll_loss 7.395 | ppl 168.26 | wps 451711 | ups 1.04 | wpb 433513 | bsz 16502.8 | num_updates 1686 | lr 0.0004215 | gnorm 1.387 | clip 69.6 | loss_scale 8 | train_wall 1583 | gb_free 21.1 | wall 1640 Start iterating over samples epoch 002: 14 / 1689 loss=5.856, nll_loss=4.388, ppl=20.94, wps=443599, ups=1.03, wpb=429086, bsz=16331, num_updates=1700, lr=0.000425, gnorm=0.711, clip=9, loss_scale=8, train_wall=95, gb_free=20, wall=1655 epoch 002: 14 / 1689 loss=5.856, nll_loss=4.388, ppl=20.94, wps=443599, ups=1.03, wpb=429086, bsz=16331, num_updates=1700, lr=0.000425, gnorm=0.711, clip=9, loss_scale=8, train_wall=95, gb_free=20, wall=1655 epoch 002: 114 / 1689 loss=5.703, nll_loss=4.217, ppl=18.6, wps=453302, ups=1.04, wpb=433815, bsz=16765.6, num_updates=1800, lr=0.00045, gnorm=0.583, clip=1, loss_scale=8, train_wall=94, gb_free=19.9, wall=1750 epoch 002: 114 / 1689 loss=5.703, nll_loss=4.217, ppl=18.6, wps=453302, ups=1.04, wpb=433815, bsz=16765.6, num_updates=1800, lr=0.00045, gnorm=0.583, clip=1, loss_scale=8, train_wall=94, gb_free=19.9, wall=1750 epoch 002: 215 / 1689 loss=5.632, nll_loss=4.14, ppl=17.63, wps=448523, ups=1.03, wpb=434143, bsz=16496.9, num_updates=1900, lr=0.000475, gnorm=0.61, clip=7, loss_scale=4, train_wall=95, gb_free=20.3, wall=1847 epoch 002: 215 / 1689 loss=5.632, nll_loss=4.14, ppl=17.63, wps=448523, ups=1.03, wpb=434143, bsz=16496.9, num_updates=1900, lr=0.000475, gnorm=0.61, clip=7, loss_scale=4, train_wall=95, gb_free=20.3, wall=1847 epoch 002: 315 / 1689 loss=5.514, nll_loss=4.009, ppl=16.1, wps=459311, ups=1.06, wpb=434889, bsz=16749.4, num_updates=2000, lr=0.0005, gnorm=0.53, clip=5, loss_scale=4, train_wall=93, gb_free=19.2, wall=1942 epoch 002: 315 / 1689 loss=5.514, nll_loss=4.009, ppl=16.1, wps=459311, ups=1.06, wpb=434889, bsz=16749.4, num_updates=2000, lr=0.0005, gnorm=0.53, clip=5, loss_scale=4, train_wall=93, gb_free=19.2, wall=1942 begin validation on "valid" subset epoch 002 | valid on 'valid' subset | loss 5.415 | nll_loss 3.848 | ppl 14.4 | wps 0 | wpb 42662 | bsz 2032 | num_updates 2000 | best_loss 5.415 epoch 002 | valid on 'valid' subset | loss 5.415 | nll_loss 3.848 | ppl 14.4 | wps 0 | wpb 42662 | bsz 2032 | num_updates 2000 | best_loss 5.415 epoch 002: 415 / 1689 loss=5.441, nll_loss=3.929, ppl=15.24, wps=385668, ups=0.89, wpb=432563, bsz=16241.9, num_updates=2100, lr=0.000525, gnorm=0.553, clip=0, loss_scale=4, train_wall=93, gb_free=18.9, wall=2054 epoch 002: 415 / 1689 loss=5.441, nll_loss=3.929, ppl=15.24, wps=385668, ups=0.89, wpb=432563, bsz=16241.9, num_updates=2100, lr=0.000525, gnorm=0.553, clip=0, loss_scale=4, train_wall=93, gb_free=18.9, wall=2054 epoch 002: 515 / 1689 loss=5.372, nll_loss=3.854, ppl=14.46, wps=456964, ups=1.05, wpb=435041, bsz=16516.2, num_updates=2200, lr=0.00055, gnorm=0.486, clip=3, loss_scale=4, train_wall=93, gb_free=19.5, wall=2149 epoch 002: 515 / 1689 loss=5.372, nll_loss=3.854, ppl=14.46, wps=456964, ups=1.05, wpb=435041, bsz=16516.2, num_updates=2200, lr=0.00055, gnorm=0.486, clip=3, loss_scale=4, train_wall=93, gb_free=19.5, wall=2149 epoch 002: 616 / 1689 loss=5.282, nll_loss=3.755, ppl=13.5, wps=448606, ups=1.04, wpb=432524, bsz=16610.3, num_updates=2300, lr=0.000575, gnorm=0.445, clip=2, loss_scale=2, train_wall=95, gb_free=20, wall=2245 epoch 002: 616 / 1689 loss=5.282, nll_loss=3.755, ppl=13.5, wps=448606, ups=1.04, wpb=432524, bsz=16610.3, num_updates=2300, lr=0.000575, gnorm=0.445, clip=2, loss_scale=2, train_wall=95, gb_free=20, wall=2245 epoch 002: 716 / 1689 loss=5.211, nll_loss=3.678, ppl=12.8, wps=455007, ups=1.05, wpb=434305, bsz=16174.7, num_updates=2400, lr=0.0006, gnorm=0.453, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=2341 epoch 002: 716 / 1689 loss=5.211, nll_loss=3.678, ppl=12.8, wps=455007, ups=1.05, wpb=434305, bsz=16174.7, num_updates=2400, lr=0.0006, gnorm=0.453, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=2341 epoch 002: 816 / 1689 loss=5.168, nll_loss=3.632, ppl=12.4, wps=456798, ups=1.06, wpb=431438, bsz=16804.4, num_updates=2500, lr=0.000625, gnorm=0.47, clip=1, loss_scale=2, train_wall=93, gb_free=19.3, wall=2435 epoch 002: 816 / 1689 loss=5.168, nll_loss=3.632, ppl=12.4, wps=456798, ups=1.06, wpb=431438, bsz=16804.4, num_updates=2500, lr=0.000625, gnorm=0.47, clip=1, loss_scale=2, train_wall=93, gb_free=19.3, wall=2435 epoch 002: 916 / 1689 loss=5.11, nll_loss=3.569, ppl=11.87, wps=458451, ups=1.06, wpb=434226, bsz=16532.2, num_updates=2600, lr=0.00065, gnorm=0.379, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=2530 epoch 002: 916 / 1689 loss=5.11, nll_loss=3.569, ppl=11.87, wps=458451, ups=1.06, wpb=434226, bsz=16532.2, num_updates=2600, lr=0.00065, gnorm=0.379, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=2530 epoch 002: 1016 / 1689 loss=5.052, nll_loss=3.505, ppl=11.36, wps=459035, ups=1.06, wpb=432834, bsz=16389.8, num_updates=2700, lr=0.000675, gnorm=0.415, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=2624 epoch 002: 1016 / 1689 loss=5.052, nll_loss=3.505, ppl=11.36, wps=459035, ups=1.06, wpb=432834, bsz=16389.8, num_updates=2700, lr=0.000675, gnorm=0.415, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=2624 epoch 002: 1116 / 1689 loss=5.021, nll_loss=3.473, ppl=11.1, wps=457368, ups=1.05, wpb=434616, bsz=16381.1, num_updates=2800, lr=0.0007, gnorm=0.393, clip=0, loss_scale=4, train_wall=94, gb_free=21.7, wall=2719 epoch 002: 1116 / 1689 loss=5.021, nll_loss=3.473, ppl=11.1, wps=457368, ups=1.05, wpb=434616, bsz=16381.1, num_updates=2800, lr=0.0007, gnorm=0.393, clip=0, loss_scale=4, train_wall=94, gb_free=21.7, wall=2719 epoch 002: 1216 / 1689 loss=4.972, nll_loss=3.42, ppl=10.7, wps=459228, ups=1.06, wpb=434473, bsz=16578.2, num_updates=2900, lr=0.000725, gnorm=0.376, clip=0, loss_scale=4, train_wall=93, gb_free=18.9, wall=2814 epoch 002: 1216 / 1689 loss=4.972, nll_loss=3.42, ppl=10.7, wps=459228, ups=1.06, wpb=434473, bsz=16578.2, num_updates=2900, lr=0.000725, gnorm=0.376, clip=0, loss_scale=4, train_wall=93, gb_free=18.9, wall=2814 epoch 002: 1316 / 1689 loss=4.935, nll_loss=3.381, ppl=10.41, wps=455316, ups=1.05, wpb=432917, bsz=16800.8, num_updates=3000, lr=0.00075, gnorm=0.382, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=2909 epoch 002: 1316 / 1689 loss=4.935, nll_loss=3.381, ppl=10.41, wps=455316, ups=1.05, wpb=432917, bsz=16800.8, num_updates=3000, lr=0.00075, gnorm=0.382, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=2909 begin validation on "valid" subset epoch 002 | valid on 'valid' subset | loss 4.873 | nll_loss 3.288 | ppl 9.77 | wps 0 | wpb 42662 | bsz 2032 | num_updates 3000 | best_loss 4.873 epoch 002 | valid on 'valid' subset | loss 4.873 | nll_loss 3.288 | ppl 9.77 | wps 0 | wpb 42662 | bsz 2032 | num_updates 3000 | best_loss 4.873 epoch 002: 1416 / 1689 loss=4.901, nll_loss=3.343, ppl=10.15, wps=379267, ups=0.87, wpb=433486, bsz=16253.3, num_updates=3100, lr=0.000775, gnorm=0.37, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=3023 epoch 002: 1416 / 1689 loss=4.901, nll_loss=3.343, ppl=10.15, wps=379267, ups=0.87, wpb=433486, bsz=16253.3, num_updates=3100, lr=0.000775, gnorm=0.37, clip=0, loss_scale=4, train_wall=93, gb_free=19.5, wall=3023 epoch 002: 1516 / 1689 loss=4.89, nll_loss=3.333, ppl=10.07, wps=460715, ups=1.06, wpb=433644, bsz=16473, num_updates=3200, lr=0.0008, gnorm=0.375, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=3118 epoch 002: 1516 / 1689 loss=4.89, nll_loss=3.333, ppl=10.07, wps=460715, ups=1.06, wpb=433644, bsz=16473, num_updates=3200, lr=0.0008, gnorm=0.375, clip=0, loss_scale=4, train_wall=92, gb_free=19.1, wall=3118 epoch 002: 1617 / 1689 loss=4.843, nll_loss=3.281, ppl=9.72, wps=454430, ups=1.05, wpb=433866, bsz=16340.2, num_updates=3300, lr=0.000825, gnorm=0.367, clip=0, loss_scale=4, train_wall=94, gb_free=18.7, wall=3213 epoch 002: 1617 / 1689 loss=4.843, nll_loss=3.281, ppl=9.72, wps=454430, ups=1.05, wpb=433866, bsz=16340.2, num_updates=3300, lr=0.000825, gnorm=0.367, clip=0, loss_scale=4, train_wall=94, gb_free=18.7, wall=3213 end of epoch 2 (average epoch stats below) epoch 002 | loss 5.181 | nll_loss 3.647 | ppl 12.53 | wps 445609 | ups 1.03 | wpb 433514 | bsz 16501.5 | num_updates 3372 | lr 0.000843 | gnorm 0.449 | clip 1.2 | loss_scale 4 | train_wall 1575 | gb_free 20.5 | wall 3280 epoch 002 | loss 5.181 | nll_loss 3.647 | ppl 12.53 | wps 445609 | ups 1.03 | wpb 433514 | bsz 16501.5 | num_updates 3372 | lr 0.000843 | gnorm 0.449 | clip 1.2 | loss_scale 4 | train_wall 1575 | gb_free 20.5 | wall 3280 Start iterating over samples epoch 003: 28 / 1689 loss=4.833, nll_loss=3.271, ppl=9.65, wps=458306, ups=1.06, wpb=430840, bsz=16401.4, num_updates=3400, lr=0.00085, gnorm=0.367, clip=0, loss_scale=4, train_wall=92, gb_free=18.1, wall=3307 epoch 003: 28 / 1689 loss=4.833, nll_loss=3.271, ppl=9.65, wps=458306, ups=1.06, wpb=430840, bsz=16401.4, num_updates=3400, lr=0.00085, gnorm=0.367, clip=0, loss_scale=4, train_wall=92, gb_free=18.1, wall=3307 epoch 003: 28 / 1689 loss=4.833, nll_loss=3.271, ppl=9.65, wps=458306, ups=1.06, wpb=430840, bsz=16401.4, num_updates=3400, lr=0.00085, gnorm=0.367, clip=0, loss_scale=4, train_wall=92, gb_free=18.1, wall=3307 epoch 003: 128 / 1689 loss=4.781, nll_loss=3.214, ppl=9.28, wps=460308, ups=1.06, wpb=434010, bsz=16790.2, num_updates=3500, lr=0.000875, gnorm=0.346, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=3401 epoch 003: 128 / 1689 loss=4.781, nll_loss=3.214, ppl=9.28, wps=460308, ups=1.06, wpb=434010, bsz=16790.2, num_updates=3500, lr=0.000875, gnorm=0.346, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=3401 epoch 003: 128 / 1689 loss=4.781, nll_loss=3.214, ppl=9.28, wps=460308, ups=1.06, wpb=434010, bsz=16790.2, num_updates=3500, lr=0.000875, gnorm=0.346, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=3401 epoch 003: 228 / 1689 loss=4.777, nll_loss=3.21, ppl=9.25, wps=455578, ups=1.05, wpb=434130, bsz=16909.3, num_updates=3600, lr=0.0009, gnorm=0.37, clip=0, loss_scale=4, train_wall=93, gb_free=20.5, wall=3497 epoch 003: 228 / 1689 loss=4.777, nll_loss=3.21, ppl=9.25, wps=455578, ups=1.05, wpb=434130, bsz=16909.3, num_updates=3600, lr=0.0009, gnorm=0.37, clip=0, loss_scale=4, train_wall=93, gb_free=20.5, wall=3497 epoch 003: 228 / 1689 loss=4.777, nll_loss=3.21, ppl=9.25, wps=455578, ups=1.05, wpb=434130, bsz=16909.3, num_updates=3600, lr=0.0009, gnorm=0.37, clip=0, loss_scale=4, train_wall=93, gb_free=20.5, wall=3497 epoch 003: 328 / 1689 loss=4.749, nll_loss=3.179, ppl=9.06, wps=456878, ups=1.06, wpb=431723, bsz=16540.2, num_updates=3700, lr=0.000925, gnorm=0.357, clip=0, loss_scale=4, train_wall=92, gb_free=18.8, wall=3591 epoch 003: 328 / 1689 loss=4.749, nll_loss=3.179, ppl=9.06, wps=456878, ups=1.06, wpb=431723, bsz=16540.2, num_updates=3700, lr=0.000925, gnorm=0.357, clip=0, loss_scale=4, train_wall=92, gb_free=18.8, wall=3591 epoch 003: 328 / 1689 loss=4.749, nll_loss=3.179, ppl=9.06, wps=456878, ups=1.06, wpb=431723, bsz=16540.2, num_updates=3700, lr=0.000925, gnorm=0.357, clip=0, loss_scale=4, train_wall=92, gb_free=18.8, wall=3591 epoch 003: 428 / 1689 loss=4.751, nll_loss=3.182, ppl=9.08, wps=460013, ups=1.06, wpb=433106, bsz=16454.4, num_updates=3800, lr=0.00095, gnorm=0.356, clip=0, loss_scale=8, train_wall=92, gb_free=20.6, wall=3685 epoch 003: 428 / 1689 loss=4.751, nll_loss=3.182, ppl=9.08, wps=460013, ups=1.06, wpb=433106, bsz=16454.4, num_updates=3800, lr=0.00095, gnorm=0.356, clip=0, loss_scale=8, train_wall=92, gb_free=20.6, wall=3685 epoch 003: 428 / 1689 loss=4.751, nll_loss=3.182, ppl=9.08, wps=460013, ups=1.06, wpb=433106, bsz=16454.4, num_updates=3800, lr=0.00095, gnorm=0.356, clip=0, loss_scale=8, train_wall=92, gb_free=20.6, wall=3685 epoch 003: 529 / 1689 loss=4.744, nll_loss=3.175, ppl=9.03, wps=451746, ups=1.04, wpb=434869, bsz=16527.2, num_updates=3900, lr=0.000975, gnorm=0.369, clip=0, loss_scale=4, train_wall=94, gb_free=18.8, wall=3782 epoch 003: 529 / 1689 loss=4.744, nll_loss=3.175, ppl=9.03, wps=451746, ups=1.04, wpb=434869, bsz=16527.2, num_updates=3900, lr=0.000975, gnorm=0.369, clip=0, loss_scale=4, train_wall=94, gb_free=18.8, wall=3782 epoch 003: 529 / 1689 loss=4.744, nll_loss=3.175, ppl=9.03, wps=451746, ups=1.04, wpb=434869, bsz=16527.2, num_updates=3900, lr=0.000975, gnorm=0.369, clip=0, loss_scale=4, train_wall=94, gb_free=18.8, wall=3782 epoch 003: 629 / 1689 loss=4.726, nll_loss=3.156, ppl=8.92, wps=458130, ups=1.06, wpb=434096, bsz=16447.2, num_updates=4000, lr=0.001, gnorm=0.357, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=3876 epoch 003: 629 / 1689 loss=4.726, nll_loss=3.156, ppl=8.92, wps=458130, ups=1.06, wpb=434096, bsz=16447.2, num_updates=4000, lr=0.001, gnorm=0.357, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=3876 epoch 003: 629 / 1689 loss=4.726, nll_loss=3.156, ppl=8.92, wps=458130, ups=1.06, wpb=434096, bsz=16447.2, num_updates=4000, lr=0.001, gnorm=0.357, clip=0, loss_scale=4, train_wall=93, gb_free=19, wall=3876 begin validation on "valid" subset epoch 003 | valid on 'valid' subset | loss 4.703 | nll_loss 3.114 | ppl 8.66 | wps 0 | wpb 42662 | bsz 2032 | num_updates 4000 | best_loss 4.703 epoch 003 | valid on 'valid' subset | loss 4.703 | nll_loss 3.114 | ppl 8.66 | wps 0 | wpb 42662 | bsz 2032 | num_updates 4000 | best_loss 4.703 epoch 003 | valid on 'valid' subset | loss 4.703 | nll_loss 3.114 | ppl 8.66 | wps 0 | wpb 42662 | bsz 2032 | num_updates 4000 | best_loss 4.703 epoch 003: 730 / 1689 loss=4.722, nll_loss=3.153, ppl=8.89, wps=380421, ups=0.88, wpb=434670, bsz=16366.6, num_updates=4100, lr=0.00098773, gnorm=0.361, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=3991 epoch 003: 730 / 1689 loss=4.722, nll_loss=3.153, ppl=8.89, wps=380421, ups=0.88, wpb=434670, bsz=16366.6, num_updates=4100, lr=0.00098773, gnorm=0.361, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=3991 epoch 003: 730 / 1689 loss=4.722, nll_loss=3.153, ppl=8.89, wps=380421, ups=0.88, wpb=434670, bsz=16366.6, num_updates=4100, lr=0.00098773, gnorm=0.361, clip=0, loss_scale=2, train_wall=93, gb_free=18.6, wall=3991 epoch 003: 830 / 1689 loss=4.693, nll_loss=3.12, ppl=8.69, wps=455857, ups=1.05, wpb=432929, bsz=16190.4, num_updates=4200, lr=0.0009759, gnorm=0.334, clip=0, loss_scale=2, train_wall=93, gb_free=21.7, wall=4086 epoch 003: 830 / 1689 loss=4.693, nll_loss=3.12, ppl=8.69, wps=455857, ups=1.05, wpb=432929, bsz=16190.4, num_updates=4200, lr=0.0009759, gnorm=0.334, clip=0, loss_scale=2, train_wall=93, gb_free=21.7, wall=4086 epoch 003: 830 / 1689 loss=4.693, nll_loss=3.12, ppl=8.69, wps=455857, ups=1.05, wpb=432929, bsz=16190.4, num_updates=4200, lr=0.0009759, gnorm=0.334, clip=0, loss_scale=2, train_wall=93, gb_free=21.7, wall=4086 epoch 003: 930 / 1689 loss=4.678, nll_loss=3.105, ppl=8.6, wps=459684, ups=1.06, wpb=433053, bsz=16397.6, num_updates=4300, lr=0.000964486, gnorm=0.352, clip=0, loss_scale=2, train_wall=92, gb_free=20.2, wall=4180 epoch 003: 930 / 1689 loss=4.678, nll_loss=3.105, ppl=8.6, wps=459684, ups=1.06, wpb=433053, bsz=16397.6, num_updates=4300, lr=0.000964486, gnorm=0.352, clip=0, loss_scale=2, train_wall=92, gb_free=20.2, wall=4180 epoch 003: 930 / 1689 loss=4.678, nll_loss=3.105, ppl=8.6, wps=459684, ups=1.06, wpb=433053, bsz=16397.6, num_updates=4300, lr=0.000964486, gnorm=0.352, clip=0, loss_scale=2, train_wall=92, gb_free=20.2, wall=4180 epoch 003: 1030 / 1689 loss=4.671, nll_loss=3.098, ppl=8.56, wps=461194, ups=1.06, wpb=434331, bsz=16308.9, num_updates=4400, lr=0.000953463, gnorm=0.327, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=4274 epoch 003: 1030 / 1689 loss=4.671, nll_loss=3.098, ppl=8.56, wps=461194, ups=1.06, wpb=434331, bsz=16308.9, num_updates=4400, lr=0.000953463, gnorm=0.327, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=4274 epoch 003: 1030 / 1689 loss=4.671, nll_loss=3.098, ppl=8.56, wps=461194, ups=1.06, wpb=434331, bsz=16308.9, num_updates=4400, lr=0.000953463, gnorm=0.327, clip=0, loss_scale=2, train_wall=92, gb_free=19, wall=4274 epoch 003: 1130 / 1689 loss=4.659, nll_loss=3.085, ppl=8.49, wps=457025, ups=1.04, wpb=437633, bsz=16972.6, num_updates=4500, lr=0.000942809, gnorm=0.331, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=4370 epoch 003: 1130 / 1689 loss=4.659, nll_loss=3.085, ppl=8.49, wps=457025, ups=1.04, wpb=437633, bsz=16972.6, num_updates=4500, lr=0.000942809, gnorm=0.331, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=4370 epoch 003: 1130 / 1689 loss=4.659, nll_loss=3.085, ppl=8.49, wps=457025, ups=1.04, wpb=437633, bsz=16972.6, num_updates=4500, lr=0.000942809, gnorm=0.331, clip=0, loss_scale=2, train_wall=94, gb_free=18.9, wall=4370 epoch 003: 1230 / 1689 loss=4.638, nll_loss=3.063, ppl=8.36, wps=460512, ups=1.06, wpb=433771, bsz=16424.7, num_updates=4600, lr=0.000932505, gnorm=0.326, clip=0, loss_scale=4, train_wall=92, gb_free=20.6, wall=4464 epoch 003: 1230 / 1689 loss=4.638, nll_loss=3.063, ppl=8.36, wps=460512, ups=1.06, wpb=433771, bsz=16424.7, num_updates=4600, lr=0.000932505, gnorm=0.326, clip=0, loss_scale=4, train_wall=92, gb_free=20.6, wall=4464 epoch 003: 1230 / 1689 loss=4.638, nll_loss=3.063, ppl=8.36, wps=460512, ups=1.06, wpb=433771, bsz=16424.7, num_updates=4600, lr=0.000932505, gnorm=0.326, clip=0, loss_scale=4, train_wall=92, gb_free=20.6, wall=4464 epoch 003: 1330 / 1689 loss=4.615, nll_loss=3.037, ppl=8.21, wps=457188, ups=1.05, wpb=433946, bsz=16368.4, num_updates=4700, lr=0.000922531, gnorm=0.318, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=4559 epoch 003: 1330 / 1689 loss=4.615, nll_loss=3.037, ppl=8.21, wps=457188, ups=1.05, wpb=433946, bsz=16368.4, num_updates=4700, lr=0.000922531, gnorm=0.318, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=4559 epoch 003: 1330 / 1689 loss=4.615, nll_loss=3.037, ppl=8.21, wps=457188, ups=1.05, wpb=433946, bsz=16368.4, num_updates=4700, lr=0.000922531, gnorm=0.318, clip=0, loss_scale=4, train_wall=93, gb_free=18.6, wall=4559 epoch 003: 1430 / 1689 loss=4.619, nll_loss=3.042, ppl=8.24, wps=458867, ups=1.06, wpb=433651, bsz=16602.7, num_updates=4800, lr=0.000912871, gnorm=0.326, clip=0, loss_scale=4, train_wall=93, gb_free=18.7, wall=4653 epoch 003: 1430 / 1689 loss=4.619, nll_loss=3.042, ppl=8.24, wps=458867, ups=1.06, wpb=433651, bsz=16602.7, num_updates=4800, lr=0.000912871, gnorm=0.326, clip=0, loss_scale=4, train_wall=93, gb_free=18.7, wall=4653 epoch 003: 1430 / 1689 loss=4.619, nll_loss=3.042, ppl=8.24, wps=458867, ups=1.06, wpb=433651, bsz=16602.7, num_updates=4800, lr=0.000912871, gnorm=0.326, clip=0, loss_scale=4, train_wall=93, gb_free=18.7, wall=4653 epoch 003: 1531 / 1689 loss=4.592, nll_loss=3.012, ppl=8.07, wps=453148, ups=1.05, wpb=432075, bsz=16461, num_updates=4900, lr=0.000903508, gnorm=0.314, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=4749 epoch 003: 1531 / 1689 loss=4.592, nll_loss=3.012, ppl=8.07, wps=453148, ups=1.05, wpb=432075, bsz=16461, num_updates=4900, lr=0.000903508, gnorm=0.314, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=4749 epoch 003: 1531 / 1689 loss=4.592, nll_loss=3.012, ppl=8.07, wps=453148, ups=1.05, wpb=432075, bsz=16461, num_updates=4900, lr=0.000903508, gnorm=0.314, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=4749 epoch 003: 1631 / 1689 loss=4.582, nll_loss=3.002, ppl=8.01, wps=453908, ups=1.05, wpb=431565, bsz=16155.2, num_updates=5000, lr=0.000894427, gnorm=0.314, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=4844 epoch 003: 1631 / 1689 loss=4.582, nll_loss=3.002, ppl=8.01, wps=453908, ups=1.05, wpb=431565, bsz=16155.2, num_updates=5000, lr=0.000894427, gnorm=0.314, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=4844 epoch 003: 1631 / 1689 loss=4.582, nll_loss=3.002, ppl=8.01, wps=453908, ups=1.05, wpb=431565, bsz=16155.2, num_updates=5000, lr=0.000894427, gnorm=0.314, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=4844 begin validation on "valid" subset epoch 003 | valid on 'valid' subset | loss 4.565 | nll_loss 2.966 | ppl 7.81 | wps 0 | wpb 42662 | bsz 2032 | num_updates 5000 | best_loss 4.565 epoch 003 | valid on 'valid' subset | loss 4.565 | nll_loss 2.966 | ppl 7.81 | wps 0 | wpb 42662 | bsz 2032 | num_updates 5000 | best_loss 4.565 epoch 003 | valid on 'valid' subset | loss 4.565 | nll_loss 2.966 | ppl 7.81 | wps 0 | wpb 42662 | bsz 2032 | num_updates 5000 | best_loss 4.565 end of epoch 3 (average epoch stats below) epoch 003 | loss 4.685 | nll_loss 3.112 | ppl 8.65 | wps 444705 | ups 1.03 | wpb 433527 | bsz 16498.3 | num_updates 5058 | lr 0.000889284 | gnorm 0.34 | clip 0 | loss_scale 2 | train_wall 1571 | gb_free 19.6 | wall 4924 epoch 003 | loss 4.685 | nll_loss 3.112 | ppl 8.65 | wps 444705 | ups 1.03 | wpb 433527 | bsz 16498.3 | num_updates 5058 | lr 0.000889284 | gnorm 0.34 | clip 0 | loss_scale 2 | train_wall 1571 | gb_free 19.6 | wall 4924 epoch 003 | loss 4.685 | nll_loss 3.112 | ppl 8.65 | wps 444705 | ups 1.03 | wpb 433527 | bsz 16498.3 | num_updates 5058 | lr 0.000889284 | gnorm 0.34 | clip 0 | loss_scale 2 | train_wall 1571 | gb_free 19.6 | wall 4924 Start iterating over samples epoch 004: 42 / 1689 loss=4.554, nll_loss=2.97, ppl=7.84, wps=353916, ups=0.82, wpb=430679, bsz=16832.2, num_updates=5100, lr=0.000885615, gnorm=0.312, clip=0, loss_scale=2, train_wall=98, gb_free=19.1, wall=4965 epoch 004: 42 / 1689 loss=4.554, nll_loss=2.97, ppl=7.84, wps=353916, ups=0.82, wpb=430679, bsz=16832.2, num_updates=5100, lr=0.000885615, gnorm=0.312, clip=0, loss_scale=2, train_wall=98, gb_free=19.1, wall=4965 epoch 004: 42 / 1689 loss=4.554, nll_loss=2.97, ppl=7.84, wps=353916, ups=0.82, wpb=430679, bsz=16832.2, num_updates=5100, lr=0.000885615, gnorm=0.312, clip=0, loss_scale=2, train_wall=98, gb_free=19.1, wall=4965 epoch 004: 42 / 1689 loss=4.554, nll_loss=2.97, ppl=7.84, wps=353916, ups=0.82, wpb=430679, bsz=16832.2, num_updates=5100, lr=0.000885615, gnorm=0.312, clip=0, loss_scale=2, train_wall=98, gb_free=19.1, wall=4965 epoch 004: 142 / 1689 loss=4.525, nll_loss=2.938, ppl=7.66, wps=456107, ups=1.05, wpb=433420, bsz=16375.6, num_updates=5200, lr=0.000877058, gnorm=0.304, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=5060 epoch 004: 142 / 1689 loss=4.525, nll_loss=2.938, ppl=7.66, wps=456107, ups=1.05, wpb=433420, bsz=16375.6, num_updates=5200, lr=0.000877058, gnorm=0.304, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=5060 epoch 004: 142 / 1689 loss=4.525, nll_loss=2.938, ppl=7.66, wps=456107, ups=1.05, wpb=433420, bsz=16375.6, num_updates=5200, lr=0.000877058, gnorm=0.304, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=5060 epoch 004: 142 / 1689 loss=4.525, nll_loss=2.938, ppl=7.66, wps=456107, ups=1.05, wpb=433420, bsz=16375.6, num_updates=5200, lr=0.000877058, gnorm=0.304, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=5060 epoch 004: 242 / 1689 loss=4.521, nll_loss=2.933, ppl=7.64, wps=454264, ups=1.05, wpb=432027, bsz=16282.2, num_updates=5300, lr=0.000868744, gnorm=0.304, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=5156 epoch 004: 242 / 1689 loss=4.521, nll_loss=2.933, ppl=7.64, wps=454264, ups=1.05, wpb=432027, bsz=16282.2, num_updates=5300, lr=0.000868744, gnorm=0.304, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=5156 epoch 004: 242 / 1689 loss=4.521, nll_loss=2.933, ppl=7.64, wps=454264, ups=1.05, wpb=432027, bsz=16282.2, num_updates=5300, lr=0.000868744, gnorm=0.304, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=5156 epoch 004: 242 / 1689 loss=4.521, nll_loss=2.933, ppl=7.64, wps=454264, ups=1.05, wpb=432027, bsz=16282.2, num_updates=5300, lr=0.000868744, gnorm=0.304, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=5156 epoch 004: 342 / 1689 loss=4.523, nll_loss=2.937, ppl=7.66, wps=460370, ups=1.06, wpb=434591, bsz=16699.2, num_updates=5400, lr=0.000860663, gnorm=0.296, clip=0, loss_scale=4, train_wall=92, gb_free=18.6, wall=5250 epoch 004: 342 / 1689 loss=4.523, nll_loss=2.937, ppl=7.66, wps=460370, ups=1.06, wpb=434591, bsz=16699.2, num_updates=5400, lr=0.000860663, gnorm=0.296, clip=0, loss_scale=4, train_wall=92, gb_free=18.6, wall=5250 epoch 004: 342 / 1689 loss=4.523, nll_loss=2.937, ppl=7.66, wps=460370, ups=1.06, wpb=434591, bsz=16699.2, num_updates=5400, lr=0.000860663, gnorm=0.296, clip=0, loss_scale=4, train_wall=92, gb_free=18.6, wall=5250 epoch 004: 342 / 1689 loss=4.523, nll_loss=2.937, ppl=7.66, wps=460370, ups=1.06, wpb=434591, bsz=16699.2, num_updates=5400, lr=0.000860663, gnorm=0.296, clip=0, loss_scale=4, train_wall=92, gb_free=18.6, wall=5250 epoch 004: 442 / 1689 loss=4.516, nll_loss=2.929, ppl=7.62, wps=456050, ups=1.06, wpb=432167, bsz=16708.9, num_updates=5500, lr=0.000852803, gnorm=0.297, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=5345 epoch 004: 442 / 1689 loss=4.516, nll_loss=2.929, ppl=7.62, wps=456050, ups=1.06, wpb=432167, bsz=16708.9, num_updates=5500, lr=0.000852803, gnorm=0.297, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=5345 epoch 004: 442 / 1689 loss=4.516, nll_loss=2.929, ppl=7.62, wps=456050, ups=1.06, wpb=432167, bsz=16708.9, num_updates=5500, lr=0.000852803, gnorm=0.297, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=5345 epoch 004: 442 / 1689 loss=4.516, nll_loss=2.929, ppl=7.62, wps=456050, ups=1.06, wpb=432167, bsz=16708.9, num_updates=5500, lr=0.000852803, gnorm=0.297, clip=0, loss_scale=4, train_wall=93, gb_free=18.8, wall=5345 epoch 004: 542 / 1689 loss=4.516, nll_loss=2.93, ppl=7.62, wps=459951, ups=1.06, wpb=433635, bsz=16543.4, num_updates=5600, lr=0.000845154, gnorm=0.305, clip=0, loss_scale=4, train_wall=92, gb_free=19.4, wall=5439 epoch 004: 542 / 1689 loss=4.516, nll_loss=2.93, ppl=7.62, wps=459951, ups=1.06, wpb=433635, bsz=16543.4, num_updates=5600, lr=0.000845154, gnorm=0.305, clip=0, loss_scale=4, train_wall=92, gb_free=19.4, wall=5439 epoch 004: 542 / 1689 loss=4.516, nll_loss=2.93, ppl=7.62, wps=459951, ups=1.06, wpb=433635, bsz=16543.4, num_updates=5600, lr=0.000845154, gnorm=0.305, clip=0, loss_scale=4, train_wall=92, gb_free=19.4, wall=5439 epoch 004: 542 / 1689 loss=4.516, nll_loss=2.93, ppl=7.62, wps=459951, ups=1.06, wpb=433635, bsz=16543.4, num_updates=5600, lr=0.000845154, gnorm=0.305, clip=0, loss_scale=4, train_wall=92, gb_free=19.4, wall=5439 epoch 004: 642 / 1689 loss=4.5, nll_loss=2.912, ppl=7.53, wps=459962, ups=1.06, wpb=434428, bsz=16557.4, num_updates=5700, lr=0.000837708, gnorm=0.289, clip=0, loss_scale=4, train_wall=93, gb_free=19.2, wall=5533 epoch 004: 642 / 1689 loss=4.5, nll_loss=2.912, ppl=7.53, wps=459962, ups=1.06, wpb=434428, bsz=16557.4, num_updates=5700, lr=0.000837708, gnorm=0.289, clip=0, loss_scale=4, train_wall=93, gb_free=19.2, wall=5533 epoch 004: 642 / 1689 loss=4.5, nll_loss=2.912, ppl=7.53, wps=459962, ups=1.06, wpb=434428, bsz=16557.4, num_updates=5700, lr=0.000837708, gnorm=0.289, clip=0, loss_scale=4, train_wall=93, gb_free=19.2, wall=5533 epoch 004: 642 / 1689 loss=4.5, nll_loss=2.912, ppl=7.53, wps=459962, ups=1.06, wpb=434428, bsz=16557.4, num_updates=5700, lr=0.000837708, gnorm=0.289, clip=0, loss_scale=4, train_wall=93, gb_free=19.2, wall=5533 epoch 004: 742 / 1689 loss=4.499, nll_loss=2.912, ppl=7.53, wps=457982, ups=1.06, wpb=433671, bsz=16575, num_updates=5800, lr=0.000830455, gnorm=0.31, clip=0, loss_scale=4, train_wall=93, gb_free=19.6, wall=5628 epoch 004: 742 / 1689 loss=4.499, nll_loss=2.912, ppl=7.53, wps=457982, ups=1.06, wpb=433671, bsz=16575, num_updates=5800, lr=0.000830455, gnorm=0.31, clip=0, loss_scale=4, train_wall=93, gb_free=19.6, wall=5628 epoch 004: 742 / 1689 loss=4.499, nll_loss=2.912, ppl=7.53, wps=457982, ups=1.06, wpb=433671, bsz=16575, num_updates=5800, lr=0.000830455, gnorm=0.31, clip=0, loss_scale=4, train_wall=93, gb_free=19.6, wall=5628 epoch 004: 742 / 1689 loss=4.499, nll_loss=2.912, ppl=7.53, wps=457982, ups=1.06, wpb=433671, bsz=16575, num_updates=5800, lr=0.000830455, gnorm=0.31, clip=0, loss_scale=4, train_wall=93, gb_free=19.6, wall=5628 epoch 004: 842 / 1689 loss=4.486, nll_loss=2.897, ppl=7.45, wps=458548, ups=1.05, wpb=435111, bsz=16315.5, num_updates=5900, lr=0.000823387, gnorm=0.291, clip=0, loss_scale=8, train_wall=93, gb_free=20.4, wall=5723 epoch 004: 842 / 1689 loss=4.486, nll_loss=2.897, ppl=7.45, wps=458548, ups=1.05, wpb=435111, bsz=16315.5, num_updates=5900, lr=0.000823387, gnorm=0.291, clip=0, loss_scale=8, train_wall=93, gb_free=20.4, wall=5723 epoch 004: 842 / 1689 loss=4.486, nll_loss=2.897, ppl=7.45, wps=458548, ups=1.05, wpb=435111, bsz=16315.5, num_updates=5900, lr=0.000823387, gnorm=0.291, clip=0, loss_scale=8, train_wall=93, gb_free=20.4, wall=5723 epoch 004: 842 / 1689 loss=4.486, nll_loss=2.897, ppl=7.45, wps=458548, ups=1.05, wpb=435111, bsz=16315.5, num_updates=5900, lr=0.000823387, gnorm=0.291, clip=0, loss_scale=8, train_wall=93, gb_free=20.4, wall=5723 epoch 004: 944 / 1689 loss=4.48, nll_loss=2.89, ppl=7.42, wps=446001, ups=1.03, wpb=433930, bsz=16377, num_updates=6000, lr=0.000816497, gnorm=0.29, clip=0, loss_scale=2, train_wall=96, gb_free=19, wall=5820 epoch 004: 944 / 1689 loss=4.48, nll_loss=2.89, ppl=7.42, wps=446001, ups=1.03, wpb=433930, bsz=16377, num_updates=6000, lr=0.000816497, gnorm=0.29, clip=0, loss_scale=2, train_wall=96, gb_free=19, wall=5820 epoch 004: 944 / 1689 loss=4.48, nll_loss=2.89, ppl=7.42, wps=446001, ups=1.03, wpb=433930, bsz=16377, num_updates=6000, lr=0.000816497, gnorm=0.29, clip=0, loss_scale=2, train_wall=96, gb_free=19, wall=5820 epoch 004: 944 / 1689 loss=4.48, nll_loss=2.89, ppl=7.42, wps=446001, ups=1.03, wpb=433930, bsz=16377, num_updates=6000, lr=0.000816497, gnorm=0.29, clip=0, loss_scale=2, train_wall=96, gb_free=19, wall=5820 begin validation on "valid" subset epoch 004 | valid on 'valid' subset | loss 4.501 | nll_loss 2.892 | ppl 7.42 | wps 0 | wpb 42662 | bsz 2032 | num_updates 6000 | best_loss 4.501 epoch 004 | valid on 'valid' subset | loss 4.501 | nll_loss 2.892 | ppl 7.42 | wps 0 | wpb 42662 | bsz 2032 | num_updates 6000 | best_loss 4.501 epoch 004 | valid on 'valid' subset | loss 4.501 | nll_loss 2.892 | ppl 7.42 | wps 0 | wpb 42662 | bsz 2032 | num_updates 6000 | best_loss 4.501 epoch 004 | valid on 'valid' subset | loss 4.501 | nll_loss 2.892 | ppl 7.42 | wps 0 | wpb 42662 | bsz 2032 | num_updates 6000 | best_loss 4.501 epoch 004: 1044 / 1689 loss=4.484, nll_loss=2.896, ppl=7.44, wps=157844, ups=0.36, wpb=434073, bsz=16698.2, num_updates=6100, lr=0.000809776, gnorm=0.291, clip=0, loss_scale=2, train_wall=132, gb_free=19.5, wall=6095 epoch 004: 1044 / 1689 loss=4.484, nll_loss=2.896, ppl=7.44, wps=157844, ups=0.36, wpb=434073, bsz=16698.2, num_updates=6100, lr=0.000809776, gnorm=0.291, clip=0, loss_scale=2, train_wall=132, gb_free=19.5, wall=6095 epoch 004: 1044 / 1689 loss=4.484, nll_loss=2.896, ppl=7.44, wps=157844, ups=0.36, wpb=434073, bsz=16698.2, num_updates=6100, lr=0.000809776, gnorm=0.291, clip=0, loss_scale=2, train_wall=132, gb_free=19.5, wall=6095 epoch 004: 1044 / 1689 loss=4.484, nll_loss=2.896, ppl=7.44, wps=157844, ups=0.36, wpb=434073, bsz=16698.2, num_updates=6100, lr=0.000809776, gnorm=0.291, clip=0, loss_scale=2, train_wall=132, gb_free=19.5, wall=6095 epoch 004: 1144 / 1689 loss=4.472, nll_loss=2.883, ppl=7.38, wps=462559, ups=1.06, wpb=434581, bsz=16651.3, num_updates=6200, lr=0.000803219, gnorm=0.289, clip=0, loss_scale=2, train_wall=94, gb_free=20.6, wall=6189 epoch 004: 1144 / 1689 loss=4.472, nll_loss=2.883, ppl=7.38, wps=462559, ups=1.06, wpb=434581, bsz=16651.3, num_updates=6200, lr=0.000803219, gnorm=0.289, clip=0, loss_scale=2, train_wall=94, gb_free=20.6, wall=6189 epoch 004: 1144 / 1689 loss=4.472, nll_loss=2.883, ppl=7.38, wps=462559, ups=1.06, wpb=434581, bsz=16651.3, num_updates=6200, lr=0.000803219, gnorm=0.289, clip=0, loss_scale=2, train_wall=94, gb_free=20.6, wall=6189 epoch 004: 1144 / 1689 loss=4.472, nll_loss=2.883, ppl=7.38, wps=462559, ups=1.06, wpb=434581, bsz=16651.3, num_updates=6200, lr=0.000803219, gnorm=0.289, clip=0, loss_scale=2, train_wall=94, gb_free=20.6, wall=6189 epoch 004: 1244 / 1689 loss=4.465, nll_loss=2.875, ppl=7.34, wps=464984, ups=1.07, wpb=434843, bsz=16323.7, num_updates=6300, lr=0.000796819, gnorm=0.281, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=6283 epoch 004: 1244 / 1689 loss=4.465, nll_loss=2.875, ppl=7.34, wps=464984, ups=1.07, wpb=434843, bsz=16323.7, num_updates=6300, lr=0.000796819, gnorm=0.281, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=6283 epoch 004: 1244 / 1689 loss=4.465, nll_loss=2.875, ppl=7.34, wps=464984, ups=1.07, wpb=434843, bsz=16323.7, num_updates=6300, lr=0.000796819, gnorm=0.281, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=6283 epoch 004: 1244 / 1689 loss=4.465, nll_loss=2.875, ppl=7.34, wps=464984, ups=1.07, wpb=434843, bsz=16323.7, num_updates=6300, lr=0.000796819, gnorm=0.281, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=6283 epoch 004: 1344 / 1689 loss=4.46, nll_loss=2.87, ppl=7.31, wps=462624, ups=1.07, wpb=432538, bsz=16691.5, num_updates=6400, lr=0.000790569, gnorm=0.289, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=6376 epoch 004: 1344 / 1689 loss=4.46, nll_loss=2.87, ppl=7.31, wps=462624, ups=1.07, wpb=432538, bsz=16691.5, num_updates=6400, lr=0.000790569, gnorm=0.289, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=6376 epoch 004: 1344 / 1689 loss=4.46, nll_loss=2.87, ppl=7.31, wps=462624, ups=1.07, wpb=432538, bsz=16691.5, num_updates=6400, lr=0.000790569, gnorm=0.289, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=6376 epoch 004: 1344 / 1689 loss=4.46, nll_loss=2.87, ppl=7.31, wps=462624, ups=1.07, wpb=432538, bsz=16691.5, num_updates=6400, lr=0.000790569, gnorm=0.289, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=6376 epoch 004: 1444 / 1689 loss=4.453, nll_loss=2.863, ppl=7.27, wps=461601, ups=1.06, wpb=434479, bsz=16687.9, num_updates=6500, lr=0.000784465, gnorm=0.287, clip=0, loss_scale=4, train_wall=93, gb_free=19.7, wall=6470 epoch 004: 1444 / 1689 loss=4.453, nll_loss=2.863, ppl=7.27, wps=461601, ups=1.06, wpb=434479, bsz=16687.9, num_updates=6500, lr=0.000784465, gnorm=0.287, clip=0, loss_scale=4, train_wall=93, gb_free=19.7, wall=6470 epoch 004: 1444 / 1689 loss=4.453, nll_loss=2.863, ppl=7.27, wps=461601, ups=1.06, wpb=434479, bsz=16687.9, num_updates=6500, lr=0.000784465, gnorm=0.287, clip=0, loss_scale=4, train_wall=93, gb_free=19.7, wall=6470 epoch 004: 1444 / 1689 loss=4.453, nll_loss=2.863, ppl=7.27, wps=461601, ups=1.06, wpb=434479, bsz=16687.9, num_updates=6500, lr=0.000784465, gnorm=0.287, clip=0, loss_scale=4, train_wall=93, gb_free=19.7, wall=6470 epoch 004: 1544 / 1689 loss=4.452, nll_loss=2.861, ppl=7.27, wps=462932, ups=1.07, wpb=434053, bsz=16285.8, num_updates=6600, lr=0.000778499, gnorm=0.279, clip=0, loss_scale=4, train_wall=93, gb_free=19.4, wall=6564 epoch 004: 1544 / 1689 loss=4.452, nll_loss=2.861, ppl=7.27, wps=462932, ups=1.07, wpb=434053, bsz=16285.8, num_updates=6600, lr=0.000778499, gnorm=0.279, clip=0, loss_scale=4, train_wall=93, gb_free=19.4, wall=6564 epoch 004: 1544 / 1689 loss=4.452, nll_loss=2.861, ppl=7.27, wps=462932, ups=1.07, wpb=434053, bsz=16285.8, num_updates=6600, lr=0.000778499, gnorm=0.279, clip=0, loss_scale=4, train_wall=93, gb_free=19.4, wall=6564 epoch 004: 1544 / 1689 loss=4.452, nll_loss=2.861, ppl=7.27, wps=462932, ups=1.07, wpb=434053, bsz=16285.8, num_updates=6600, lr=0.000778499, gnorm=0.279, clip=0, loss_scale=4, train_wall=93, gb_free=19.4, wall=6564 epoch 004: 1644 / 1689 loss=4.455, nll_loss=2.865, ppl=7.28, wps=463626, ups=1.07, wpb=432968, bsz=16215.5, num_updates=6700, lr=0.000772667, gnorm=0.289, clip=0, loss_scale=4, train_wall=92, gb_free=19.6, wall=6658 epoch 004: 1644 / 1689 loss=4.455, nll_loss=2.865, ppl=7.28, wps=463626, ups=1.07, wpb=432968, bsz=16215.5, num_updates=6700, lr=0.000772667, gnorm=0.289, clip=0, loss_scale=4, train_wall=92, gb_free=19.6, wall=6658 epoch 004: 1644 / 1689 loss=4.455, nll_loss=2.865, ppl=7.28, wps=463626, ups=1.07, wpb=432968, bsz=16215.5, num_updates=6700, lr=0.000772667, gnorm=0.289, clip=0, loss_scale=4, train_wall=92, gb_free=19.6, wall=6658 epoch 004: 1644 / 1689 loss=4.455, nll_loss=2.865, ppl=7.28, wps=463626, ups=1.07, wpb=432968, bsz=16215.5, num_updates=6700, lr=0.000772667, gnorm=0.289, clip=0, loss_scale=4, train_wall=92, gb_free=19.6, wall=6658 end of epoch 4 (average epoch stats below) epoch 004 | loss 4.488 | nll_loss 2.899 | ppl 7.46 | wps 411771 | ups 0.95 | wpb 433537 | bsz 16506.3 | num_updates 6744 | lr 0.000770143 | gnorm 0.293 | clip 0 | loss_scale 2 | train_wall 1609 | gb_free 19.6 | wall 6699 epoch 004 | loss 4.488 | nll_loss 2.899 | ppl 7.46 | wps 411771 | ups 0.95 | wpb 433537 | bsz 16506.3 | num_updates 6744 | lr 0.000770143 | gnorm 0.293 | clip 0 | loss_scale 2 | train_wall 1609 | gb_free 19.6 | wall 6699 epoch 004 | loss 4.488 | nll_loss 2.899 | ppl 7.46 | wps 411771 | ups 0.95 | wpb 433537 | bsz 16506.3 | num_updates 6744 | lr 0.000770143 | gnorm 0.293 | clip 0 | loss_scale 2 | train_wall 1609 | gb_free 19.6 | wall 6699 epoch 004 | loss 4.488 | nll_loss 2.899 | ppl 7.46 | wps 411771 | ups 0.95 | wpb 433537 | bsz 16506.3 | num_updates 6744 | lr 0.000770143 | gnorm 0.293 | clip 0 | loss_scale 2 | train_wall 1609 | gb_free 19.6 | wall 6699 Start iterating over samples epoch 005: 56 / 1689 loss=4.41, nll_loss=2.814, ppl=7.03, wps=450044, ups=1.05, wpb=429179, bsz=16285.8, num_updates=6800, lr=0.000766965, gnorm=0.28, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=6753 epoch 005: 56 / 1689 loss=4.41, nll_loss=2.814, ppl=7.03, wps=450044, ups=1.05, wpb=429179, bsz=16285.8, num_updates=6800, lr=0.000766965, gnorm=0.28, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=6753 epoch 005: 56 / 1689 loss=4.41, nll_loss=2.814, ppl=7.03, wps=450044, ups=1.05, wpb=429179, bsz=16285.8, num_updates=6800, lr=0.000766965, gnorm=0.28, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=6753 epoch 005: 56 / 1689 loss=4.41, nll_loss=2.814, ppl=7.03, wps=450044, ups=1.05, wpb=429179, bsz=16285.8, num_updates=6800, lr=0.000766965, gnorm=0.28, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=6753 epoch 005: 56 / 1689 loss=4.41, nll_loss=2.814, ppl=7.03, wps=450044, ups=1.05, wpb=429179, bsz=16285.8, num_updates=6800, lr=0.000766965, gnorm=0.28, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=6753 epoch 005: 156 / 1689 loss=4.398, nll_loss=2.8, ppl=6.97, wps=459661, ups=1.06, wpb=434468, bsz=16369.4, num_updates=6900, lr=0.000761387, gnorm=0.287, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=6847 epoch 005: 156 / 1689 loss=4.398, nll_loss=2.8, ppl=6.97, wps=459661, ups=1.06, wpb=434468, bsz=16369.4, num_updates=6900, lr=0.000761387, gnorm=0.287, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=6847 epoch 005: 156 / 1689 loss=4.398, nll_loss=2.8, ppl=6.97, wps=459661, ups=1.06, wpb=434468, bsz=16369.4, num_updates=6900, lr=0.000761387, gnorm=0.287, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=6847 epoch 005: 156 / 1689 loss=4.398, nll_loss=2.8, ppl=6.97, wps=459661, ups=1.06, wpb=434468, bsz=16369.4, num_updates=6900, lr=0.000761387, gnorm=0.287, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=6847 epoch 005: 156 / 1689 loss=4.398, nll_loss=2.8, ppl=6.97, wps=459661, ups=1.06, wpb=434468, bsz=16369.4, num_updates=6900, lr=0.000761387, gnorm=0.287, clip=0, loss_scale=2, train_wall=93, gb_free=19.2, wall=6847 epoch 005: 256 / 1689 loss=4.397, nll_loss=2.798, ppl=6.96, wps=455942, ups=1.05, wpb=432530, bsz=16314.4, num_updates=7000, lr=0.000755929, gnorm=0.29, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=6942 epoch 005: 256 / 1689 loss=4.397, nll_loss=2.798, ppl=6.96, wps=455942, ups=1.05, wpb=432530, bsz=16314.4, num_updates=7000, lr=0.000755929, gnorm=0.29, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=6942 epoch 005: 256 / 1689 loss=4.397, nll_loss=2.798, ppl=6.96, wps=455942, ups=1.05, wpb=432530, bsz=16314.4, num_updates=7000, lr=0.000755929, gnorm=0.29, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=6942 epoch 005: 256 / 1689 loss=4.397, nll_loss=2.798, ppl=6.96, wps=455942, ups=1.05, wpb=432530, bsz=16314.4, num_updates=7000, lr=0.000755929, gnorm=0.29, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=6942 epoch 005: 256 / 1689 loss=4.397, nll_loss=2.798, ppl=6.96, wps=455942, ups=1.05, wpb=432530, bsz=16314.4, num_updates=7000, lr=0.000755929, gnorm=0.29, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=6942 begin validation on "valid" subset epoch 005 | valid on 'valid' subset | loss 4.461 | nll_loss 2.85 | ppl 7.21 | wps 0 | wpb 42662 | bsz 2032 | num_updates 7000 | best_loss 4.461 epoch 005 | valid on 'valid' subset | loss 4.461 | nll_loss 2.85 | ppl 7.21 | wps 0 | wpb 42662 | bsz 2032 | num_updates 7000 | best_loss 4.461 epoch 005 | valid on 'valid' subset | loss 4.461 | nll_loss 2.85 | ppl 7.21 | wps 0 | wpb 42662 | bsz 2032 | num_updates 7000 | best_loss 4.461 epoch 005 | valid on 'valid' subset | loss 4.461 | nll_loss 2.85 | ppl 7.21 | wps 0 | wpb 42662 | bsz 2032 | num_updates 7000 | best_loss 4.461 epoch 005 | valid on 'valid' subset | loss 4.461 | nll_loss 2.85 | ppl 7.21 | wps 0 | wpb 42662 | bsz 2032 | num_updates 7000 | best_loss 4.461 epoch 005: 356 / 1689 loss=4.388, nll_loss=2.789, ppl=6.91, wps=378492, ups=0.87, wpb=434470, bsz=16713.8, num_updates=7100, lr=0.000750587, gnorm=0.27, clip=0, loss_scale=2, train_wall=93, gb_free=21.5, wall=7057 epoch 005: 356 / 1689 loss=4.388, nll_loss=2.789, ppl=6.91, wps=378492, ups=0.87, wpb=434470, bsz=16713.8, num_updates=7100, lr=0.000750587, gnorm=0.27, clip=0, loss_scale=2, train_wall=93, gb_free=21.5, wall=7057 epoch 005: 356 / 1689 loss=4.388, nll_loss=2.789, ppl=6.91, wps=378492, ups=0.87, wpb=434470, bsz=16713.8, num_updates=7100, lr=0.000750587, gnorm=0.27, clip=0, loss_scale=2, train_wall=93, gb_free=21.5, wall=7057 epoch 005: 356 / 1689 loss=4.388, nll_loss=2.789, ppl=6.91, wps=378492, ups=0.87, wpb=434470, bsz=16713.8, num_updates=7100, lr=0.000750587, gnorm=0.27, clip=0, loss_scale=2, train_wall=93, gb_free=21.5, wall=7057 epoch 005: 356 / 1689 loss=4.388, nll_loss=2.789, ppl=6.91, wps=378492, ups=0.87, wpb=434470, bsz=16713.8, num_updates=7100, lr=0.000750587, gnorm=0.27, clip=0, loss_scale=2, train_wall=93, gb_free=21.5, wall=7057 epoch 005: 456 / 1689 loss=4.397, nll_loss=2.8, ppl=6.97, wps=456142, ups=1.05, wpb=433003, bsz=16603, num_updates=7200, lr=0.000745356, gnorm=0.262, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=7152 epoch 005: 456 / 1689 loss=4.397, nll_loss=2.8, ppl=6.97, wps=456142, ups=1.05, wpb=433003, bsz=16603, num_updates=7200, lr=0.000745356, gnorm=0.262, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=7152 epoch 005: 456 / 1689 loss=4.397, nll_loss=2.8, ppl=6.97, wps=456142, ups=1.05, wpb=433003, bsz=16603, num_updates=7200, lr=0.000745356, gnorm=0.262, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=7152 epoch 005: 456 / 1689 loss=4.397, nll_loss=2.8, ppl=6.97, wps=456142, ups=1.05, wpb=433003, bsz=16603, num_updates=7200, lr=0.000745356, gnorm=0.262, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=7152 epoch 005: 456 / 1689 loss=4.397, nll_loss=2.8, ppl=6.97, wps=456142, ups=1.05, wpb=433003, bsz=16603, num_updates=7200, lr=0.000745356, gnorm=0.262, clip=0, loss_scale=2, train_wall=93, gb_free=18.2, wall=7152 epoch 005: 556 / 1689 loss=4.386, nll_loss=2.788, ppl=6.91, wps=457760, ups=1.05, wpb=434026, bsz=16606.7, num_updates=7300, lr=0.000740233, gnorm=0.277, clip=0, loss_scale=4, train_wall=93, gb_free=19.3, wall=7247 epoch 005: 556 / 1689 loss=4.386, nll_loss=2.788, ppl=6.91, wps=457760, ups=1.05, wpb=434026, bsz=16606.7, num_updates=7300, lr=0.000740233, gnorm=0.277, clip=0, loss_scale=4, train_wall=93, gb_free=19.3, wall=7247 epoch 005: 556 / 1689 loss=4.386, nll_loss=2.788, ppl=6.91, wps=457760, ups=1.05, wpb=434026, bsz=16606.7, num_updates=7300, lr=0.000740233, gnorm=0.277, clip=0, loss_scale=4, train_wall=93, gb_free=19.3, wall=7247 epoch 005: 556 / 1689 loss=4.386, nll_loss=2.788, ppl=6.91, wps=457760, ups=1.05, wpb=434026, bsz=16606.7, num_updates=7300, lr=0.000740233, gnorm=0.277, clip=0, loss_scale=4, train_wall=93, gb_free=19.3, wall=7247 epoch 005: 556 / 1689 loss=4.386, nll_loss=2.788, ppl=6.91, wps=457760, ups=1.05, wpb=434026, bsz=16606.7, num_updates=7300, lr=0.000740233, gnorm=0.277, clip=0, loss_scale=4, train_wall=93, gb_free=19.3, wall=7247 epoch 005: 657 / 1689 loss=4.393, nll_loss=2.796, ppl=6.95, wps=455478, ups=1.05, wpb=433342, bsz=16440.4, num_updates=7400, lr=0.000735215, gnorm=0.269, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=7342 epoch 005: 657 / 1689 loss=4.393, nll_loss=2.796, ppl=6.95, wps=455478, ups=1.05, wpb=433342, bsz=16440.4, num_updates=7400, lr=0.000735215, gnorm=0.269, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=7342 epoch 005: 657 / 1689 loss=4.393, nll_loss=2.796, ppl=6.95, wps=455478, ups=1.05, wpb=433342, bsz=16440.4, num_updates=7400, lr=0.000735215, gnorm=0.269, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=7342 epoch 005: 657 / 1689 loss=4.393, nll_loss=2.796, ppl=6.95, wps=455478, ups=1.05, wpb=433342, bsz=16440.4, num_updates=7400, lr=0.000735215, gnorm=0.269, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=7342 epoch 005: 657 / 1689 loss=4.393, nll_loss=2.796, ppl=6.95, wps=455478, ups=1.05, wpb=433342, bsz=16440.4, num_updates=7400, lr=0.000735215, gnorm=0.269, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=7342 epoch 005: 757 / 1689 loss=4.385, nll_loss=2.788, ppl=6.91, wps=460016, ups=1.07, wpb=431303, bsz=16399.1, num_updates=7500, lr=0.000730297, gnorm=0.262, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=7436 epoch 005: 757 / 1689 loss=4.385, nll_loss=2.788, ppl=6.91, wps=460016, ups=1.07, wpb=431303, bsz=16399.1, num_updates=7500, lr=0.000730297, gnorm=0.262, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=7436 epoch 005: 757 / 1689 loss=4.385, nll_loss=2.788, ppl=6.91, wps=460016, ups=1.07, wpb=431303, bsz=16399.1, num_updates=7500, lr=0.000730297, gnorm=0.262, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=7436 epoch 005: 757 / 1689 loss=4.385, nll_loss=2.788, ppl=6.91, wps=460016, ups=1.07, wpb=431303, bsz=16399.1, num_updates=7500, lr=0.000730297, gnorm=0.262, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=7436 epoch 005: 757 / 1689 loss=4.385, nll_loss=2.788, ppl=6.91, wps=460016, ups=1.07, wpb=431303, bsz=16399.1, num_updates=7500, lr=0.000730297, gnorm=0.262, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=7436 epoch 005: 857 / 1689 loss=4.382, nll_loss=2.785, ppl=6.89, wps=458937, ups=1.05, wpb=435115, bsz=16536.1, num_updates=7600, lr=0.000725476, gnorm=0.267, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=7531 epoch 005: 857 / 1689 loss=4.382, nll_loss=2.785, ppl=6.89, wps=458937, ups=1.05, wpb=435115, bsz=16536.1, num_updates=7600, lr=0.000725476, gnorm=0.267, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=7531 epoch 005: 857 / 1689 loss=4.382, nll_loss=2.785, ppl=6.89, wps=458937, ups=1.05, wpb=435115, bsz=16536.1, num_updates=7600, lr=0.000725476, gnorm=0.267, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=7531 epoch 005: 857 / 1689 loss=4.382, nll_loss=2.785, ppl=6.89, wps=458937, ups=1.05, wpb=435115, bsz=16536.1, num_updates=7600, lr=0.000725476, gnorm=0.267, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=7531 epoch 005: 857 / 1689 loss=4.382, nll_loss=2.785, ppl=6.89, wps=458937, ups=1.05, wpb=435115, bsz=16536.1, num_updates=7600, lr=0.000725476, gnorm=0.267, clip=0, loss_scale=2, train_wall=93, gb_free=18.1, wall=7531 epoch 005: 957 / 1689 loss=4.386, nll_loss=2.789, ppl=6.91, wps=461871, ups=1.06, wpb=435414, bsz=16477.5, num_updates=7700, lr=0.00072075, gnorm=0.275, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=7625 epoch 005: 957 / 1689 loss=4.386, nll_loss=2.789, ppl=6.91, wps=461871, ups=1.06, wpb=435414, bsz=16477.5, num_updates=7700, lr=0.00072075, gnorm=0.275, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=7625 epoch 005: 957 / 1689 loss=4.386, nll_loss=2.789, ppl=6.91, wps=461871, ups=1.06, wpb=435414, bsz=16477.5, num_updates=7700, lr=0.00072075, gnorm=0.275, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=7625 epoch 005: 957 / 1689 loss=4.386, nll_loss=2.789, ppl=6.91, wps=461871, ups=1.06, wpb=435414, bsz=16477.5, num_updates=7700, lr=0.00072075, gnorm=0.275, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=7625 epoch 005: 957 / 1689 loss=4.386, nll_loss=2.789, ppl=6.91, wps=461871, ups=1.06, wpb=435414, bsz=16477.5, num_updates=7700, lr=0.00072075, gnorm=0.275, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=7625 epoch 005: 1057 / 1689 loss=4.368, nll_loss=2.77, ppl=6.82, wps=459441, ups=1.06, wpb=434233, bsz=16651.7, num_updates=7800, lr=0.000716115, gnorm=0.265, clip=0, loss_scale=2, train_wall=92, gb_free=19.5, wall=7719 epoch 005: 1057 / 1689 loss=4.368, nll_loss=2.77, ppl=6.82, wps=459441, ups=1.06, wpb=434233, bsz=16651.7, num_updates=7800, lr=0.000716115, gnorm=0.265, clip=0, loss_scale=2, train_wall=92, gb_free=19.5, wall=7719 epoch 005: 1057 / 1689 loss=4.368, nll_loss=2.77, ppl=6.82, wps=459441, ups=1.06, wpb=434233, bsz=16651.7, num_updates=7800, lr=0.000716115, gnorm=0.265, clip=0, loss_scale=2, train_wall=92, gb_free=19.5, wall=7719 epoch 005: 1057 / 1689 loss=4.368, nll_loss=2.77, ppl=6.82, wps=459441, ups=1.06, wpb=434233, bsz=16651.7, num_updates=7800, lr=0.000716115, gnorm=0.265, clip=0, loss_scale=2, train_wall=92, gb_free=19.5, wall=7719 epoch 005: 1057 / 1689 loss=4.368, nll_loss=2.77, ppl=6.82, wps=459441, ups=1.06, wpb=434233, bsz=16651.7, num_updates=7800, lr=0.000716115, gnorm=0.265, clip=0, loss_scale=2, train_wall=92, gb_free=19.5, wall=7719 epoch 005: 1158 / 1689 loss=4.37, nll_loss=2.772, ppl=6.83, wps=453410, ups=1.04, wpb=435127, bsz=16680.6, num_updates=7900, lr=0.000711568, gnorm=0.259, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=7815 epoch 005: 1158 / 1689 loss=4.37, nll_loss=2.772, ppl=6.83, wps=453410, ups=1.04, wpb=435127, bsz=16680.6, num_updates=7900, lr=0.000711568, gnorm=0.259, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=7815 epoch 005: 1158 / 1689 loss=4.37, nll_loss=2.772, ppl=6.83, wps=453410, ups=1.04, wpb=435127, bsz=16680.6, num_updates=7900, lr=0.000711568, gnorm=0.259, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=7815 epoch 005: 1158 / 1689 loss=4.37, nll_loss=2.772, ppl=6.83, wps=453410, ups=1.04, wpb=435127, bsz=16680.6, num_updates=7900, lr=0.000711568, gnorm=0.259, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=7815 epoch 005: 1158 / 1689 loss=4.37, nll_loss=2.772, ppl=6.83, wps=453410, ups=1.04, wpb=435127, bsz=16680.6, num_updates=7900, lr=0.000711568, gnorm=0.259, clip=0, loss_scale=2, train_wall=94, gb_free=19.1, wall=7815 epoch 005: 1260 / 1689 loss=4.377, nll_loss=2.779, ppl=6.87, wps=455090, ups=1.05, wpb=434627, bsz=16404.4, num_updates=8000, lr=0.000707107, gnorm=0.285, clip=0, loss_scale=0.5, train_wall=95, gb_free=19.4, wall=7911 epoch 005: 1260 / 1689 loss=4.377, nll_loss=2.779, ppl=6.87, wps=455090, ups=1.05, wpb=434627, bsz=16404.4, num_updates=8000, lr=0.000707107, gnorm=0.285, clip=0, loss_scale=0.5, train_wall=95, gb_free=19.4, wall=7911 epoch 005: 1260 / 1689 loss=4.377, nll_loss=2.779, ppl=6.87, wps=455090, ups=1.05, wpb=434627, bsz=16404.4, num_updates=8000, lr=0.000707107, gnorm=0.285, clip=0, loss_scale=0.5, train_wall=95, gb_free=19.4, wall=7911 epoch 005: 1260 / 1689 loss=4.377, nll_loss=2.779, ppl=6.87, wps=455090, ups=1.05, wpb=434627, bsz=16404.4, num_updates=8000, lr=0.000707107, gnorm=0.285, clip=0, loss_scale=0.5, train_wall=95, gb_free=19.4, wall=7911 epoch 005: 1260 / 1689 loss=4.377, nll_loss=2.779, ppl=6.87, wps=455090, ups=1.05, wpb=434627, bsz=16404.4, num_updates=8000, lr=0.000707107, gnorm=0.285, clip=0, loss_scale=0.5, train_wall=95, gb_free=19.4, wall=7911 begin validation on "valid" subset epoch 005 | valid on 'valid' subset | loss 4.414 | nll_loss 2.803 | ppl 6.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 8000 | best_loss 4.414 epoch 005 | valid on 'valid' subset | loss 4.414 | nll_loss 2.803 | ppl 6.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 8000 | best_loss 4.414 epoch 005 | valid on 'valid' subset | loss 4.414 | nll_loss 2.803 | ppl 6.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 8000 | best_loss 4.414 epoch 005 | valid on 'valid' subset | loss 4.414 | nll_loss 2.803 | ppl 6.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 8000 | best_loss 4.414 epoch 005 | valid on 'valid' subset | loss 4.414 | nll_loss 2.803 | ppl 6.98 | wps 0 | wpb 42662 | bsz 2032 | num_updates 8000 | best_loss 4.414 epoch 005: 1360 / 1689 loss=4.374, nll_loss=2.776, ppl=6.85, wps=376929, ups=0.87, wpb=434750, bsz=16767.7, num_updates=8100, lr=0.000702728, gnorm=0.33, clip=5, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=8026 epoch 005: 1360 / 1689 loss=4.374, nll_loss=2.776, ppl=6.85, wps=376929, ups=0.87, wpb=434750, bsz=16767.7, num_updates=8100, lr=0.000702728, gnorm=0.33, clip=5, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=8026 epoch 005: 1360 / 1689 loss=4.374, nll_loss=2.776, ppl=6.85, wps=376929, ups=0.87, wpb=434750, bsz=16767.7, num_updates=8100, lr=0.000702728, gnorm=0.33, clip=5, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=8026 epoch 005: 1360 / 1689 loss=4.374, nll_loss=2.776, ppl=6.85, wps=376929, ups=0.87, wpb=434750, bsz=16767.7, num_updates=8100, lr=0.000702728, gnorm=0.33, clip=5, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=8026 epoch 005: 1360 / 1689 loss=4.374, nll_loss=2.776, ppl=6.85, wps=376929, ups=0.87, wpb=434750, bsz=16767.7, num_updates=8100, lr=0.000702728, gnorm=0.33, clip=5, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=8026 epoch 005: 1460 / 1689 loss=4.366, nll_loss=2.768, ppl=6.81, wps=462631, ups=1.07, wpb=432718, bsz=16404.7, num_updates=8200, lr=0.00069843, gnorm=0.28, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=8120 epoch 005: 1460 / 1689 loss=4.366, nll_loss=2.768, ppl=6.81, wps=462631, ups=1.07, wpb=432718, bsz=16404.7, num_updates=8200, lr=0.00069843, gnorm=0.28, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=8120 epoch 005: 1460 / 1689 loss=4.366, nll_loss=2.768, ppl=6.81, wps=462631, ups=1.07, wpb=432718, bsz=16404.7, num_updates=8200, lr=0.00069843, gnorm=0.28, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=8120 epoch 005: 1460 / 1689 loss=4.366, nll_loss=2.768, ppl=6.81, wps=462631, ups=1.07, wpb=432718, bsz=16404.7, num_updates=8200, lr=0.00069843, gnorm=0.28, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=8120 epoch 005: 1460 / 1689 loss=4.366, nll_loss=2.768, ppl=6.81, wps=462631, ups=1.07, wpb=432718, bsz=16404.7, num_updates=8200, lr=0.00069843, gnorm=0.28, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=8120 epoch 005: 1561 / 1689 loss=4.357, nll_loss=2.758, ppl=6.76, wps=456977, ups=1.06, wpb=432205, bsz=16160, num_updates=8300, lr=0.00069421, gnorm=0.263, clip=0, loss_scale=0.25, train_wall=94, gb_free=19.1, wall=8214 epoch 005: 1561 / 1689 loss=4.357, nll_loss=2.758, ppl=6.76, wps=456977, ups=1.06, wpb=432205, bsz=16160, num_updates=8300, lr=0.00069421, gnorm=0.263, clip=0, loss_scale=0.25, train_wall=94, gb_free=19.1, wall=8214 epoch 005: 1561 / 1689 loss=4.357, nll_loss=2.758, ppl=6.76, wps=456977, ups=1.06, wpb=432205, bsz=16160, num_updates=8300, lr=0.00069421, gnorm=0.263, clip=0, loss_scale=0.25, train_wall=94, gb_free=19.1, wall=8214 epoch 005: 1561 / 1689 loss=4.357, nll_loss=2.758, ppl=6.76, wps=456977, ups=1.06, wpb=432205, bsz=16160, num_updates=8300, lr=0.00069421, gnorm=0.263, clip=0, loss_scale=0.25, train_wall=94, gb_free=19.1, wall=8214 epoch 005: 1561 / 1689 loss=4.357, nll_loss=2.758, ppl=6.76, wps=456977, ups=1.06, wpb=432205, bsz=16160, num_updates=8300, lr=0.00069421, gnorm=0.263, clip=0, loss_scale=0.25, train_wall=94, gb_free=19.1, wall=8214 epoch 005: 1661 / 1689 loss=4.365, nll_loss=2.768, ppl=6.81, wps=460552, ups=1.06, wpb=433230, bsz=16866.2, num_updates=8400, lr=0.000690066, gnorm=0.26, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=8308 epoch 005: 1661 / 1689 loss=4.365, nll_loss=2.768, ppl=6.81, wps=460552, ups=1.06, wpb=433230, bsz=16866.2, num_updates=8400, lr=0.000690066, gnorm=0.26, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=8308 epoch 005: 1661 / 1689 loss=4.365, nll_loss=2.768, ppl=6.81, wps=460552, ups=1.06, wpb=433230, bsz=16866.2, num_updates=8400, lr=0.000690066, gnorm=0.26, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=8308 epoch 005: 1661 / 1689 loss=4.365, nll_loss=2.768, ppl=6.81, wps=460552, ups=1.06, wpb=433230, bsz=16866.2, num_updates=8400, lr=0.000690066, gnorm=0.26, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=8308 epoch 005: 1661 / 1689 loss=4.365, nll_loss=2.768, ppl=6.81, wps=460552, ups=1.06, wpb=433230, bsz=16866.2, num_updates=8400, lr=0.000690066, gnorm=0.26, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=8308 end of epoch 5 (average epoch stats below) epoch 005 | loss 4.381 | nll_loss 2.783 | ppl 6.88 | wps 446700 | ups 1.03 | wpb 433535 | bsz 16506.4 | num_updates 8428 | lr 0.000688918 | gnorm 0.275 | clip 0.3 | loss_scale 0.25 | train_wall 1567 | gb_free 23.9 | wall 8334 epoch 005 | loss 4.381 | nll_loss 2.783 | ppl 6.88 | wps 446700 | ups 1.03 | wpb 433535 | bsz 16506.4 | num_updates 8428 | lr 0.000688918 | gnorm 0.275 | clip 0.3 | loss_scale 0.25 | train_wall 1567 | gb_free 23.9 | wall 8334 epoch 005 | loss 4.381 | nll_loss 2.783 | ppl 6.88 | wps 446700 | ups 1.03 | wpb 433535 | bsz 16506.4 | num_updates 8428 | lr 0.000688918 | gnorm 0.275 | clip 0.3 | loss_scale 0.25 | train_wall 1567 | gb_free 23.9 | wall 8334 epoch 005 | loss 4.381 | nll_loss 2.783 | ppl 6.88 | wps 446700 | ups 1.03 | wpb 433535 | bsz 16506.4 | num_updates 8428 | lr 0.000688918 | gnorm 0.275 | clip 0.3 | loss_scale 0.25 | train_wall 1567 | gb_free 23.9 | wall 8334 epoch 005 | loss 4.381 | nll_loss 2.783 | ppl 6.88 | wps 446700 | ups 1.03 | wpb 433535 | bsz 16506.4 | num_updates 8428 | lr 0.000688918 | gnorm 0.275 | clip 0.3 | loss_scale 0.25 | train_wall 1567 | gb_free 23.9 | wall 8334 Start iterating over samples epoch 006: 72 / 1689 loss=4.314, nll_loss=2.708, ppl=6.53, wps=456838, ups=1.06, wpb=429967, bsz=16095.3, num_updates=8500, lr=0.000685994, gnorm=0.265, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=8403 epoch 006: 72 / 1689 loss=4.314, nll_loss=2.708, ppl=6.53, wps=456838, ups=1.06, wpb=429967, bsz=16095.3, num_updates=8500, lr=0.000685994, gnorm=0.265, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=8403 epoch 006: 72 / 1689 loss=4.314, nll_loss=2.708, ppl=6.53, wps=456838, ups=1.06, wpb=429967, bsz=16095.3, num_updates=8500, lr=0.000685994, gnorm=0.265, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=8403 epoch 006: 72 / 1689 loss=4.314, nll_loss=2.708, ppl=6.53, wps=456838, ups=1.06, wpb=429967, bsz=16095.3, num_updates=8500, lr=0.000685994, gnorm=0.265, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=8403 epoch 006: 72 / 1689 loss=4.314, nll_loss=2.708, ppl=6.53, wps=456838, ups=1.06, wpb=429967, bsz=16095.3, num_updates=8500, lr=0.000685994, gnorm=0.265, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=8403 epoch 006: 72 / 1689 loss=4.314, nll_loss=2.708, ppl=6.53, wps=456838, ups=1.06, wpb=429967, bsz=16095.3, num_updates=8500, lr=0.000685994, gnorm=0.265, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=8403 epoch 006: 172 / 1689 loss=4.322, nll_loss=2.717, ppl=6.58, wps=457112, ups=1.05, wpb=434926, bsz=16380.4, num_updates=8600, lr=0.000681994, gnorm=0.258, clip=0, loss_scale=0.25, train_wall=94, gb_free=18.7, wall=8498 epoch 006: 172 / 1689 loss=4.322, nll_loss=2.717, ppl=6.58, wps=457112, ups=1.05, wpb=434926, bsz=16380.4, num_updates=8600, lr=0.000681994, gnorm=0.258, clip=0, loss_scale=0.25, train_wall=94, gb_free=18.7, wall=8498 epoch 006: 172 / 1689 loss=4.322, nll_loss=2.717, ppl=6.58, wps=457112, ups=1.05, wpb=434926, bsz=16380.4, num_updates=8600, lr=0.000681994, gnorm=0.258, clip=0, loss_scale=0.25, train_wall=94, gb_free=18.7, wall=8498 epoch 006: 172 / 1689 loss=4.322, nll_loss=2.717, ppl=6.58, wps=457112, ups=1.05, wpb=434926, bsz=16380.4, num_updates=8600, lr=0.000681994, gnorm=0.258, clip=0, loss_scale=0.25, train_wall=94, gb_free=18.7, wall=8498 epoch 006: 172 / 1689 loss=4.322, nll_loss=2.717, ppl=6.58, wps=457112, ups=1.05, wpb=434926, bsz=16380.4, num_updates=8600, lr=0.000681994, gnorm=0.258, clip=0, loss_scale=0.25, train_wall=94, gb_free=18.7, wall=8498 epoch 006: 172 / 1689 loss=4.322, nll_loss=2.717, ppl=6.58, wps=457112, ups=1.05, wpb=434926, bsz=16380.4, num_updates=8600, lr=0.000681994, gnorm=0.258, clip=0, loss_scale=0.25, train_wall=94, gb_free=18.7, wall=8498 epoch 006: 272 / 1689 loss=4.304, nll_loss=2.697, ppl=6.48, wps=457200, ups=1.06, wpb=432741, bsz=16590.4, num_updates=8700, lr=0.000678064, gnorm=0.252, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.1, wall=8592 epoch 006: 272 / 1689 loss=4.304, nll_loss=2.697, ppl=6.48, wps=457200, ups=1.06, wpb=432741, bsz=16590.4, num_updates=8700, lr=0.000678064, gnorm=0.252, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.1, wall=8592 epoch 006: 272 / 1689 loss=4.304, nll_loss=2.697, ppl=6.48, wps=457200, ups=1.06, wpb=432741, bsz=16590.4, num_updates=8700, lr=0.000678064, gnorm=0.252, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.1, wall=8592 epoch 006: 272 / 1689 loss=4.304, nll_loss=2.697, ppl=6.48, wps=457200, ups=1.06, wpb=432741, bsz=16590.4, num_updates=8700, lr=0.000678064, gnorm=0.252, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.1, wall=8592 epoch 006: 272 / 1689 loss=4.304, nll_loss=2.697, ppl=6.48, wps=457200, ups=1.06, wpb=432741, bsz=16590.4, num_updates=8700, lr=0.000678064, gnorm=0.252, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.1, wall=8592 epoch 006: 272 / 1689 loss=4.304, nll_loss=2.697, ppl=6.48, wps=457200, ups=1.06, wpb=432741, bsz=16590.4, num_updates=8700, lr=0.000678064, gnorm=0.252, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.1, wall=8592 epoch 006: 372 / 1689 loss=4.32, nll_loss=2.716, ppl=6.57, wps=457701, ups=1.06, wpb=433774, bsz=16354.2, num_updates=8800, lr=0.0006742, gnorm=0.25, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.6, wall=8687 epoch 006: 372 / 1689 loss=4.32, nll_loss=2.716, ppl=6.57, wps=457701, ups=1.06, wpb=433774, bsz=16354.2, num_updates=8800, lr=0.0006742, gnorm=0.25, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.6, wall=8687 epoch 006: 372 / 1689 loss=4.32, nll_loss=2.716, ppl=6.57, wps=457701, ups=1.06, wpb=433774, bsz=16354.2, num_updates=8800, lr=0.0006742, gnorm=0.25, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.6, wall=8687 epoch 006: 372 / 1689 loss=4.32, nll_loss=2.716, ppl=6.57, wps=457701, ups=1.06, wpb=433774, bsz=16354.2, num_updates=8800, lr=0.0006742, gnorm=0.25, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.6, wall=8687 epoch 006: 372 / 1689 loss=4.32, nll_loss=2.716, ppl=6.57, wps=457701, ups=1.06, wpb=433774, bsz=16354.2, num_updates=8800, lr=0.0006742, gnorm=0.25, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.6, wall=8687 epoch 006: 372 / 1689 loss=4.32, nll_loss=2.716, ppl=6.57, wps=457701, ups=1.06, wpb=433774, bsz=16354.2, num_updates=8800, lr=0.0006742, gnorm=0.25, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.6, wall=8687 epoch 006: 472 / 1689 loss=4.316, nll_loss=2.711, ppl=6.55, wps=456278, ups=1.06, wpb=432334, bsz=16377.8, num_updates=8900, lr=0.000670402, gnorm=0.282, clip=1, loss_scale=0.5, train_wall=93, gb_free=20.2, wall=8782 epoch 006: 472 / 1689 loss=4.316, nll_loss=2.711, ppl=6.55, wps=456278, ups=1.06, wpb=432334, bsz=16377.8, num_updates=8900, lr=0.000670402, gnorm=0.282, clip=1, loss_scale=0.5, train_wall=93, gb_free=20.2, wall=8782 epoch 006: 472 / 1689 loss=4.316, nll_loss=2.711, ppl=6.55, wps=456278, ups=1.06, wpb=432334, bsz=16377.8, num_updates=8900, lr=0.000670402, gnorm=0.282, clip=1, loss_scale=0.5, train_wall=93, gb_free=20.2, wall=8782 epoch 006: 472 / 1689 loss=4.316, nll_loss=2.711, ppl=6.55, wps=456278, ups=1.06, wpb=432334, bsz=16377.8, num_updates=8900, lr=0.000670402, gnorm=0.282, clip=1, loss_scale=0.5, train_wall=93, gb_free=20.2, wall=8782 epoch 006: 472 / 1689 loss=4.316, nll_loss=2.711, ppl=6.55, wps=456278, ups=1.06, wpb=432334, bsz=16377.8, num_updates=8900, lr=0.000670402, gnorm=0.282, clip=1, loss_scale=0.5, train_wall=93, gb_free=20.2, wall=8782 epoch 006: 472 / 1689 loss=4.316, nll_loss=2.711, ppl=6.55, wps=456278, ups=1.06, wpb=432334, bsz=16377.8, num_updates=8900, lr=0.000670402, gnorm=0.282, clip=1, loss_scale=0.5, train_wall=93, gb_free=20.2, wall=8782 epoch 006: 572 / 1689 loss=4.312, nll_loss=2.707, ppl=6.53, wps=455690, ups=1.05, wpb=434698, bsz=16682.6, num_updates=9000, lr=0.000666667, gnorm=0.266, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=8877 epoch 006: 572 / 1689 loss=4.312, nll_loss=2.707, ppl=6.53, wps=455690, ups=1.05, wpb=434698, bsz=16682.6, num_updates=9000, lr=0.000666667, gnorm=0.266, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=8877 epoch 006: 572 / 1689 loss=4.312, nll_loss=2.707, ppl=6.53, wps=455690, ups=1.05, wpb=434698, bsz=16682.6, num_updates=9000, lr=0.000666667, gnorm=0.266, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=8877 epoch 006: 572 / 1689 loss=4.312, nll_loss=2.707, ppl=6.53, wps=455690, ups=1.05, wpb=434698, bsz=16682.6, num_updates=9000, lr=0.000666667, gnorm=0.266, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=8877 epoch 006: 572 / 1689 loss=4.312, nll_loss=2.707, ppl=6.53, wps=455690, ups=1.05, wpb=434698, bsz=16682.6, num_updates=9000, lr=0.000666667, gnorm=0.266, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=8877 epoch 006: 572 / 1689 loss=4.312, nll_loss=2.707, ppl=6.53, wps=455690, ups=1.05, wpb=434698, bsz=16682.6, num_updates=9000, lr=0.000666667, gnorm=0.266, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=8877 begin validation on "valid" subset epoch 006 | valid on 'valid' subset | loss 4.402 | nll_loss 2.789 | ppl 6.91 | wps 0 | wpb 42662 | bsz 2032 | num_updates 9000 | best_loss 4.402 epoch 006 | valid on 'valid' subset | loss 4.402 | nll_loss 2.789 | ppl 6.91 | wps 0 | wpb 42662 | bsz 2032 | num_updates 9000 | best_loss 4.402 epoch 006 | valid on 'valid' subset | loss 4.402 | nll_loss 2.789 | ppl 6.91 | wps 0 | wpb 42662 | bsz 2032 | num_updates 9000 | best_loss 4.402 epoch 006 | valid on 'valid' subset | loss 4.402 | nll_loss 2.789 | ppl 6.91 | wps 0 | wpb 42662 | bsz 2032 | num_updates 9000 | best_loss 4.402 epoch 006 | valid on 'valid' subset | loss 4.402 | nll_loss 2.789 | ppl 6.91 | wps 0 | wpb 42662 | bsz 2032 | num_updates 9000 | best_loss 4.402 epoch 006 | valid on 'valid' subset | loss 4.402 | nll_loss 2.789 | ppl 6.91 | wps 0 | wpb 42662 | bsz 2032 | num_updates 9000 | best_loss 4.402 epoch 006: 672 / 1689 loss=4.314, nll_loss=2.71, ppl=6.54, wps=295927, ups=0.68, wpb=432371, bsz=16859.5, num_updates=9100, lr=0.000662994, gnorm=0.259, clip=1, loss_scale=0.5, train_wall=122, gb_free=19.8, wall=9023 epoch 006: 672 / 1689 loss=4.314, nll_loss=2.71, ppl=6.54, wps=295927, ups=0.68, wpb=432371, bsz=16859.5, num_updates=9100, lr=0.000662994, gnorm=0.259, clip=1, loss_scale=0.5, train_wall=122, gb_free=19.8, wall=9023 epoch 006: 672 / 1689 loss=4.314, nll_loss=2.71, ppl=6.54, wps=295927, ups=0.68, wpb=432371, bsz=16859.5, num_updates=9100, lr=0.000662994, gnorm=0.259, clip=1, loss_scale=0.5, train_wall=122, gb_free=19.8, wall=9023 epoch 006: 672 / 1689 loss=4.314, nll_loss=2.71, ppl=6.54, wps=295927, ups=0.68, wpb=432371, bsz=16859.5, num_updates=9100, lr=0.000662994, gnorm=0.259, clip=1, loss_scale=0.5, train_wall=122, gb_free=19.8, wall=9023 epoch 006: 672 / 1689 loss=4.314, nll_loss=2.71, ppl=6.54, wps=295927, ups=0.68, wpb=432371, bsz=16859.5, num_updates=9100, lr=0.000662994, gnorm=0.259, clip=1, loss_scale=0.5, train_wall=122, gb_free=19.8, wall=9023 epoch 006: 672 / 1689 loss=4.314, nll_loss=2.71, ppl=6.54, wps=295927, ups=0.68, wpb=432371, bsz=16859.5, num_updates=9100, lr=0.000662994, gnorm=0.259, clip=1, loss_scale=0.5, train_wall=122, gb_free=19.8, wall=9023 epoch 006: 772 / 1689 loss=4.311, nll_loss=2.706, ppl=6.52, wps=455340, ups=1.05, wpb=433253, bsz=16423, num_updates=9200, lr=0.00065938, gnorm=0.257, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=9119 epoch 006: 772 / 1689 loss=4.311, nll_loss=2.706, ppl=6.52, wps=455340, ups=1.05, wpb=433253, bsz=16423, num_updates=9200, lr=0.00065938, gnorm=0.257, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=9119 epoch 006: 772 / 1689 loss=4.311, nll_loss=2.706, ppl=6.52, wps=455340, ups=1.05, wpb=433253, bsz=16423, num_updates=9200, lr=0.00065938, gnorm=0.257, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=9119 epoch 006: 772 / 1689 loss=4.311, nll_loss=2.706, ppl=6.52, wps=455340, ups=1.05, wpb=433253, bsz=16423, num_updates=9200, lr=0.00065938, gnorm=0.257, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=9119 epoch 006: 772 / 1689 loss=4.311, nll_loss=2.706, ppl=6.52, wps=455340, ups=1.05, wpb=433253, bsz=16423, num_updates=9200, lr=0.00065938, gnorm=0.257, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=9119 epoch 006: 772 / 1689 loss=4.311, nll_loss=2.706, ppl=6.52, wps=455340, ups=1.05, wpb=433253, bsz=16423, num_updates=9200, lr=0.00065938, gnorm=0.257, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=9119 epoch 006: 872 / 1689 loss=4.326, nll_loss=2.724, ppl=6.61, wps=458748, ups=1.06, wpb=433967, bsz=16780.4, num_updates=9300, lr=0.000655826, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=9213 epoch 006: 872 / 1689 loss=4.326, nll_loss=2.724, ppl=6.61, wps=458748, ups=1.06, wpb=433967, bsz=16780.4, num_updates=9300, lr=0.000655826, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=9213 epoch 006: 872 / 1689 loss=4.326, nll_loss=2.724, ppl=6.61, wps=458748, ups=1.06, wpb=433967, bsz=16780.4, num_updates=9300, lr=0.000655826, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=9213 epoch 006: 872 / 1689 loss=4.326, nll_loss=2.724, ppl=6.61, wps=458748, ups=1.06, wpb=433967, bsz=16780.4, num_updates=9300, lr=0.000655826, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=9213 epoch 006: 872 / 1689 loss=4.326, nll_loss=2.724, ppl=6.61, wps=458748, ups=1.06, wpb=433967, bsz=16780.4, num_updates=9300, lr=0.000655826, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=9213 epoch 006: 872 / 1689 loss=4.326, nll_loss=2.724, ppl=6.61, wps=458748, ups=1.06, wpb=433967, bsz=16780.4, num_updates=9300, lr=0.000655826, gnorm=0.253, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=9213 epoch 006: 972 / 1689 loss=4.314, nll_loss=2.71, ppl=6.54, wps=459984, ups=1.06, wpb=434407, bsz=16331.1, num_updates=9400, lr=0.000652328, gnorm=0.25, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=9308 epoch 006: 972 / 1689 loss=4.314, nll_loss=2.71, ppl=6.54, wps=459984, ups=1.06, wpb=434407, bsz=16331.1, num_updates=9400, lr=0.000652328, gnorm=0.25, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=9308 epoch 006: 972 / 1689 loss=4.314, nll_loss=2.71, ppl=6.54, wps=459984, ups=1.06, wpb=434407, bsz=16331.1, num_updates=9400, lr=0.000652328, gnorm=0.25, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=9308 epoch 006: 972 / 1689 loss=4.314, nll_loss=2.71, ppl=6.54, wps=459984, ups=1.06, wpb=434407, bsz=16331.1, num_updates=9400, lr=0.000652328, gnorm=0.25, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=9308 epoch 006: 972 / 1689 loss=4.314, nll_loss=2.71, ppl=6.54, wps=459984, ups=1.06, wpb=434407, bsz=16331.1, num_updates=9400, lr=0.000652328, gnorm=0.25, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=9308 epoch 006: 972 / 1689 loss=4.314, nll_loss=2.71, ppl=6.54, wps=459984, ups=1.06, wpb=434407, bsz=16331.1, num_updates=9400, lr=0.000652328, gnorm=0.25, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=9308 epoch 006: 1072 / 1689 loss=4.309, nll_loss=2.705, ppl=6.52, wps=458899, ups=1.06, wpb=432759, bsz=16424.2, num_updates=9500, lr=0.000648886, gnorm=0.262, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=9402 epoch 006: 1072 / 1689 loss=4.309, nll_loss=2.705, ppl=6.52, wps=458899, ups=1.06, wpb=432759, bsz=16424.2, num_updates=9500, lr=0.000648886, gnorm=0.262, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=9402 epoch 006: 1072 / 1689 loss=4.309, nll_loss=2.705, ppl=6.52, wps=458899, ups=1.06, wpb=432759, bsz=16424.2, num_updates=9500, lr=0.000648886, gnorm=0.262, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=9402 epoch 006: 1072 / 1689 loss=4.309, nll_loss=2.705, ppl=6.52, wps=458899, ups=1.06, wpb=432759, bsz=16424.2, num_updates=9500, lr=0.000648886, gnorm=0.262, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=9402 epoch 006: 1072 / 1689 loss=4.309, nll_loss=2.705, ppl=6.52, wps=458899, ups=1.06, wpb=432759, bsz=16424.2, num_updates=9500, lr=0.000648886, gnorm=0.262, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=9402 epoch 006: 1072 / 1689 loss=4.309, nll_loss=2.705, ppl=6.52, wps=458899, ups=1.06, wpb=432759, bsz=16424.2, num_updates=9500, lr=0.000648886, gnorm=0.262, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=9402 epoch 006: 1172 / 1689 loss=4.309, nll_loss=2.705, ppl=6.52, wps=459407, ups=1.06, wpb=434441, bsz=16648.2, num_updates=9600, lr=0.000645497, gnorm=0.257, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=9496 epoch 006: 1172 / 1689 loss=4.309, nll_loss=2.705, ppl=6.52, wps=459407, ups=1.06, wpb=434441, bsz=16648.2, num_updates=9600, lr=0.000645497, gnorm=0.257, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=9496 epoch 006: 1172 / 1689 loss=4.309, nll_loss=2.705, ppl=6.52, wps=459407, ups=1.06, wpb=434441, bsz=16648.2, num_updates=9600, lr=0.000645497, gnorm=0.257, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=9496 epoch 006: 1172 / 1689 loss=4.309, nll_loss=2.705, ppl=6.52, wps=459407, ups=1.06, wpb=434441, bsz=16648.2, num_updates=9600, lr=0.000645497, gnorm=0.257, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=9496 epoch 006: 1172 / 1689 loss=4.309, nll_loss=2.705, ppl=6.52, wps=459407, ups=1.06, wpb=434441, bsz=16648.2, num_updates=9600, lr=0.000645497, gnorm=0.257, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=9496 epoch 006: 1172 / 1689 loss=4.309, nll_loss=2.705, ppl=6.52, wps=459407, ups=1.06, wpb=434441, bsz=16648.2, num_updates=9600, lr=0.000645497, gnorm=0.257, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=9496 epoch 006: 1272 / 1689 loss=4.302, nll_loss=2.697, ppl=6.49, wps=456973, ups=1.05, wpb=433171, bsz=16270.3, num_updates=9700, lr=0.000642161, gnorm=0.245, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=9591 epoch 006: 1272 / 1689 loss=4.302, nll_loss=2.697, ppl=6.49, wps=456973, ups=1.05, wpb=433171, bsz=16270.3, num_updates=9700, lr=0.000642161, gnorm=0.245, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=9591 epoch 006: 1272 / 1689 loss=4.302, nll_loss=2.697, ppl=6.49, wps=456973, ups=1.05, wpb=433171, bsz=16270.3, num_updates=9700, lr=0.000642161, gnorm=0.245, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=9591 epoch 006: 1272 / 1689 loss=4.302, nll_loss=2.697, ppl=6.49, wps=456973, ups=1.05, wpb=433171, bsz=16270.3, num_updates=9700, lr=0.000642161, gnorm=0.245, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=9591 epoch 006: 1272 / 1689 loss=4.302, nll_loss=2.697, ppl=6.49, wps=456973, ups=1.05, wpb=433171, bsz=16270.3, num_updates=9700, lr=0.000642161, gnorm=0.245, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=9591 epoch 006: 1272 / 1689 loss=4.302, nll_loss=2.697, ppl=6.49, wps=456973, ups=1.05, wpb=433171, bsz=16270.3, num_updates=9700, lr=0.000642161, gnorm=0.245, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=9591 epoch 006: 1372 / 1689 loss=4.311, nll_loss=2.708, ppl=6.53, wps=456514, ups=1.05, wpb=432846, bsz=16658.9, num_updates=9800, lr=0.000638877, gnorm=0.26, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=9686 epoch 006: 1372 / 1689 loss=4.311, nll_loss=2.708, ppl=6.53, wps=456514, ups=1.05, wpb=432846, bsz=16658.9, num_updates=9800, lr=0.000638877, gnorm=0.26, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=9686 epoch 006: 1372 / 1689 loss=4.311, nll_loss=2.708, ppl=6.53, wps=456514, ups=1.05, wpb=432846, bsz=16658.9, num_updates=9800, lr=0.000638877, gnorm=0.26, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=9686 epoch 006: 1372 / 1689 loss=4.311, nll_loss=2.708, ppl=6.53, wps=456514, ups=1.05, wpb=432846, bsz=16658.9, num_updates=9800, lr=0.000638877, gnorm=0.26, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=9686 epoch 006: 1372 / 1689 loss=4.311, nll_loss=2.708, ppl=6.53, wps=456514, ups=1.05, wpb=432846, bsz=16658.9, num_updates=9800, lr=0.000638877, gnorm=0.26, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=9686 epoch 006: 1372 / 1689 loss=4.311, nll_loss=2.708, ppl=6.53, wps=456514, ups=1.05, wpb=432846, bsz=16658.9, num_updates=9800, lr=0.000638877, gnorm=0.26, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=9686 epoch 006: 1472 / 1689 loss=4.313, nll_loss=2.71, ppl=6.54, wps=460477, ups=1.05, wpb=436472, bsz=16306.9, num_updates=9900, lr=0.000635642, gnorm=0.245, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=9781 epoch 006: 1472 / 1689 loss=4.313, nll_loss=2.71, ppl=6.54, wps=460477, ups=1.05, wpb=436472, bsz=16306.9, num_updates=9900, lr=0.000635642, gnorm=0.245, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=9781 epoch 006: 1472 / 1689 loss=4.313, nll_loss=2.71, ppl=6.54, wps=460477, ups=1.05, wpb=436472, bsz=16306.9, num_updates=9900, lr=0.000635642, gnorm=0.245, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=9781 epoch 006: 1472 / 1689 loss=4.313, nll_loss=2.71, ppl=6.54, wps=460477, ups=1.05, wpb=436472, bsz=16306.9, num_updates=9900, lr=0.000635642, gnorm=0.245, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=9781 epoch 006: 1472 / 1689 loss=4.313, nll_loss=2.71, ppl=6.54, wps=460477, ups=1.05, wpb=436472, bsz=16306.9, num_updates=9900, lr=0.000635642, gnorm=0.245, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=9781 epoch 006: 1472 / 1689 loss=4.313, nll_loss=2.71, ppl=6.54, wps=460477, ups=1.05, wpb=436472, bsz=16306.9, num_updates=9900, lr=0.000635642, gnorm=0.245, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=9781 epoch 006: 1572 / 1689 loss=4.308, nll_loss=2.705, ppl=6.52, wps=459853, ups=1.06, wpb=435770, bsz=16715.4, num_updates=10000, lr=0.000632456, gnorm=0.25, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=9876 epoch 006: 1572 / 1689 loss=4.308, nll_loss=2.705, ppl=6.52, wps=459853, ups=1.06, wpb=435770, bsz=16715.4, num_updates=10000, lr=0.000632456, gnorm=0.25, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=9876 epoch 006: 1572 / 1689 loss=4.308, nll_loss=2.705, ppl=6.52, wps=459853, ups=1.06, wpb=435770, bsz=16715.4, num_updates=10000, lr=0.000632456, gnorm=0.25, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=9876 epoch 006: 1572 / 1689 loss=4.308, nll_loss=2.705, ppl=6.52, wps=459853, ups=1.06, wpb=435770, bsz=16715.4, num_updates=10000, lr=0.000632456, gnorm=0.25, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=9876 epoch 006: 1572 / 1689 loss=4.308, nll_loss=2.705, ppl=6.52, wps=459853, ups=1.06, wpb=435770, bsz=16715.4, num_updates=10000, lr=0.000632456, gnorm=0.25, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=9876 epoch 006: 1572 / 1689 loss=4.308, nll_loss=2.705, ppl=6.52, wps=459853, ups=1.06, wpb=435770, bsz=16715.4, num_updates=10000, lr=0.000632456, gnorm=0.25, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=9876 begin validation on "valid" subset epoch 006 | valid on 'valid' subset | loss 4.362 | nll_loss 2.75 | ppl 6.73 | wps 0 | wpb 42662 | bsz 2032 | num_updates 10000 | best_loss 4.362 epoch 006 | valid on 'valid' subset | loss 4.362 | nll_loss 2.75 | ppl 6.73 | wps 0 | wpb 42662 | bsz 2032 | num_updates 10000 | best_loss 4.362 epoch 006 | valid on 'valid' subset | loss 4.362 | nll_loss 2.75 | ppl 6.73 | wps 0 | wpb 42662 | bsz 2032 | num_updates 10000 | best_loss 4.362 epoch 006 | valid on 'valid' subset | loss 4.362 | nll_loss 2.75 | ppl 6.73 | wps 0 | wpb 42662 | bsz 2032 | num_updates 10000 | best_loss 4.362 epoch 006 | valid on 'valid' subset | loss 4.362 | nll_loss 2.75 | ppl 6.73 | wps 0 | wpb 42662 | bsz 2032 | num_updates 10000 | best_loss 4.362 epoch 006 | valid on 'valid' subset | loss 4.362 | nll_loss 2.75 | ppl 6.73 | wps 0 | wpb 42662 | bsz 2032 | num_updates 10000 | best_loss 4.362 epoch 006: 1672 / 1689 loss=4.291, nll_loss=2.687, ppl=6.44, wps=373386, ups=0.86, wpb=432330, bsz=16614.2, num_updates=10100, lr=0.000629317, gnorm=0.254, clip=0, loss_scale=2, train_wall=94, gb_free=18.1, wall=9991 epoch 006: 1672 / 1689 loss=4.291, nll_loss=2.687, ppl=6.44, wps=373386, ups=0.86, wpb=432330, bsz=16614.2, num_updates=10100, lr=0.000629317, gnorm=0.254, clip=0, loss_scale=2, train_wall=94, gb_free=18.1, wall=9991 epoch 006: 1672 / 1689 loss=4.291, nll_loss=2.687, ppl=6.44, wps=373386, ups=0.86, wpb=432330, bsz=16614.2, num_updates=10100, lr=0.000629317, gnorm=0.254, clip=0, loss_scale=2, train_wall=94, gb_free=18.1, wall=9991 epoch 006: 1672 / 1689 loss=4.291, nll_loss=2.687, ppl=6.44, wps=373386, ups=0.86, wpb=432330, bsz=16614.2, num_updates=10100, lr=0.000629317, gnorm=0.254, clip=0, loss_scale=2, train_wall=94, gb_free=18.1, wall=9991 epoch 006: 1672 / 1689 loss=4.291, nll_loss=2.687, ppl=6.44, wps=373386, ups=0.86, wpb=432330, bsz=16614.2, num_updates=10100, lr=0.000629317, gnorm=0.254, clip=0, loss_scale=2, train_wall=94, gb_free=18.1, wall=9991 epoch 006: 1672 / 1689 loss=4.291, nll_loss=2.687, ppl=6.44, wps=373386, ups=0.86, wpb=432330, bsz=16614.2, num_updates=10100, lr=0.000629317, gnorm=0.254, clip=0, loss_scale=2, train_wall=94, gb_free=18.1, wall=9991 end of epoch 6 (average epoch stats below) epoch 006 | loss 4.31 | nll_loss 2.706 | ppl 6.53 | wps 437692 | ups 1.01 | wpb 433524 | bsz 16507 | num_updates 10117 | lr 0.000628788 | gnorm 0.257 | clip 0.1 | loss_scale 2 | train_wall 1603 | gb_free 22.5 | wall 10006 epoch 006 | loss 4.31 | nll_loss 2.706 | ppl 6.53 | wps 437692 | ups 1.01 | wpb 433524 | bsz 16507 | num_updates 10117 | lr 0.000628788 | gnorm 0.257 | clip 0.1 | loss_scale 2 | train_wall 1603 | gb_free 22.5 | wall 10006 epoch 006 | loss 4.31 | nll_loss 2.706 | ppl 6.53 | wps 437692 | ups 1.01 | wpb 433524 | bsz 16507 | num_updates 10117 | lr 0.000628788 | gnorm 0.257 | clip 0.1 | loss_scale 2 | train_wall 1603 | gb_free 22.5 | wall 10006 epoch 006 | loss 4.31 | nll_loss 2.706 | ppl 6.53 | wps 437692 | ups 1.01 | wpb 433524 | bsz 16507 | num_updates 10117 | lr 0.000628788 | gnorm 0.257 | clip 0.1 | loss_scale 2 | train_wall 1603 | gb_free 22.5 | wall 10006 epoch 006 | loss 4.31 | nll_loss 2.706 | ppl 6.53 | wps 437692 | ups 1.01 | wpb 433524 | bsz 16507 | num_updates 10117 | lr 0.000628788 | gnorm 0.257 | clip 0.1 | loss_scale 2 | train_wall 1603 | gb_free 22.5 | wall 10006 epoch 006 | loss 4.31 | nll_loss 2.706 | ppl 6.53 | wps 437692 | ups 1.01 | wpb 433524 | bsz 16507 | num_updates 10117 | lr 0.000628788 | gnorm 0.257 | clip 0.1 | loss_scale 2 | train_wall 1603 | gb_free 22.5 | wall 10006 Start iterating over samples epoch 007: 83 / 1689 loss=4.257, nll_loss=2.646, ppl=6.26, wps=447501, ups=1.04, wpb=429530, bsz=16339.7, num_updates=10200, lr=0.000626224, gnorm=0.25, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=10087 epoch 007: 83 / 1689 loss=4.257, nll_loss=2.646, ppl=6.26, wps=447501, ups=1.04, wpb=429530, bsz=16339.7, num_updates=10200, lr=0.000626224, gnorm=0.25, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=10087 epoch 007: 83 / 1689 loss=4.257, nll_loss=2.646, ppl=6.26, wps=447501, ups=1.04, wpb=429530, bsz=16339.7, num_updates=10200, lr=0.000626224, gnorm=0.25, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=10087 epoch 007: 83 / 1689 loss=4.257, nll_loss=2.646, ppl=6.26, wps=447501, ups=1.04, wpb=429530, bsz=16339.7, num_updates=10200, lr=0.000626224, gnorm=0.25, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=10087 epoch 007: 83 / 1689 loss=4.257, nll_loss=2.646, ppl=6.26, wps=447501, ups=1.04, wpb=429530, bsz=16339.7, num_updates=10200, lr=0.000626224, gnorm=0.25, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=10087 epoch 007: 83 / 1689 loss=4.257, nll_loss=2.646, ppl=6.26, wps=447501, ups=1.04, wpb=429530, bsz=16339.7, num_updates=10200, lr=0.000626224, gnorm=0.25, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=10087 epoch 007: 83 / 1689 loss=4.257, nll_loss=2.646, ppl=6.26, wps=447501, ups=1.04, wpb=429530, bsz=16339.7, num_updates=10200, lr=0.000626224, gnorm=0.25, clip=0, loss_scale=2, train_wall=92, gb_free=19.7, wall=10087 epoch 007: 183 / 1689 loss=4.248, nll_loss=2.636, ppl=6.21, wps=457221, ups=1.05, wpb=433890, bsz=16604.3, num_updates=10300, lr=0.000623177, gnorm=0.251, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=10182 epoch 007: 183 / 1689 loss=4.248, nll_loss=2.636, ppl=6.21, wps=457221, ups=1.05, wpb=433890, bsz=16604.3, num_updates=10300, lr=0.000623177, gnorm=0.251, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=10182 epoch 007: 183 / 1689 loss=4.248, nll_loss=2.636, ppl=6.21, wps=457221, ups=1.05, wpb=433890, bsz=16604.3, num_updates=10300, lr=0.000623177, gnorm=0.251, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=10182 epoch 007: 183 / 1689 loss=4.248, nll_loss=2.636, ppl=6.21, wps=457221, ups=1.05, wpb=433890, bsz=16604.3, num_updates=10300, lr=0.000623177, gnorm=0.251, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=10182 epoch 007: 183 / 1689 loss=4.248, nll_loss=2.636, ppl=6.21, wps=457221, ups=1.05, wpb=433890, bsz=16604.3, num_updates=10300, lr=0.000623177, gnorm=0.251, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=10182 epoch 007: 183 / 1689 loss=4.248, nll_loss=2.636, ppl=6.21, wps=457221, ups=1.05, wpb=433890, bsz=16604.3, num_updates=10300, lr=0.000623177, gnorm=0.251, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=10182 epoch 007: 183 / 1689 loss=4.248, nll_loss=2.636, ppl=6.21, wps=457221, ups=1.05, wpb=433890, bsz=16604.3, num_updates=10300, lr=0.000623177, gnorm=0.251, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=10182 epoch 007: 283 / 1689 loss=4.256, nll_loss=2.645, ppl=6.26, wps=457650, ups=1.05, wpb=434226, bsz=16546.1, num_updates=10400, lr=0.000620174, gnorm=0.246, clip=0, loss_scale=4, train_wall=93, gb_free=19.3, wall=10277 epoch 007: 283 / 1689 loss=4.256, nll_loss=2.645, ppl=6.26, wps=457650, ups=1.05, wpb=434226, bsz=16546.1, num_updates=10400, lr=0.000620174, gnorm=0.246, clip=0, loss_scale=4, train_wall=93, gb_free=19.3, wall=10277 epoch 007: 283 / 1689 loss=4.256, nll_loss=2.645, ppl=6.26, wps=457650, ups=1.05, wpb=434226, bsz=16546.1, num_updates=10400, lr=0.000620174, gnorm=0.246, clip=0, loss_scale=4, train_wall=93, gb_free=19.3, wall=10277 epoch 007: 283 / 1689 loss=4.256, nll_loss=2.645, ppl=6.26, wps=457650, ups=1.05, wpb=434226, bsz=16546.1, num_updates=10400, lr=0.000620174, gnorm=0.246, clip=0, loss_scale=4, train_wall=93, gb_free=19.3, wall=10277 epoch 007: 283 / 1689 loss=4.256, nll_loss=2.645, ppl=6.26, wps=457650, ups=1.05, wpb=434226, bsz=16546.1, num_updates=10400, lr=0.000620174, gnorm=0.246, clip=0, loss_scale=4, train_wall=93, gb_free=19.3, wall=10277 epoch 007: 283 / 1689 loss=4.256, nll_loss=2.645, ppl=6.26, wps=457650, ups=1.05, wpb=434226, bsz=16546.1, num_updates=10400, lr=0.000620174, gnorm=0.246, clip=0, loss_scale=4, train_wall=93, gb_free=19.3, wall=10277 epoch 007: 283 / 1689 loss=4.256, nll_loss=2.645, ppl=6.26, wps=457650, ups=1.05, wpb=434226, bsz=16546.1, num_updates=10400, lr=0.000620174, gnorm=0.246, clip=0, loss_scale=4, train_wall=93, gb_free=19.3, wall=10277 epoch 007: 384 / 1689 loss=4.265, nll_loss=2.655, ppl=6.3, wps=457645, ups=1.05, wpb=434973, bsz=16076.7, num_updates=10500, lr=0.000617213, gnorm=0.239, clip=0, loss_scale=2, train_wall=94, gb_free=18.2, wall=10372 epoch 007: 384 / 1689 loss=4.265, nll_loss=2.655, ppl=6.3, wps=457645, ups=1.05, wpb=434973, bsz=16076.7, num_updates=10500, lr=0.000617213, gnorm=0.239, clip=0, loss_scale=2, train_wall=94, gb_free=18.2, wall=10372 epoch 007: 384 / 1689 loss=4.265, nll_loss=2.655, ppl=6.3, wps=457645, ups=1.05, wpb=434973, bsz=16076.7, num_updates=10500, lr=0.000617213, gnorm=0.239, clip=0, loss_scale=2, train_wall=94, gb_free=18.2, wall=10372 epoch 007: 384 / 1689 loss=4.265, nll_loss=2.655, ppl=6.3, wps=457645, ups=1.05, wpb=434973, bsz=16076.7, num_updates=10500, lr=0.000617213, gnorm=0.239, clip=0, loss_scale=2, train_wall=94, gb_free=18.2, wall=10372 epoch 007: 384 / 1689 loss=4.265, nll_loss=2.655, ppl=6.3, wps=457645, ups=1.05, wpb=434973, bsz=16076.7, num_updates=10500, lr=0.000617213, gnorm=0.239, clip=0, loss_scale=2, train_wall=94, gb_free=18.2, wall=10372 epoch 007: 384 / 1689 loss=4.265, nll_loss=2.655, ppl=6.3, wps=457645, ups=1.05, wpb=434973, bsz=16076.7, num_updates=10500, lr=0.000617213, gnorm=0.239, clip=0, loss_scale=2, train_wall=94, gb_free=18.2, wall=10372 epoch 007: 384 / 1689 loss=4.265, nll_loss=2.655, ppl=6.3, wps=457645, ups=1.05, wpb=434973, bsz=16076.7, num_updates=10500, lr=0.000617213, gnorm=0.239, clip=0, loss_scale=2, train_wall=94, gb_free=18.2, wall=10372 epoch 007: 484 / 1689 loss=4.258, nll_loss=2.647, ppl=6.27, wps=459906, ups=1.06, wpb=433566, bsz=16515, num_updates=10600, lr=0.000614295, gnorm=0.245, clip=0, loss_scale=2, train_wall=92, gb_free=17.9, wall=10466 epoch 007: 484 / 1689 loss=4.258, nll_loss=2.647, ppl=6.27, wps=459906, ups=1.06, wpb=433566, bsz=16515, num_updates=10600, lr=0.000614295, gnorm=0.245, clip=0, loss_scale=2, train_wall=92, gb_free=17.9, wall=10466 epoch 007: 484 / 1689 loss=4.258, nll_loss=2.647, ppl=6.27, wps=459906, ups=1.06, wpb=433566, bsz=16515, num_updates=10600, lr=0.000614295, gnorm=0.245, clip=0, loss_scale=2, train_wall=92, gb_free=17.9, wall=10466 epoch 007: 484 / 1689 loss=4.258, nll_loss=2.647, ppl=6.27, wps=459906, ups=1.06, wpb=433566, bsz=16515, num_updates=10600, lr=0.000614295, gnorm=0.245, clip=0, loss_scale=2, train_wall=92, gb_free=17.9, wall=10466 epoch 007: 484 / 1689 loss=4.258, nll_loss=2.647, ppl=6.27, wps=459906, ups=1.06, wpb=433566, bsz=16515, num_updates=10600, lr=0.000614295, gnorm=0.245, clip=0, loss_scale=2, train_wall=92, gb_free=17.9, wall=10466 epoch 007: 484 / 1689 loss=4.258, nll_loss=2.647, ppl=6.27, wps=459906, ups=1.06, wpb=433566, bsz=16515, num_updates=10600, lr=0.000614295, gnorm=0.245, clip=0, loss_scale=2, train_wall=92, gb_free=17.9, wall=10466 epoch 007: 484 / 1689 loss=4.258, nll_loss=2.647, ppl=6.27, wps=459906, ups=1.06, wpb=433566, bsz=16515, num_updates=10600, lr=0.000614295, gnorm=0.245, clip=0, loss_scale=2, train_wall=92, gb_free=17.9, wall=10466 epoch 007: 584 / 1689 loss=4.252, nll_loss=2.641, ppl=6.24, wps=456341, ups=1.06, wpb=431769, bsz=16446.7, num_updates=10700, lr=0.000611418, gnorm=0.252, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=10561 epoch 007: 584 / 1689 loss=4.252, nll_loss=2.641, ppl=6.24, wps=456341, ups=1.06, wpb=431769, bsz=16446.7, num_updates=10700, lr=0.000611418, gnorm=0.252, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=10561 epoch 007: 584 / 1689 loss=4.252, nll_loss=2.641, ppl=6.24, wps=456341, ups=1.06, wpb=431769, bsz=16446.7, num_updates=10700, lr=0.000611418, gnorm=0.252, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=10561 epoch 007: 584 / 1689 loss=4.252, nll_loss=2.641, ppl=6.24, wps=456341, ups=1.06, wpb=431769, bsz=16446.7, num_updates=10700, lr=0.000611418, gnorm=0.252, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=10561 epoch 007: 584 / 1689 loss=4.252, nll_loss=2.641, ppl=6.24, wps=456341, ups=1.06, wpb=431769, bsz=16446.7, num_updates=10700, lr=0.000611418, gnorm=0.252, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=10561 epoch 007: 584 / 1689 loss=4.252, nll_loss=2.641, ppl=6.24, wps=456341, ups=1.06, wpb=431769, bsz=16446.7, num_updates=10700, lr=0.000611418, gnorm=0.252, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=10561 epoch 007: 584 / 1689 loss=4.252, nll_loss=2.641, ppl=6.24, wps=456341, ups=1.06, wpb=431769, bsz=16446.7, num_updates=10700, lr=0.000611418, gnorm=0.252, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=10561 epoch 007: 684 / 1689 loss=4.256, nll_loss=2.646, ppl=6.26, wps=457777, ups=1.05, wpb=434025, bsz=16372.6, num_updates=10800, lr=0.000608581, gnorm=0.247, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=10656 epoch 007: 684 / 1689 loss=4.256, nll_loss=2.646, ppl=6.26, wps=457777, ups=1.05, wpb=434025, bsz=16372.6, num_updates=10800, lr=0.000608581, gnorm=0.247, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=10656 epoch 007: 684 / 1689 loss=4.256, nll_loss=2.646, ppl=6.26, wps=457777, ups=1.05, wpb=434025, bsz=16372.6, num_updates=10800, lr=0.000608581, gnorm=0.247, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=10656 epoch 007: 684 / 1689 loss=4.256, nll_loss=2.646, ppl=6.26, wps=457777, ups=1.05, wpb=434025, bsz=16372.6, num_updates=10800, lr=0.000608581, gnorm=0.247, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=10656 epoch 007: 684 / 1689 loss=4.256, nll_loss=2.646, ppl=6.26, wps=457777, ups=1.05, wpb=434025, bsz=16372.6, num_updates=10800, lr=0.000608581, gnorm=0.247, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=10656 epoch 007: 684 / 1689 loss=4.256, nll_loss=2.646, ppl=6.26, wps=457777, ups=1.05, wpb=434025, bsz=16372.6, num_updates=10800, lr=0.000608581, gnorm=0.247, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=10656 epoch 007: 684 / 1689 loss=4.256, nll_loss=2.646, ppl=6.26, wps=457777, ups=1.05, wpb=434025, bsz=16372.6, num_updates=10800, lr=0.000608581, gnorm=0.247, clip=0, loss_scale=2, train_wall=93, gb_free=18.7, wall=10656 epoch 007: 784 / 1689 loss=4.271, nll_loss=2.664, ppl=6.34, wps=458853, ups=1.06, wpb=433331, bsz=16368, num_updates=10900, lr=0.000605783, gnorm=0.244, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=10750 epoch 007: 784 / 1689 loss=4.271, nll_loss=2.664, ppl=6.34, wps=458853, ups=1.06, wpb=433331, bsz=16368, num_updates=10900, lr=0.000605783, gnorm=0.244, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=10750 epoch 007: 784 / 1689 loss=4.271, nll_loss=2.664, ppl=6.34, wps=458853, ups=1.06, wpb=433331, bsz=16368, num_updates=10900, lr=0.000605783, gnorm=0.244, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=10750 epoch 007: 784 / 1689 loss=4.271, nll_loss=2.664, ppl=6.34, wps=458853, ups=1.06, wpb=433331, bsz=16368, num_updates=10900, lr=0.000605783, gnorm=0.244, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=10750 epoch 007: 784 / 1689 loss=4.271, nll_loss=2.664, ppl=6.34, wps=458853, ups=1.06, wpb=433331, bsz=16368, num_updates=10900, lr=0.000605783, gnorm=0.244, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=10750 epoch 007: 784 / 1689 loss=4.271, nll_loss=2.664, ppl=6.34, wps=458853, ups=1.06, wpb=433331, bsz=16368, num_updates=10900, lr=0.000605783, gnorm=0.244, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=10750 epoch 007: 784 / 1689 loss=4.271, nll_loss=2.664, ppl=6.34, wps=458853, ups=1.06, wpb=433331, bsz=16368, num_updates=10900, lr=0.000605783, gnorm=0.244, clip=0, loss_scale=2, train_wall=93, gb_free=19.5, wall=10750 epoch 007: 885 / 1689 loss=4.271, nll_loss=2.663, ppl=6.34, wps=451740, ups=1.04, wpb=432988, bsz=16412.2, num_updates=11000, lr=0.000603023, gnorm=0.245, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=10846 epoch 007: 885 / 1689 loss=4.271, nll_loss=2.663, ppl=6.34, wps=451740, ups=1.04, wpb=432988, bsz=16412.2, num_updates=11000, lr=0.000603023, gnorm=0.245, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=10846 epoch 007: 885 / 1689 loss=4.271, nll_loss=2.663, ppl=6.34, wps=451740, ups=1.04, wpb=432988, bsz=16412.2, num_updates=11000, lr=0.000603023, gnorm=0.245, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=10846 epoch 007: 885 / 1689 loss=4.271, nll_loss=2.663, ppl=6.34, wps=451740, ups=1.04, wpb=432988, bsz=16412.2, num_updates=11000, lr=0.000603023, gnorm=0.245, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=10846 epoch 007: 885 / 1689 loss=4.271, nll_loss=2.663, ppl=6.34, wps=451740, ups=1.04, wpb=432988, bsz=16412.2, num_updates=11000, lr=0.000603023, gnorm=0.245, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=10846 epoch 007: 885 / 1689 loss=4.271, nll_loss=2.663, ppl=6.34, wps=451740, ups=1.04, wpb=432988, bsz=16412.2, num_updates=11000, lr=0.000603023, gnorm=0.245, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=10846 epoch 007: 885 / 1689 loss=4.271, nll_loss=2.663, ppl=6.34, wps=451740, ups=1.04, wpb=432988, bsz=16412.2, num_updates=11000, lr=0.000603023, gnorm=0.245, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=10846 begin validation on "valid" subset epoch 007 | valid on 'valid' subset | loss 4.349 | nll_loss 2.732 | ppl 6.64 | wps 0 | wpb 42662 | bsz 2032 | num_updates 11000 | best_loss 4.349 epoch 007 | valid on 'valid' subset | loss 4.349 | nll_loss 2.732 | ppl 6.64 | wps 0 | wpb 42662 | bsz 2032 | num_updates 11000 | best_loss 4.349 epoch 007 | valid on 'valid' subset | loss 4.349 | nll_loss 2.732 | ppl 6.64 | wps 0 | wpb 42662 | bsz 2032 | num_updates 11000 | best_loss 4.349 epoch 007 | valid on 'valid' subset | loss 4.349 | nll_loss 2.732 | ppl 6.64 | wps 0 | wpb 42662 | bsz 2032 | num_updates 11000 | best_loss 4.349 epoch 007 | valid on 'valid' subset | loss 4.349 | nll_loss 2.732 | ppl 6.64 | wps 0 | wpb 42662 | bsz 2032 | num_updates 11000 | best_loss 4.349 epoch 007 | valid on 'valid' subset | loss 4.349 | nll_loss 2.732 | ppl 6.64 | wps 0 | wpb 42662 | bsz 2032 | num_updates 11000 | best_loss 4.349 epoch 007 | valid on 'valid' subset | loss 4.349 | nll_loss 2.732 | ppl 6.64 | wps 0 | wpb 42662 | bsz 2032 | num_updates 11000 | best_loss 4.349 epoch 007: 985 / 1689 loss=4.255, nll_loss=2.646, ppl=6.26, wps=382828, ups=0.88, wpb=434408, bsz=16309.8, num_updates=11100, lr=0.0006003, gnorm=0.265, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=10960 epoch 007: 985 / 1689 loss=4.255, nll_loss=2.646, ppl=6.26, wps=382828, ups=0.88, wpb=434408, bsz=16309.8, num_updates=11100, lr=0.0006003, gnorm=0.265, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=10960 epoch 007: 985 / 1689 loss=4.255, nll_loss=2.646, ppl=6.26, wps=382828, ups=0.88, wpb=434408, bsz=16309.8, num_updates=11100, lr=0.0006003, gnorm=0.265, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=10960 epoch 007: 985 / 1689 loss=4.255, nll_loss=2.646, ppl=6.26, wps=382828, ups=0.88, wpb=434408, bsz=16309.8, num_updates=11100, lr=0.0006003, gnorm=0.265, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=10960 epoch 007: 985 / 1689 loss=4.255, nll_loss=2.646, ppl=6.26, wps=382828, ups=0.88, wpb=434408, bsz=16309.8, num_updates=11100, lr=0.0006003, gnorm=0.265, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=10960 epoch 007: 985 / 1689 loss=4.255, nll_loss=2.646, ppl=6.26, wps=382828, ups=0.88, wpb=434408, bsz=16309.8, num_updates=11100, lr=0.0006003, gnorm=0.265, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=10960 epoch 007: 985 / 1689 loss=4.255, nll_loss=2.646, ppl=6.26, wps=382828, ups=0.88, wpb=434408, bsz=16309.8, num_updates=11100, lr=0.0006003, gnorm=0.265, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=10960 epoch 007: 1085 / 1689 loss=4.251, nll_loss=2.642, ppl=6.24, wps=455045, ups=1.05, wpb=432249, bsz=17043.7, num_updates=11200, lr=0.000597614, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=17.7, wall=11055 epoch 007: 1085 / 1689 loss=4.251, nll_loss=2.642, ppl=6.24, wps=455045, ups=1.05, wpb=432249, bsz=17043.7, num_updates=11200, lr=0.000597614, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=17.7, wall=11055 epoch 007: 1085 / 1689 loss=4.251, nll_loss=2.642, ppl=6.24, wps=455045, ups=1.05, wpb=432249, bsz=17043.7, num_updates=11200, lr=0.000597614, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=17.7, wall=11055 epoch 007: 1085 / 1689 loss=4.251, nll_loss=2.642, ppl=6.24, wps=455045, ups=1.05, wpb=432249, bsz=17043.7, num_updates=11200, lr=0.000597614, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=17.7, wall=11055 epoch 007: 1085 / 1689 loss=4.251, nll_loss=2.642, ppl=6.24, wps=455045, ups=1.05, wpb=432249, bsz=17043.7, num_updates=11200, lr=0.000597614, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=17.7, wall=11055 epoch 007: 1085 / 1689 loss=4.251, nll_loss=2.642, ppl=6.24, wps=455045, ups=1.05, wpb=432249, bsz=17043.7, num_updates=11200, lr=0.000597614, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=17.7, wall=11055 epoch 007: 1085 / 1689 loss=4.251, nll_loss=2.642, ppl=6.24, wps=455045, ups=1.05, wpb=432249, bsz=17043.7, num_updates=11200, lr=0.000597614, gnorm=0.253, clip=0, loss_scale=1, train_wall=93, gb_free=17.7, wall=11055 epoch 007: 1185 / 1689 loss=4.261, nll_loss=2.653, ppl=6.29, wps=457906, ups=1.05, wpb=435346, bsz=16712.2, num_updates=11300, lr=0.000594964, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=11150 epoch 007: 1185 / 1689 loss=4.261, nll_loss=2.653, ppl=6.29, wps=457906, ups=1.05, wpb=435346, bsz=16712.2, num_updates=11300, lr=0.000594964, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=11150 epoch 007: 1185 / 1689 loss=4.261, nll_loss=2.653, ppl=6.29, wps=457906, ups=1.05, wpb=435346, bsz=16712.2, num_updates=11300, lr=0.000594964, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=11150 epoch 007: 1185 / 1689 loss=4.261, nll_loss=2.653, ppl=6.29, wps=457906, ups=1.05, wpb=435346, bsz=16712.2, num_updates=11300, lr=0.000594964, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=11150 epoch 007: 1185 / 1689 loss=4.261, nll_loss=2.653, ppl=6.29, wps=457906, ups=1.05, wpb=435346, bsz=16712.2, num_updates=11300, lr=0.000594964, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=11150 epoch 007: 1185 / 1689 loss=4.261, nll_loss=2.653, ppl=6.29, wps=457906, ups=1.05, wpb=435346, bsz=16712.2, num_updates=11300, lr=0.000594964, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=11150 epoch 007: 1185 / 1689 loss=4.261, nll_loss=2.653, ppl=6.29, wps=457906, ups=1.05, wpb=435346, bsz=16712.2, num_updates=11300, lr=0.000594964, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=18.3, wall=11150 epoch 007: 1285 / 1689 loss=4.261, nll_loss=2.652, ppl=6.29, wps=460540, ups=1.06, wpb=435433, bsz=16388, num_updates=11400, lr=0.000592349, gnorm=0.242, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=11244 epoch 007: 1285 / 1689 loss=4.261, nll_loss=2.652, ppl=6.29, wps=460540, ups=1.06, wpb=435433, bsz=16388, num_updates=11400, lr=0.000592349, gnorm=0.242, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=11244 epoch 007: 1285 / 1689 loss=4.261, nll_loss=2.652, ppl=6.29, wps=460540, ups=1.06, wpb=435433, bsz=16388, num_updates=11400, lr=0.000592349, gnorm=0.242, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=11244 epoch 007: 1285 / 1689 loss=4.261, nll_loss=2.652, ppl=6.29, wps=460540, ups=1.06, wpb=435433, bsz=16388, num_updates=11400, lr=0.000592349, gnorm=0.242, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=11244 epoch 007: 1285 / 1689 loss=4.261, nll_loss=2.652, ppl=6.29, wps=460540, ups=1.06, wpb=435433, bsz=16388, num_updates=11400, lr=0.000592349, gnorm=0.242, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=11244 epoch 007: 1285 / 1689 loss=4.261, nll_loss=2.652, ppl=6.29, wps=460540, ups=1.06, wpb=435433, bsz=16388, num_updates=11400, lr=0.000592349, gnorm=0.242, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=11244 epoch 007: 1285 / 1689 loss=4.261, nll_loss=2.652, ppl=6.29, wps=460540, ups=1.06, wpb=435433, bsz=16388, num_updates=11400, lr=0.000592349, gnorm=0.242, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=11244 epoch 007: 1385 / 1689 loss=4.271, nll_loss=2.664, ppl=6.34, wps=459559, ups=1.06, wpb=433577, bsz=16396.2, num_updates=11500, lr=0.000589768, gnorm=0.246, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=11339 epoch 007: 1385 / 1689 loss=4.271, nll_loss=2.664, ppl=6.34, wps=459559, ups=1.06, wpb=433577, bsz=16396.2, num_updates=11500, lr=0.000589768, gnorm=0.246, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=11339 epoch 007: 1385 / 1689 loss=4.271, nll_loss=2.664, ppl=6.34, wps=459559, ups=1.06, wpb=433577, bsz=16396.2, num_updates=11500, lr=0.000589768, gnorm=0.246, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=11339 epoch 007: 1385 / 1689 loss=4.271, nll_loss=2.664, ppl=6.34, wps=459559, ups=1.06, wpb=433577, bsz=16396.2, num_updates=11500, lr=0.000589768, gnorm=0.246, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=11339 epoch 007: 1385 / 1689 loss=4.271, nll_loss=2.664, ppl=6.34, wps=459559, ups=1.06, wpb=433577, bsz=16396.2, num_updates=11500, lr=0.000589768, gnorm=0.246, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=11339 epoch 007: 1385 / 1689 loss=4.271, nll_loss=2.664, ppl=6.34, wps=459559, ups=1.06, wpb=433577, bsz=16396.2, num_updates=11500, lr=0.000589768, gnorm=0.246, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=11339 epoch 007: 1385 / 1689 loss=4.271, nll_loss=2.664, ppl=6.34, wps=459559, ups=1.06, wpb=433577, bsz=16396.2, num_updates=11500, lr=0.000589768, gnorm=0.246, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=11339 epoch 007: 1485 / 1689 loss=4.269, nll_loss=2.662, ppl=6.33, wps=461495, ups=1.06, wpb=434879, bsz=16616.1, num_updates=11600, lr=0.00058722, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=20.1, wall=11433 epoch 007: 1485 / 1689 loss=4.269, nll_loss=2.662, ppl=6.33, wps=461495, ups=1.06, wpb=434879, bsz=16616.1, num_updates=11600, lr=0.00058722, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=20.1, wall=11433 epoch 007: 1485 / 1689 loss=4.269, nll_loss=2.662, ppl=6.33, wps=461495, ups=1.06, wpb=434879, bsz=16616.1, num_updates=11600, lr=0.00058722, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=20.1, wall=11433 epoch 007: 1485 / 1689 loss=4.269, nll_loss=2.662, ppl=6.33, wps=461495, ups=1.06, wpb=434879, bsz=16616.1, num_updates=11600, lr=0.00058722, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=20.1, wall=11433 epoch 007: 1485 / 1689 loss=4.269, nll_loss=2.662, ppl=6.33, wps=461495, ups=1.06, wpb=434879, bsz=16616.1, num_updates=11600, lr=0.00058722, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=20.1, wall=11433 epoch 007: 1485 / 1689 loss=4.269, nll_loss=2.662, ppl=6.33, wps=461495, ups=1.06, wpb=434879, bsz=16616.1, num_updates=11600, lr=0.00058722, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=20.1, wall=11433 epoch 007: 1485 / 1689 loss=4.269, nll_loss=2.662, ppl=6.33, wps=461495, ups=1.06, wpb=434879, bsz=16616.1, num_updates=11600, lr=0.00058722, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=20.1, wall=11433 epoch 007: 1585 / 1689 loss=4.261, nll_loss=2.654, ppl=6.29, wps=454704, ups=1.05, wpb=433746, bsz=16915.8, num_updates=11700, lr=0.000584705, gnorm=0.254, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=11528 epoch 007: 1585 / 1689 loss=4.261, nll_loss=2.654, ppl=6.29, wps=454704, ups=1.05, wpb=433746, bsz=16915.8, num_updates=11700, lr=0.000584705, gnorm=0.254, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=11528 epoch 007: 1585 / 1689 loss=4.261, nll_loss=2.654, ppl=6.29, wps=454704, ups=1.05, wpb=433746, bsz=16915.8, num_updates=11700, lr=0.000584705, gnorm=0.254, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=11528 epoch 007: 1585 / 1689 loss=4.261, nll_loss=2.654, ppl=6.29, wps=454704, ups=1.05, wpb=433746, bsz=16915.8, num_updates=11700, lr=0.000584705, gnorm=0.254, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=11528 epoch 007: 1585 / 1689 loss=4.261, nll_loss=2.654, ppl=6.29, wps=454704, ups=1.05, wpb=433746, bsz=16915.8, num_updates=11700, lr=0.000584705, gnorm=0.254, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=11528 epoch 007: 1585 / 1689 loss=4.261, nll_loss=2.654, ppl=6.29, wps=454704, ups=1.05, wpb=433746, bsz=16915.8, num_updates=11700, lr=0.000584705, gnorm=0.254, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=11528 epoch 007: 1585 / 1689 loss=4.261, nll_loss=2.654, ppl=6.29, wps=454704, ups=1.05, wpb=433746, bsz=16915.8, num_updates=11700, lr=0.000584705, gnorm=0.254, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=11528 epoch 007: 1685 / 1689 loss=4.255, nll_loss=2.647, ppl=6.26, wps=454954, ups=1.05, wpb=431976, bsz=16493.3, num_updates=11800, lr=0.000582223, gnorm=0.231, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=11623 epoch 007: 1685 / 1689 loss=4.255, nll_loss=2.647, ppl=6.26, wps=454954, ups=1.05, wpb=431976, bsz=16493.3, num_updates=11800, lr=0.000582223, gnorm=0.231, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=11623 epoch 007: 1685 / 1689 loss=4.255, nll_loss=2.647, ppl=6.26, wps=454954, ups=1.05, wpb=431976, bsz=16493.3, num_updates=11800, lr=0.000582223, gnorm=0.231, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=11623 epoch 007: 1685 / 1689 loss=4.255, nll_loss=2.647, ppl=6.26, wps=454954, ups=1.05, wpb=431976, bsz=16493.3, num_updates=11800, lr=0.000582223, gnorm=0.231, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=11623 epoch 007: 1685 / 1689 loss=4.255, nll_loss=2.647, ppl=6.26, wps=454954, ups=1.05, wpb=431976, bsz=16493.3, num_updates=11800, lr=0.000582223, gnorm=0.231, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=11623 epoch 007: 1685 / 1689 loss=4.255, nll_loss=2.647, ppl=6.26, wps=454954, ups=1.05, wpb=431976, bsz=16493.3, num_updates=11800, lr=0.000582223, gnorm=0.231, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=11623 epoch 007: 1685 / 1689 loss=4.255, nll_loss=2.647, ppl=6.26, wps=454954, ups=1.05, wpb=431976, bsz=16493.3, num_updates=11800, lr=0.000582223, gnorm=0.231, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=11623 end of epoch 7 (average epoch stats below) epoch 007 | loss 4.259 | nll_loss 2.65 | ppl 6.28 | wps 451538 | ups 1.04 | wpb 433532 | bsz 16502.2 | num_updates 11804 | lr 0.000582124 | gnorm 0.245 | clip 0 | loss_scale 2 | train_wall 1568 | gb_free 19.6 | wall 11626 epoch 007 | loss 4.259 | nll_loss 2.65 | ppl 6.28 | wps 451538 | ups 1.04 | wpb 433532 | bsz 16502.2 | num_updates 11804 | lr 0.000582124 | gnorm 0.245 | clip 0 | loss_scale 2 | train_wall 1568 | gb_free 19.6 | wall 11626 epoch 007 | loss 4.259 | nll_loss 2.65 | ppl 6.28 | wps 451538 | ups 1.04 | wpb 433532 | bsz 16502.2 | num_updates 11804 | lr 0.000582124 | gnorm 0.245 | clip 0 | loss_scale 2 | train_wall 1568 | gb_free 19.6 | wall 11626 epoch 007 | loss 4.259 | nll_loss 2.65 | ppl 6.28 | wps 451538 | ups 1.04 | wpb 433532 | bsz 16502.2 | num_updates 11804 | lr 0.000582124 | gnorm 0.245 | clip 0 | loss_scale 2 | train_wall 1568 | gb_free 19.6 | wall 11626 epoch 007 | loss 4.259 | nll_loss 2.65 | ppl 6.28 | wps 451538 | ups 1.04 | wpb 433532 | bsz 16502.2 | num_updates 11804 | lr 0.000582124 | gnorm 0.245 | clip 0 | loss_scale 2 | train_wall 1568 | gb_free 19.6 | wall 11626 epoch 007 | loss 4.259 | nll_loss 2.65 | ppl 6.28 | wps 451538 | ups 1.04 | wpb 433532 | bsz 16502.2 | num_updates 11804 | lr 0.000582124 | gnorm 0.245 | clip 0 | loss_scale 2 | train_wall 1568 | gb_free 19.6 | wall 11626 epoch 007 | loss 4.259 | nll_loss 2.65 | ppl 6.28 | wps 451538 | ups 1.04 | wpb 433532 | bsz 16502.2 | num_updates 11804 | lr 0.000582124 | gnorm 0.245 | clip 0 | loss_scale 2 | train_wall 1568 | gb_free 19.6 | wall 11626 Start iterating over samples epoch 008: 96 / 1689 loss=4.199, nll_loss=2.582, ppl=5.99, wps=451866, ups=1.05, wpb=430592, bsz=16555.9, num_updates=11900, lr=0.000579771, gnorm=0.243, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=11719 epoch 008: 96 / 1689 loss=4.199, nll_loss=2.582, ppl=5.99, wps=451866, ups=1.05, wpb=430592, bsz=16555.9, num_updates=11900, lr=0.000579771, gnorm=0.243, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=11719 epoch 008: 96 / 1689 loss=4.199, nll_loss=2.582, ppl=5.99, wps=451866, ups=1.05, wpb=430592, bsz=16555.9, num_updates=11900, lr=0.000579771, gnorm=0.243, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=11719 epoch 008: 96 / 1689 loss=4.199, nll_loss=2.582, ppl=5.99, wps=451866, ups=1.05, wpb=430592, bsz=16555.9, num_updates=11900, lr=0.000579771, gnorm=0.243, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=11719 epoch 008: 96 / 1689 loss=4.199, nll_loss=2.582, ppl=5.99, wps=451866, ups=1.05, wpb=430592, bsz=16555.9, num_updates=11900, lr=0.000579771, gnorm=0.243, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=11719 epoch 008: 96 / 1689 loss=4.199, nll_loss=2.582, ppl=5.99, wps=451866, ups=1.05, wpb=430592, bsz=16555.9, num_updates=11900, lr=0.000579771, gnorm=0.243, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=11719 epoch 008: 96 / 1689 loss=4.199, nll_loss=2.582, ppl=5.99, wps=451866, ups=1.05, wpb=430592, bsz=16555.9, num_updates=11900, lr=0.000579771, gnorm=0.243, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=11719 epoch 008: 96 / 1689 loss=4.199, nll_loss=2.582, ppl=5.99, wps=451866, ups=1.05, wpb=430592, bsz=16555.9, num_updates=11900, lr=0.000579771, gnorm=0.243, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=11719 epoch 008: 197 / 1689 loss=4.216, nll_loss=2.601, ppl=6.07, wps=456028, ups=1.05, wpb=434361, bsz=16555.2, num_updates=12000, lr=0.00057735, gnorm=0.225, clip=0, loss_scale=2, train_wall=93, gb_free=20, wall=11814 epoch 008: 197 / 1689 loss=4.216, nll_loss=2.601, ppl=6.07, wps=456028, ups=1.05, wpb=434361, bsz=16555.2, num_updates=12000, lr=0.00057735, gnorm=0.225, clip=0, loss_scale=2, train_wall=93, gb_free=20, wall=11814 epoch 008: 197 / 1689 loss=4.216, nll_loss=2.601, ppl=6.07, wps=456028, ups=1.05, wpb=434361, bsz=16555.2, num_updates=12000, lr=0.00057735, gnorm=0.225, clip=0, loss_scale=2, train_wall=93, gb_free=20, wall=11814 epoch 008: 197 / 1689 loss=4.216, nll_loss=2.601, ppl=6.07, wps=456028, ups=1.05, wpb=434361, bsz=16555.2, num_updates=12000, lr=0.00057735, gnorm=0.225, clip=0, loss_scale=2, train_wall=93, gb_free=20, wall=11814 epoch 008: 197 / 1689 loss=4.216, nll_loss=2.601, ppl=6.07, wps=456028, ups=1.05, wpb=434361, bsz=16555.2, num_updates=12000, lr=0.00057735, gnorm=0.225, clip=0, loss_scale=2, train_wall=93, gb_free=20, wall=11814 epoch 008: 197 / 1689 loss=4.216, nll_loss=2.601, ppl=6.07, wps=456028, ups=1.05, wpb=434361, bsz=16555.2, num_updates=12000, lr=0.00057735, gnorm=0.225, clip=0, loss_scale=2, train_wall=93, gb_free=20, wall=11814 epoch 008: 197 / 1689 loss=4.216, nll_loss=2.601, ppl=6.07, wps=456028, ups=1.05, wpb=434361, bsz=16555.2, num_updates=12000, lr=0.00057735, gnorm=0.225, clip=0, loss_scale=2, train_wall=93, gb_free=20, wall=11814 epoch 008: 197 / 1689 loss=4.216, nll_loss=2.601, ppl=6.07, wps=456028, ups=1.05, wpb=434361, bsz=16555.2, num_updates=12000, lr=0.00057735, gnorm=0.225, clip=0, loss_scale=2, train_wall=93, gb_free=20, wall=11814 begin validation on "valid" subset epoch 008 | valid on 'valid' subset | loss 4.336 | nll_loss 2.721 | ppl 6.59 | wps 0 | wpb 42662 | bsz 2032 | num_updates 12000 | best_loss 4.336 epoch 008 | valid on 'valid' subset | loss 4.336 | nll_loss 2.721 | ppl 6.59 | wps 0 | wpb 42662 | bsz 2032 | num_updates 12000 | best_loss 4.336 epoch 008 | valid on 'valid' subset | loss 4.336 | nll_loss 2.721 | ppl 6.59 | wps 0 | wpb 42662 | bsz 2032 | num_updates 12000 | best_loss 4.336 epoch 008 | valid on 'valid' subset | loss 4.336 | nll_loss 2.721 | ppl 6.59 | wps 0 | wpb 42662 | bsz 2032 | num_updates 12000 | best_loss 4.336 epoch 008 | valid on 'valid' subset | loss 4.336 | nll_loss 2.721 | ppl 6.59 | wps 0 | wpb 42662 | bsz 2032 | num_updates 12000 | best_loss 4.336 epoch 008 | valid on 'valid' subset | loss 4.336 | nll_loss 2.721 | ppl 6.59 | wps 0 | wpb 42662 | bsz 2032 | num_updates 12000 | best_loss 4.336 epoch 008 | valid on 'valid' subset | loss 4.336 | nll_loss 2.721 | ppl 6.59 | wps 0 | wpb 42662 | bsz 2032 | num_updates 12000 | best_loss 4.336 epoch 008 | valid on 'valid' subset | loss 4.336 | nll_loss 2.721 | ppl 6.59 | wps 0 | wpb 42662 | bsz 2032 | num_updates 12000 | best_loss 4.336 epoch 008: 297 / 1689 loss=4.219, nll_loss=2.605, ppl=6.08, wps=380623, ups=0.88, wpb=434007, bsz=16606.2, num_updates=12100, lr=0.00057496, gnorm=0.242, clip=0, loss_scale=2, train_wall=92, gb_free=20, wall=11928 epoch 008: 297 / 1689 loss=4.219, nll_loss=2.605, ppl=6.08, wps=380623, ups=0.88, wpb=434007, bsz=16606.2, num_updates=12100, lr=0.00057496, gnorm=0.242, clip=0, loss_scale=2, train_wall=92, gb_free=20, wall=11928 epoch 008: 297 / 1689 loss=4.219, nll_loss=2.605, ppl=6.08, wps=380623, ups=0.88, wpb=434007, bsz=16606.2, num_updates=12100, lr=0.00057496, gnorm=0.242, clip=0, loss_scale=2, train_wall=92, gb_free=20, wall=11928 epoch 008: 297 / 1689 loss=4.219, nll_loss=2.605, ppl=6.08, wps=380623, ups=0.88, wpb=434007, bsz=16606.2, num_updates=12100, lr=0.00057496, gnorm=0.242, clip=0, loss_scale=2, train_wall=92, gb_free=20, wall=11928 epoch 008: 297 / 1689 loss=4.219, nll_loss=2.605, ppl=6.08, wps=380623, ups=0.88, wpb=434007, bsz=16606.2, num_updates=12100, lr=0.00057496, gnorm=0.242, clip=0, loss_scale=2, train_wall=92, gb_free=20, wall=11928 epoch 008: 297 / 1689 loss=4.219, nll_loss=2.605, ppl=6.08, wps=380623, ups=0.88, wpb=434007, bsz=16606.2, num_updates=12100, lr=0.00057496, gnorm=0.242, clip=0, loss_scale=2, train_wall=92, gb_free=20, wall=11928 epoch 008: 297 / 1689 loss=4.219, nll_loss=2.605, ppl=6.08, wps=380623, ups=0.88, wpb=434007, bsz=16606.2, num_updates=12100, lr=0.00057496, gnorm=0.242, clip=0, loss_scale=2, train_wall=92, gb_free=20, wall=11928 epoch 008: 297 / 1689 loss=4.219, nll_loss=2.605, ppl=6.08, wps=380623, ups=0.88, wpb=434007, bsz=16606.2, num_updates=12100, lr=0.00057496, gnorm=0.242, clip=0, loss_scale=2, train_wall=92, gb_free=20, wall=11928 epoch 008: 397 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=461609, ups=1.06, wpb=434077, bsz=16623.3, num_updates=12200, lr=0.000572598, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=12022 epoch 008: 397 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=461609, ups=1.06, wpb=434077, bsz=16623.3, num_updates=12200, lr=0.000572598, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=12022 epoch 008: 397 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=461609, ups=1.06, wpb=434077, bsz=16623.3, num_updates=12200, lr=0.000572598, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=12022 epoch 008: 397 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=461609, ups=1.06, wpb=434077, bsz=16623.3, num_updates=12200, lr=0.000572598, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=12022 epoch 008: 397 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=461609, ups=1.06, wpb=434077, bsz=16623.3, num_updates=12200, lr=0.000572598, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=12022 epoch 008: 397 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=461609, ups=1.06, wpb=434077, bsz=16623.3, num_updates=12200, lr=0.000572598, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=12022 epoch 008: 397 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=461609, ups=1.06, wpb=434077, bsz=16623.3, num_updates=12200, lr=0.000572598, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=12022 epoch 008: 397 / 1689 loss=4.229, nll_loss=2.616, ppl=6.13, wps=461609, ups=1.06, wpb=434077, bsz=16623.3, num_updates=12200, lr=0.000572598, gnorm=0.24, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=12022 epoch 008: 499 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=449475, ups=1.04, wpb=431517, bsz=16093, num_updates=12300, lr=0.000570266, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=95, gb_free=20.5, wall=12118 epoch 008: 499 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=449475, ups=1.04, wpb=431517, bsz=16093, num_updates=12300, lr=0.000570266, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=95, gb_free=20.5, wall=12118 epoch 008: 499 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=449475, ups=1.04, wpb=431517, bsz=16093, num_updates=12300, lr=0.000570266, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=95, gb_free=20.5, wall=12118 epoch 008: 499 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=449475, ups=1.04, wpb=431517, bsz=16093, num_updates=12300, lr=0.000570266, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=95, gb_free=20.5, wall=12118 epoch 008: 499 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=449475, ups=1.04, wpb=431517, bsz=16093, num_updates=12300, lr=0.000570266, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=95, gb_free=20.5, wall=12118 epoch 008: 499 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=449475, ups=1.04, wpb=431517, bsz=16093, num_updates=12300, lr=0.000570266, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=95, gb_free=20.5, wall=12118 epoch 008: 499 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=449475, ups=1.04, wpb=431517, bsz=16093, num_updates=12300, lr=0.000570266, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=95, gb_free=20.5, wall=12118 epoch 008: 499 / 1689 loss=4.208, nll_loss=2.593, ppl=6.03, wps=449475, ups=1.04, wpb=431517, bsz=16093, num_updates=12300, lr=0.000570266, gnorm=0.245, clip=0, loss_scale=0.5, train_wall=95, gb_free=20.5, wall=12118 epoch 008: 599 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=458486, ups=1.06, wpb=430508, bsz=16433.6, num_updates=12400, lr=0.000567962, gnorm=0.251, clip=0, loss_scale=0.5, train_wall=92, gb_free=17.9, wall=12212 epoch 008: 599 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=458486, ups=1.06, wpb=430508, bsz=16433.6, num_updates=12400, lr=0.000567962, gnorm=0.251, clip=0, loss_scale=0.5, train_wall=92, gb_free=17.9, wall=12212 epoch 008: 599 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=458486, ups=1.06, wpb=430508, bsz=16433.6, num_updates=12400, lr=0.000567962, gnorm=0.251, clip=0, loss_scale=0.5, train_wall=92, gb_free=17.9, wall=12212 epoch 008: 599 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=458486, ups=1.06, wpb=430508, bsz=16433.6, num_updates=12400, lr=0.000567962, gnorm=0.251, clip=0, loss_scale=0.5, train_wall=92, gb_free=17.9, wall=12212 epoch 008: 599 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=458486, ups=1.06, wpb=430508, bsz=16433.6, num_updates=12400, lr=0.000567962, gnorm=0.251, clip=0, loss_scale=0.5, train_wall=92, gb_free=17.9, wall=12212 epoch 008: 599 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=458486, ups=1.06, wpb=430508, bsz=16433.6, num_updates=12400, lr=0.000567962, gnorm=0.251, clip=0, loss_scale=0.5, train_wall=92, gb_free=17.9, wall=12212 epoch 008: 599 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=458486, ups=1.06, wpb=430508, bsz=16433.6, num_updates=12400, lr=0.000567962, gnorm=0.251, clip=0, loss_scale=0.5, train_wall=92, gb_free=17.9, wall=12212 epoch 008: 599 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=458486, ups=1.06, wpb=430508, bsz=16433.6, num_updates=12400, lr=0.000567962, gnorm=0.251, clip=0, loss_scale=0.5, train_wall=92, gb_free=17.9, wall=12212 epoch 008: 699 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=452968, ups=1.05, wpb=432956, bsz=16626.4, num_updates=12500, lr=0.000565685, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.3, wall=12307 epoch 008: 699 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=452968, ups=1.05, wpb=432956, bsz=16626.4, num_updates=12500, lr=0.000565685, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.3, wall=12307 epoch 008: 699 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=452968, ups=1.05, wpb=432956, bsz=16626.4, num_updates=12500, lr=0.000565685, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.3, wall=12307 epoch 008: 699 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=452968, ups=1.05, wpb=432956, bsz=16626.4, num_updates=12500, lr=0.000565685, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.3, wall=12307 epoch 008: 699 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=452968, ups=1.05, wpb=432956, bsz=16626.4, num_updates=12500, lr=0.000565685, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.3, wall=12307 epoch 008: 699 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=452968, ups=1.05, wpb=432956, bsz=16626.4, num_updates=12500, lr=0.000565685, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.3, wall=12307 epoch 008: 699 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=452968, ups=1.05, wpb=432956, bsz=16626.4, num_updates=12500, lr=0.000565685, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.3, wall=12307 epoch 008: 699 / 1689 loss=4.212, nll_loss=2.597, ppl=6.05, wps=452968, ups=1.05, wpb=432956, bsz=16626.4, num_updates=12500, lr=0.000565685, gnorm=0.238, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.3, wall=12307 epoch 008: 799 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=459216, ups=1.06, wpb=434608, bsz=16338.8, num_updates=12600, lr=0.000563436, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=12402 epoch 008: 799 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=459216, ups=1.06, wpb=434608, bsz=16338.8, num_updates=12600, lr=0.000563436, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=12402 epoch 008: 799 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=459216, ups=1.06, wpb=434608, bsz=16338.8, num_updates=12600, lr=0.000563436, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=12402 epoch 008: 799 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=459216, ups=1.06, wpb=434608, bsz=16338.8, num_updates=12600, lr=0.000563436, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=12402 epoch 008: 799 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=459216, ups=1.06, wpb=434608, bsz=16338.8, num_updates=12600, lr=0.000563436, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=12402 epoch 008: 799 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=459216, ups=1.06, wpb=434608, bsz=16338.8, num_updates=12600, lr=0.000563436, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=12402 epoch 008: 799 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=459216, ups=1.06, wpb=434608, bsz=16338.8, num_updates=12600, lr=0.000563436, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=12402 epoch 008: 799 / 1689 loss=4.223, nll_loss=2.609, ppl=6.1, wps=459216, ups=1.06, wpb=434608, bsz=16338.8, num_updates=12600, lr=0.000563436, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=12402 epoch 008: 899 / 1689 loss=4.235, nll_loss=2.624, ppl=6.16, wps=460614, ups=1.06, wpb=434798, bsz=16583, num_updates=12700, lr=0.000561214, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=12496 epoch 008: 899 / 1689 loss=4.235, nll_loss=2.624, ppl=6.16, wps=460614, ups=1.06, wpb=434798, bsz=16583, num_updates=12700, lr=0.000561214, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=12496 epoch 008: 899 / 1689 loss=4.235, nll_loss=2.624, ppl=6.16, wps=460614, ups=1.06, wpb=434798, bsz=16583, num_updates=12700, lr=0.000561214, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=12496 epoch 008: 899 / 1689 loss=4.235, nll_loss=2.624, ppl=6.16, wps=460614, ups=1.06, wpb=434798, bsz=16583, num_updates=12700, lr=0.000561214, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=12496 epoch 008: 899 / 1689 loss=4.235, nll_loss=2.624, ppl=6.16, wps=460614, ups=1.06, wpb=434798, bsz=16583, num_updates=12700, lr=0.000561214, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=12496 epoch 008: 899 / 1689 loss=4.235, nll_loss=2.624, ppl=6.16, wps=460614, ups=1.06, wpb=434798, bsz=16583, num_updates=12700, lr=0.000561214, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=12496 epoch 008: 899 / 1689 loss=4.235, nll_loss=2.624, ppl=6.16, wps=460614, ups=1.06, wpb=434798, bsz=16583, num_updates=12700, lr=0.000561214, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=12496 epoch 008: 899 / 1689 loss=4.235, nll_loss=2.624, ppl=6.16, wps=460614, ups=1.06, wpb=434798, bsz=16583, num_updates=12700, lr=0.000561214, gnorm=0.24, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=12496 epoch 008: 999 / 1689 loss=4.236, nll_loss=2.625, ppl=6.17, wps=462845, ups=1.06, wpb=436695, bsz=16335.3, num_updates=12800, lr=0.000559017, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=12591 epoch 008: 999 / 1689 loss=4.236, nll_loss=2.625, ppl=6.17, wps=462845, ups=1.06, wpb=436695, bsz=16335.3, num_updates=12800, lr=0.000559017, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=12591 epoch 008: 999 / 1689 loss=4.236, nll_loss=2.625, ppl=6.17, wps=462845, ups=1.06, wpb=436695, bsz=16335.3, num_updates=12800, lr=0.000559017, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=12591 epoch 008: 999 / 1689 loss=4.236, nll_loss=2.625, ppl=6.17, wps=462845, ups=1.06, wpb=436695, bsz=16335.3, num_updates=12800, lr=0.000559017, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=12591 epoch 008: 999 / 1689 loss=4.236, nll_loss=2.625, ppl=6.17, wps=462845, ups=1.06, wpb=436695, bsz=16335.3, num_updates=12800, lr=0.000559017, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=12591 epoch 008: 999 / 1689 loss=4.236, nll_loss=2.625, ppl=6.17, wps=462845, ups=1.06, wpb=436695, bsz=16335.3, num_updates=12800, lr=0.000559017, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=12591 epoch 008: 999 / 1689 loss=4.236, nll_loss=2.625, ppl=6.17, wps=462845, ups=1.06, wpb=436695, bsz=16335.3, num_updates=12800, lr=0.000559017, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=12591 epoch 008: 999 / 1689 loss=4.236, nll_loss=2.625, ppl=6.17, wps=462845, ups=1.06, wpb=436695, bsz=16335.3, num_updates=12800, lr=0.000559017, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=12591 epoch 008: 1099 / 1689 loss=4.22, nll_loss=2.607, ppl=6.09, wps=463778, ups=1.07, wpb=434267, bsz=16446.5, num_updates=12900, lr=0.000556846, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=12684 epoch 008: 1099 / 1689 loss=4.22, nll_loss=2.607, ppl=6.09, wps=463778, ups=1.07, wpb=434267, bsz=16446.5, num_updates=12900, lr=0.000556846, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=12684 epoch 008: 1099 / 1689 loss=4.22, nll_loss=2.607, ppl=6.09, wps=463778, ups=1.07, wpb=434267, bsz=16446.5, num_updates=12900, lr=0.000556846, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=12684 epoch 008: 1099 / 1689 loss=4.22, nll_loss=2.607, ppl=6.09, wps=463778, ups=1.07, wpb=434267, bsz=16446.5, num_updates=12900, lr=0.000556846, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=12684 epoch 008: 1099 / 1689 loss=4.22, nll_loss=2.607, ppl=6.09, wps=463778, ups=1.07, wpb=434267, bsz=16446.5, num_updates=12900, lr=0.000556846, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=12684 epoch 008: 1099 / 1689 loss=4.22, nll_loss=2.607, ppl=6.09, wps=463778, ups=1.07, wpb=434267, bsz=16446.5, num_updates=12900, lr=0.000556846, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=12684 epoch 008: 1099 / 1689 loss=4.22, nll_loss=2.607, ppl=6.09, wps=463778, ups=1.07, wpb=434267, bsz=16446.5, num_updates=12900, lr=0.000556846, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=12684 epoch 008: 1099 / 1689 loss=4.22, nll_loss=2.607, ppl=6.09, wps=463778, ups=1.07, wpb=434267, bsz=16446.5, num_updates=12900, lr=0.000556846, gnorm=0.24, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=12684 epoch 008: 1199 / 1689 loss=4.219, nll_loss=2.607, ppl=6.09, wps=462233, ups=1.07, wpb=433860, bsz=16255.4, num_updates=13000, lr=0.0005547, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=12778 epoch 008: 1199 / 1689 loss=4.219, nll_loss=2.607, ppl=6.09, wps=462233, ups=1.07, wpb=433860, bsz=16255.4, num_updates=13000, lr=0.0005547, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=12778 epoch 008: 1199 / 1689 loss=4.219, nll_loss=2.607, ppl=6.09, wps=462233, ups=1.07, wpb=433860, bsz=16255.4, num_updates=13000, lr=0.0005547, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=12778 epoch 008: 1199 / 1689 loss=4.219, nll_loss=2.607, ppl=6.09, wps=462233, ups=1.07, wpb=433860, bsz=16255.4, num_updates=13000, lr=0.0005547, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=12778 epoch 008: 1199 / 1689 loss=4.219, nll_loss=2.607, ppl=6.09, wps=462233, ups=1.07, wpb=433860, bsz=16255.4, num_updates=13000, lr=0.0005547, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=12778 epoch 008: 1199 / 1689 loss=4.219, nll_loss=2.607, ppl=6.09, wps=462233, ups=1.07, wpb=433860, bsz=16255.4, num_updates=13000, lr=0.0005547, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=12778 epoch 008: 1199 / 1689 loss=4.219, nll_loss=2.607, ppl=6.09, wps=462233, ups=1.07, wpb=433860, bsz=16255.4, num_updates=13000, lr=0.0005547, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=12778 epoch 008: 1199 / 1689 loss=4.219, nll_loss=2.607, ppl=6.09, wps=462233, ups=1.07, wpb=433860, bsz=16255.4, num_updates=13000, lr=0.0005547, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=12778 begin validation on "valid" subset epoch 008 | valid on 'valid' subset | loss 4.325 | nll_loss 2.712 | ppl 6.55 | wps 0 | wpb 42662 | bsz 2032 | num_updates 13000 | best_loss 4.325 epoch 008 | valid on 'valid' subset | loss 4.325 | nll_loss 2.712 | ppl 6.55 | wps 0 | wpb 42662 | bsz 2032 | num_updates 13000 | best_loss 4.325 epoch 008 | valid on 'valid' subset | loss 4.325 | nll_loss 2.712 | ppl 6.55 | wps 0 | wpb 42662 | bsz 2032 | num_updates 13000 | best_loss 4.325 epoch 008 | valid on 'valid' subset | loss 4.325 | nll_loss 2.712 | ppl 6.55 | wps 0 | wpb 42662 | bsz 2032 | num_updates 13000 | best_loss 4.325 epoch 008 | valid on 'valid' subset | loss 4.325 | nll_loss 2.712 | ppl 6.55 | wps 0 | wpb 42662 | bsz 2032 | num_updates 13000 | best_loss 4.325 epoch 008 | valid on 'valid' subset | loss 4.325 | nll_loss 2.712 | ppl 6.55 | wps 0 | wpb 42662 | bsz 2032 | num_updates 13000 | best_loss 4.325 epoch 008 | valid on 'valid' subset | loss 4.325 | nll_loss 2.712 | ppl 6.55 | wps 0 | wpb 42662 | bsz 2032 | num_updates 13000 | best_loss 4.325 epoch 008 | valid on 'valid' subset | loss 4.325 | nll_loss 2.712 | ppl 6.55 | wps 0 | wpb 42662 | bsz 2032 | num_updates 13000 | best_loss 4.325 epoch 008: 1299 / 1689 loss=4.226, nll_loss=2.615, ppl=6.12, wps=115915, ups=0.27, wpb=434826, bsz=16768.1, num_updates=13100, lr=0.000552579, gnorm=0.23, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=13153 epoch 008: 1299 / 1689 loss=4.226, nll_loss=2.615, ppl=6.12, wps=115915, ups=0.27, wpb=434826, bsz=16768.1, num_updates=13100, lr=0.000552579, gnorm=0.23, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=13153 epoch 008: 1299 / 1689 loss=4.226, nll_loss=2.615, ppl=6.12, wps=115915, ups=0.27, wpb=434826, bsz=16768.1, num_updates=13100, lr=0.000552579, gnorm=0.23, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=13153 epoch 008: 1299 / 1689 loss=4.226, nll_loss=2.615, ppl=6.12, wps=115915, ups=0.27, wpb=434826, bsz=16768.1, num_updates=13100, lr=0.000552579, gnorm=0.23, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=13153 epoch 008: 1299 / 1689 loss=4.226, nll_loss=2.615, ppl=6.12, wps=115915, ups=0.27, wpb=434826, bsz=16768.1, num_updates=13100, lr=0.000552579, gnorm=0.23, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=13153 epoch 008: 1299 / 1689 loss=4.226, nll_loss=2.615, ppl=6.12, wps=115915, ups=0.27, wpb=434826, bsz=16768.1, num_updates=13100, lr=0.000552579, gnorm=0.23, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=13153 epoch 008: 1299 / 1689 loss=4.226, nll_loss=2.615, ppl=6.12, wps=115915, ups=0.27, wpb=434826, bsz=16768.1, num_updates=13100, lr=0.000552579, gnorm=0.23, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=13153 epoch 008: 1299 / 1689 loss=4.226, nll_loss=2.615, ppl=6.12, wps=115915, ups=0.27, wpb=434826, bsz=16768.1, num_updates=13100, lr=0.000552579, gnorm=0.23, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=13153 epoch 008: 1399 / 1689 loss=4.225, nll_loss=2.613, ppl=6.12, wps=465134, ups=1.08, wpb=432574, bsz=16717.1, num_updates=13200, lr=0.000550482, gnorm=0.234, clip=0, loss_scale=1, train_wall=93, gb_free=20, wall=13246 epoch 008: 1399 / 1689 loss=4.225, nll_loss=2.613, ppl=6.12, wps=465134, ups=1.08, wpb=432574, bsz=16717.1, num_updates=13200, lr=0.000550482, gnorm=0.234, clip=0, loss_scale=1, train_wall=93, gb_free=20, wall=13246 epoch 008: 1399 / 1689 loss=4.225, nll_loss=2.613, ppl=6.12, wps=465134, ups=1.08, wpb=432574, bsz=16717.1, num_updates=13200, lr=0.000550482, gnorm=0.234, clip=0, loss_scale=1, train_wall=93, gb_free=20, wall=13246 epoch 008: 1399 / 1689 loss=4.225, nll_loss=2.613, ppl=6.12, wps=465134, ups=1.08, wpb=432574, bsz=16717.1, num_updates=13200, lr=0.000550482, gnorm=0.234, clip=0, loss_scale=1, train_wall=93, gb_free=20, wall=13246 epoch 008: 1399 / 1689 loss=4.225, nll_loss=2.613, ppl=6.12, wps=465134, ups=1.08, wpb=432574, bsz=16717.1, num_updates=13200, lr=0.000550482, gnorm=0.234, clip=0, loss_scale=1, train_wall=93, gb_free=20, wall=13246 epoch 008: 1399 / 1689 loss=4.225, nll_loss=2.613, ppl=6.12, wps=465134, ups=1.08, wpb=432574, bsz=16717.1, num_updates=13200, lr=0.000550482, gnorm=0.234, clip=0, loss_scale=1, train_wall=93, gb_free=20, wall=13246 epoch 008: 1399 / 1689 loss=4.225, nll_loss=2.613, ppl=6.12, wps=465134, ups=1.08, wpb=432574, bsz=16717.1, num_updates=13200, lr=0.000550482, gnorm=0.234, clip=0, loss_scale=1, train_wall=93, gb_free=20, wall=13246 epoch 008: 1399 / 1689 loss=4.225, nll_loss=2.613, ppl=6.12, wps=465134, ups=1.08, wpb=432574, bsz=16717.1, num_updates=13200, lr=0.000550482, gnorm=0.234, clip=0, loss_scale=1, train_wall=93, gb_free=20, wall=13246 epoch 008: 1499 / 1689 loss=4.218, nll_loss=2.605, ppl=6.08, wps=469399, ups=1.08, wpb=434046, bsz=16455.8, num_updates=13300, lr=0.000548408, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=19.5, wall=13339 epoch 008: 1499 / 1689 loss=4.218, nll_loss=2.605, ppl=6.08, wps=469399, ups=1.08, wpb=434046, bsz=16455.8, num_updates=13300, lr=0.000548408, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=19.5, wall=13339 epoch 008: 1499 / 1689 loss=4.218, nll_loss=2.605, ppl=6.08, wps=469399, ups=1.08, wpb=434046, bsz=16455.8, num_updates=13300, lr=0.000548408, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=19.5, wall=13339 epoch 008: 1499 / 1689 loss=4.218, nll_loss=2.605, ppl=6.08, wps=469399, ups=1.08, wpb=434046, bsz=16455.8, num_updates=13300, lr=0.000548408, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=19.5, wall=13339 epoch 008: 1499 / 1689 loss=4.218, nll_loss=2.605, ppl=6.08, wps=469399, ups=1.08, wpb=434046, bsz=16455.8, num_updates=13300, lr=0.000548408, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=19.5, wall=13339 epoch 008: 1499 / 1689 loss=4.218, nll_loss=2.605, ppl=6.08, wps=469399, ups=1.08, wpb=434046, bsz=16455.8, num_updates=13300, lr=0.000548408, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=19.5, wall=13339 epoch 008: 1499 / 1689 loss=4.218, nll_loss=2.605, ppl=6.08, wps=469399, ups=1.08, wpb=434046, bsz=16455.8, num_updates=13300, lr=0.000548408, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=19.5, wall=13339 epoch 008: 1499 / 1689 loss=4.218, nll_loss=2.605, ppl=6.08, wps=469399, ups=1.08, wpb=434046, bsz=16455.8, num_updates=13300, lr=0.000548408, gnorm=0.235, clip=0, loss_scale=2, train_wall=92, gb_free=19.5, wall=13339 epoch 008: 1599 / 1689 loss=4.219, nll_loss=2.607, ppl=6.09, wps=467357, ups=1.08, wpb=432191, bsz=16599.3, num_updates=13400, lr=0.000546358, gnorm=0.245, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=13431 epoch 008: 1599 / 1689 loss=4.219, nll_loss=2.607, ppl=6.09, wps=467357, ups=1.08, wpb=432191, bsz=16599.3, num_updates=13400, lr=0.000546358, gnorm=0.245, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=13431 epoch 008: 1599 / 1689 loss=4.219, nll_loss=2.607, ppl=6.09, wps=467357, ups=1.08, wpb=432191, bsz=16599.3, num_updates=13400, lr=0.000546358, gnorm=0.245, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=13431 epoch 008: 1599 / 1689 loss=4.219, nll_loss=2.607, ppl=6.09, wps=467357, ups=1.08, wpb=432191, bsz=16599.3, num_updates=13400, lr=0.000546358, gnorm=0.245, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=13431 epoch 008: 1599 / 1689 loss=4.219, nll_loss=2.607, ppl=6.09, wps=467357, ups=1.08, wpb=432191, bsz=16599.3, num_updates=13400, lr=0.000546358, gnorm=0.245, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=13431 epoch 008: 1599 / 1689 loss=4.219, nll_loss=2.607, ppl=6.09, wps=467357, ups=1.08, wpb=432191, bsz=16599.3, num_updates=13400, lr=0.000546358, gnorm=0.245, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=13431 epoch 008: 1599 / 1689 loss=4.219, nll_loss=2.607, ppl=6.09, wps=467357, ups=1.08, wpb=432191, bsz=16599.3, num_updates=13400, lr=0.000546358, gnorm=0.245, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=13431 epoch 008: 1599 / 1689 loss=4.219, nll_loss=2.607, ppl=6.09, wps=467357, ups=1.08, wpb=432191, bsz=16599.3, num_updates=13400, lr=0.000546358, gnorm=0.245, clip=0, loss_scale=2, train_wall=92, gb_free=18.9, wall=13431 end of epoch 8 (average epoch stats below) epoch 008 | loss 4.22 | nll_loss 2.607 | ppl 6.09 | wps 386875 | ups 0.89 | wpb 433550 | bsz 16509.2 | num_updates 13490 | lr 0.000544533 | gnorm 0.237 | clip 0 | loss_scale 2 | train_wall 1566 | gb_free 20.2 | wall 13516 epoch 008 | loss 4.22 | nll_loss 2.607 | ppl 6.09 | wps 386875 | ups 0.89 | wpb 433550 | bsz 16509.2 | num_updates 13490 | lr 0.000544533 | gnorm 0.237 | clip 0 | loss_scale 2 | train_wall 1566 | gb_free 20.2 | wall 13516 epoch 008 | loss 4.22 | nll_loss 2.607 | ppl 6.09 | wps 386875 | ups 0.89 | wpb 433550 | bsz 16509.2 | num_updates 13490 | lr 0.000544533 | gnorm 0.237 | clip 0 | loss_scale 2 | train_wall 1566 | gb_free 20.2 | wall 13516 epoch 008 | loss 4.22 | nll_loss 2.607 | ppl 6.09 | wps 386875 | ups 0.89 | wpb 433550 | bsz 16509.2 | num_updates 13490 | lr 0.000544533 | gnorm 0.237 | clip 0 | loss_scale 2 | train_wall 1566 | gb_free 20.2 | wall 13516 epoch 008 | loss 4.22 | nll_loss 2.607 | ppl 6.09 | wps 386875 | ups 0.89 | wpb 433550 | bsz 16509.2 | num_updates 13490 | lr 0.000544533 | gnorm 0.237 | clip 0 | loss_scale 2 | train_wall 1566 | gb_free 20.2 | wall 13516 epoch 008 | loss 4.22 | nll_loss 2.607 | ppl 6.09 | wps 386875 | ups 0.89 | wpb 433550 | bsz 16509.2 | num_updates 13490 | lr 0.000544533 | gnorm 0.237 | clip 0 | loss_scale 2 | train_wall 1566 | gb_free 20.2 | wall 13516 epoch 008 | loss 4.22 | nll_loss 2.607 | ppl 6.09 | wps 386875 | ups 0.89 | wpb 433550 | bsz 16509.2 | num_updates 13490 | lr 0.000544533 | gnorm 0.237 | clip 0 | loss_scale 2 | train_wall 1566 | gb_free 20.2 | wall 13516 epoch 008 | loss 4.22 | nll_loss 2.607 | ppl 6.09 | wps 386875 | ups 0.89 | wpb 433550 | bsz 16509.2 | num_updates 13490 | lr 0.000544533 | gnorm 0.237 | clip 0 | loss_scale 2 | train_wall 1566 | gb_free 20.2 | wall 13516 Start iterating over samples epoch 009: 10 / 1689 loss=4.213, nll_loss=2.6, ppl=6.06, wps=457390, ups=1.06, wpb=431027, bsz=16471.4, num_updates=13500, lr=0.000544331, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=20.3, wall=13526 epoch 009: 10 / 1689 loss=4.213, nll_loss=2.6, ppl=6.06, wps=457390, ups=1.06, wpb=431027, bsz=16471.4, num_updates=13500, lr=0.000544331, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=20.3, wall=13526 epoch 009: 10 / 1689 loss=4.213, nll_loss=2.6, ppl=6.06, wps=457390, ups=1.06, wpb=431027, bsz=16471.4, num_updates=13500, lr=0.000544331, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=20.3, wall=13526 epoch 009: 10 / 1689 loss=4.213, nll_loss=2.6, ppl=6.06, wps=457390, ups=1.06, wpb=431027, bsz=16471.4, num_updates=13500, lr=0.000544331, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=20.3, wall=13526 epoch 009: 10 / 1689 loss=4.213, nll_loss=2.6, ppl=6.06, wps=457390, ups=1.06, wpb=431027, bsz=16471.4, num_updates=13500, lr=0.000544331, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=20.3, wall=13526 epoch 009: 10 / 1689 loss=4.213, nll_loss=2.6, ppl=6.06, wps=457390, ups=1.06, wpb=431027, bsz=16471.4, num_updates=13500, lr=0.000544331, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=20.3, wall=13526 epoch 009: 10 / 1689 loss=4.213, nll_loss=2.6, ppl=6.06, wps=457390, ups=1.06, wpb=431027, bsz=16471.4, num_updates=13500, lr=0.000544331, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=20.3, wall=13526 epoch 009: 10 / 1689 loss=4.213, nll_loss=2.6, ppl=6.06, wps=457390, ups=1.06, wpb=431027, bsz=16471.4, num_updates=13500, lr=0.000544331, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=20.3, wall=13526 epoch 009: 10 / 1689 loss=4.213, nll_loss=2.6, ppl=6.06, wps=457390, ups=1.06, wpb=431027, bsz=16471.4, num_updates=13500, lr=0.000544331, gnorm=0.242, clip=0, loss_scale=2, train_wall=93, gb_free=20.3, wall=13526 epoch 009: 110 / 1689 loss=4.166, nll_loss=2.545, ppl=5.84, wps=458972, ups=1.06, wpb=431261, bsz=16589.8, num_updates=13600, lr=0.000542326, gnorm=0.23, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=13620 epoch 009: 110 / 1689 loss=4.166, nll_loss=2.545, ppl=5.84, wps=458972, ups=1.06, wpb=431261, bsz=16589.8, num_updates=13600, lr=0.000542326, gnorm=0.23, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=13620 epoch 009: 110 / 1689 loss=4.166, nll_loss=2.545, ppl=5.84, wps=458972, ups=1.06, wpb=431261, bsz=16589.8, num_updates=13600, lr=0.000542326, gnorm=0.23, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=13620 epoch 009: 110 / 1689 loss=4.166, nll_loss=2.545, ppl=5.84, wps=458972, ups=1.06, wpb=431261, bsz=16589.8, num_updates=13600, lr=0.000542326, gnorm=0.23, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=13620 epoch 009: 110 / 1689 loss=4.166, nll_loss=2.545, ppl=5.84, wps=458972, ups=1.06, wpb=431261, bsz=16589.8, num_updates=13600, lr=0.000542326, gnorm=0.23, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=13620 epoch 009: 110 / 1689 loss=4.166, nll_loss=2.545, ppl=5.84, wps=458972, ups=1.06, wpb=431261, bsz=16589.8, num_updates=13600, lr=0.000542326, gnorm=0.23, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=13620 epoch 009: 110 / 1689 loss=4.166, nll_loss=2.545, ppl=5.84, wps=458972, ups=1.06, wpb=431261, bsz=16589.8, num_updates=13600, lr=0.000542326, gnorm=0.23, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=13620 epoch 009: 110 / 1689 loss=4.166, nll_loss=2.545, ppl=5.84, wps=458972, ups=1.06, wpb=431261, bsz=16589.8, num_updates=13600, lr=0.000542326, gnorm=0.23, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=13620 epoch 009: 110 / 1689 loss=4.166, nll_loss=2.545, ppl=5.84, wps=458972, ups=1.06, wpb=431261, bsz=16589.8, num_updates=13600, lr=0.000542326, gnorm=0.23, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=13620 epoch 009: 210 / 1689 loss=4.182, nll_loss=2.563, ppl=5.91, wps=463051, ups=1.07, wpb=434142, bsz=16687.8, num_updates=13700, lr=0.000540343, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=13713 epoch 009: 210 / 1689 loss=4.182, nll_loss=2.563, ppl=5.91, wps=463051, ups=1.07, wpb=434142, bsz=16687.8, num_updates=13700, lr=0.000540343, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=13713 epoch 009: 210 / 1689 loss=4.182, nll_loss=2.563, ppl=5.91, wps=463051, ups=1.07, wpb=434142, bsz=16687.8, num_updates=13700, lr=0.000540343, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=13713 epoch 009: 210 / 1689 loss=4.182, nll_loss=2.563, ppl=5.91, wps=463051, ups=1.07, wpb=434142, bsz=16687.8, num_updates=13700, lr=0.000540343, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=13713 epoch 009: 210 / 1689 loss=4.182, nll_loss=2.563, ppl=5.91, wps=463051, ups=1.07, wpb=434142, bsz=16687.8, num_updates=13700, lr=0.000540343, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=13713 epoch 009: 210 / 1689 loss=4.182, nll_loss=2.563, ppl=5.91, wps=463051, ups=1.07, wpb=434142, bsz=16687.8, num_updates=13700, lr=0.000540343, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=13713 epoch 009: 210 / 1689 loss=4.182, nll_loss=2.563, ppl=5.91, wps=463051, ups=1.07, wpb=434142, bsz=16687.8, num_updates=13700, lr=0.000540343, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=13713 epoch 009: 210 / 1689 loss=4.182, nll_loss=2.563, ppl=5.91, wps=463051, ups=1.07, wpb=434142, bsz=16687.8, num_updates=13700, lr=0.000540343, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=13713 epoch 009: 210 / 1689 loss=4.182, nll_loss=2.563, ppl=5.91, wps=463051, ups=1.07, wpb=434142, bsz=16687.8, num_updates=13700, lr=0.000540343, gnorm=0.237, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=13713 epoch 009: 311 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=456350, ups=1.05, wpb=435067, bsz=16495.4, num_updates=13800, lr=0.000538382, gnorm=0.228, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=13809 epoch 009: 311 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=456350, ups=1.05, wpb=435067, bsz=16495.4, num_updates=13800, lr=0.000538382, gnorm=0.228, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=13809 epoch 009: 311 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=456350, ups=1.05, wpb=435067, bsz=16495.4, num_updates=13800, lr=0.000538382, gnorm=0.228, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=13809 epoch 009: 311 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=456350, ups=1.05, wpb=435067, bsz=16495.4, num_updates=13800, lr=0.000538382, gnorm=0.228, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=13809 epoch 009: 311 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=456350, ups=1.05, wpb=435067, bsz=16495.4, num_updates=13800, lr=0.000538382, gnorm=0.228, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=13809 epoch 009: 311 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=456350, ups=1.05, wpb=435067, bsz=16495.4, num_updates=13800, lr=0.000538382, gnorm=0.228, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=13809 epoch 009: 311 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=456350, ups=1.05, wpb=435067, bsz=16495.4, num_updates=13800, lr=0.000538382, gnorm=0.228, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=13809 epoch 009: 311 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=456350, ups=1.05, wpb=435067, bsz=16495.4, num_updates=13800, lr=0.000538382, gnorm=0.228, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=13809 epoch 009: 311 / 1689 loss=4.189, nll_loss=2.571, ppl=5.94, wps=456350, ups=1.05, wpb=435067, bsz=16495.4, num_updates=13800, lr=0.000538382, gnorm=0.228, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=13809 epoch 009: 411 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=457665, ups=1.05, wpb=435564, bsz=16321.4, num_updates=13900, lr=0.000536442, gnorm=0.226, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=13904 epoch 009: 411 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=457665, ups=1.05, wpb=435564, bsz=16321.4, num_updates=13900, lr=0.000536442, gnorm=0.226, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=13904 epoch 009: 411 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=457665, ups=1.05, wpb=435564, bsz=16321.4, num_updates=13900, lr=0.000536442, gnorm=0.226, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=13904 epoch 009: 411 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=457665, ups=1.05, wpb=435564, bsz=16321.4, num_updates=13900, lr=0.000536442, gnorm=0.226, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=13904 epoch 009: 411 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=457665, ups=1.05, wpb=435564, bsz=16321.4, num_updates=13900, lr=0.000536442, gnorm=0.226, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=13904 epoch 009: 411 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=457665, ups=1.05, wpb=435564, bsz=16321.4, num_updates=13900, lr=0.000536442, gnorm=0.226, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=13904 epoch 009: 411 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=457665, ups=1.05, wpb=435564, bsz=16321.4, num_updates=13900, lr=0.000536442, gnorm=0.226, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=13904 epoch 009: 411 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=457665, ups=1.05, wpb=435564, bsz=16321.4, num_updates=13900, lr=0.000536442, gnorm=0.226, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=13904 epoch 009: 411 / 1689 loss=4.183, nll_loss=2.565, ppl=5.92, wps=457665, ups=1.05, wpb=435564, bsz=16321.4, num_updates=13900, lr=0.000536442, gnorm=0.226, clip=0, loss_scale=2, train_wall=94, gb_free=19.3, wall=13904 epoch 009: 511 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=455480, ups=1.05, wpb=432306, bsz=16522.7, num_updates=14000, lr=0.000534522, gnorm=0.238, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=13999 epoch 009: 511 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=455480, ups=1.05, wpb=432306, bsz=16522.7, num_updates=14000, lr=0.000534522, gnorm=0.238, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=13999 epoch 009: 511 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=455480, ups=1.05, wpb=432306, bsz=16522.7, num_updates=14000, lr=0.000534522, gnorm=0.238, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=13999 epoch 009: 511 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=455480, ups=1.05, wpb=432306, bsz=16522.7, num_updates=14000, lr=0.000534522, gnorm=0.238, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=13999 epoch 009: 511 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=455480, ups=1.05, wpb=432306, bsz=16522.7, num_updates=14000, lr=0.000534522, gnorm=0.238, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=13999 epoch 009: 511 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=455480, ups=1.05, wpb=432306, bsz=16522.7, num_updates=14000, lr=0.000534522, gnorm=0.238, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=13999 epoch 009: 511 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=455480, ups=1.05, wpb=432306, bsz=16522.7, num_updates=14000, lr=0.000534522, gnorm=0.238, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=13999 epoch 009: 511 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=455480, ups=1.05, wpb=432306, bsz=16522.7, num_updates=14000, lr=0.000534522, gnorm=0.238, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=13999 epoch 009: 511 / 1689 loss=4.186, nll_loss=2.568, ppl=5.93, wps=455480, ups=1.05, wpb=432306, bsz=16522.7, num_updates=14000, lr=0.000534522, gnorm=0.238, clip=0, loss_scale=2, train_wall=93, gb_free=19.7, wall=13999 begin validation on "valid" subset epoch 009 | valid on 'valid' subset | loss 4.317 | nll_loss 2.696 | ppl 6.48 | wps 0 | wpb 42662 | bsz 2032 | num_updates 14000 | best_loss 4.317 epoch 009 | valid on 'valid' subset | loss 4.317 | nll_loss 2.696 | ppl 6.48 | wps 0 | wpb 42662 | bsz 2032 | num_updates 14000 | best_loss 4.317 epoch 009 | valid on 'valid' subset | loss 4.317 | nll_loss 2.696 | ppl 6.48 | wps 0 | wpb 42662 | bsz 2032 | num_updates 14000 | best_loss 4.317 epoch 009 | valid on 'valid' subset | loss 4.317 | nll_loss 2.696 | ppl 6.48 | wps 0 | wpb 42662 | bsz 2032 | num_updates 14000 | best_loss 4.317 epoch 009 | valid on 'valid' subset | loss 4.317 | nll_loss 2.696 | ppl 6.48 | wps 0 | wpb 42662 | bsz 2032 | num_updates 14000 | best_loss 4.317 epoch 009 | valid on 'valid' subset | loss 4.317 | nll_loss 2.696 | ppl 6.48 | wps 0 | wpb 42662 | bsz 2032 | num_updates 14000 | best_loss 4.317 epoch 009 | valid on 'valid' subset | loss 4.317 | nll_loss 2.696 | ppl 6.48 | wps 0 | wpb 42662 | bsz 2032 | num_updates 14000 | best_loss 4.317 epoch 009 | valid on 'valid' subset | loss 4.317 | nll_loss 2.696 | ppl 6.48 | wps 0 | wpb 42662 | bsz 2032 | num_updates 14000 | best_loss 4.317 epoch 009 | valid on 'valid' subset | loss 4.317 | nll_loss 2.696 | ppl 6.48 | wps 0 | wpb 42662 | bsz 2032 | num_updates 14000 | best_loss 4.317 epoch 009: 611 / 1689 loss=4.194, nll_loss=2.578, ppl=5.97, wps=373956, ups=0.86, wpb=433970, bsz=16257.6, num_updates=14100, lr=0.000532624, gnorm=0.235, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=14115 epoch 009: 611 / 1689 loss=4.194, nll_loss=2.578, ppl=5.97, wps=373956, ups=0.86, wpb=433970, bsz=16257.6, num_updates=14100, lr=0.000532624, gnorm=0.235, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=14115 epoch 009: 611 / 1689 loss=4.194, nll_loss=2.578, ppl=5.97, wps=373956, ups=0.86, wpb=433970, bsz=16257.6, num_updates=14100, lr=0.000532624, gnorm=0.235, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=14115 epoch 009: 611 / 1689 loss=4.194, nll_loss=2.578, ppl=5.97, wps=373956, ups=0.86, wpb=433970, bsz=16257.6, num_updates=14100, lr=0.000532624, gnorm=0.235, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=14115 epoch 009: 611 / 1689 loss=4.194, nll_loss=2.578, ppl=5.97, wps=373956, ups=0.86, wpb=433970, bsz=16257.6, num_updates=14100, lr=0.000532624, gnorm=0.235, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=14115 epoch 009: 611 / 1689 loss=4.194, nll_loss=2.578, ppl=5.97, wps=373956, ups=0.86, wpb=433970, bsz=16257.6, num_updates=14100, lr=0.000532624, gnorm=0.235, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=14115 epoch 009: 611 / 1689 loss=4.194, nll_loss=2.578, ppl=5.97, wps=373956, ups=0.86, wpb=433970, bsz=16257.6, num_updates=14100, lr=0.000532624, gnorm=0.235, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=14115 epoch 009: 611 / 1689 loss=4.194, nll_loss=2.578, ppl=5.97, wps=373956, ups=0.86, wpb=433970, bsz=16257.6, num_updates=14100, lr=0.000532624, gnorm=0.235, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=14115 epoch 009: 611 / 1689 loss=4.194, nll_loss=2.578, ppl=5.97, wps=373956, ups=0.86, wpb=433970, bsz=16257.6, num_updates=14100, lr=0.000532624, gnorm=0.235, clip=0, loss_scale=2, train_wall=93, gb_free=19.1, wall=14115 epoch 009: 711 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=460903, ups=1.06, wpb=435390, bsz=16517.4, num_updates=14200, lr=0.000530745, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=14209 epoch 009: 711 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=460903, ups=1.06, wpb=435390, bsz=16517.4, num_updates=14200, lr=0.000530745, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=14209 epoch 009: 711 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=460903, ups=1.06, wpb=435390, bsz=16517.4, num_updates=14200, lr=0.000530745, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=14209 epoch 009: 711 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=460903, ups=1.06, wpb=435390, bsz=16517.4, num_updates=14200, lr=0.000530745, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=14209 epoch 009: 711 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=460903, ups=1.06, wpb=435390, bsz=16517.4, num_updates=14200, lr=0.000530745, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=14209 epoch 009: 711 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=460903, ups=1.06, wpb=435390, bsz=16517.4, num_updates=14200, lr=0.000530745, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=14209 epoch 009: 711 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=460903, ups=1.06, wpb=435390, bsz=16517.4, num_updates=14200, lr=0.000530745, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=14209 epoch 009: 711 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=460903, ups=1.06, wpb=435390, bsz=16517.4, num_updates=14200, lr=0.000530745, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=14209 epoch 009: 711 / 1689 loss=4.2, nll_loss=2.585, ppl=6, wps=460903, ups=1.06, wpb=435390, bsz=16517.4, num_updates=14200, lr=0.000530745, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=20.4, wall=14209 epoch 009: 812 / 1689 loss=4.188, nll_loss=2.572, ppl=5.94, wps=451426, ups=1.04, wpb=432743, bsz=16618.8, num_updates=14300, lr=0.000528886, gnorm=0.236, clip=0, loss_scale=1, train_wall=94, gb_free=19.6, wall=14305 epoch 009: 812 / 1689 loss=4.188, nll_loss=2.572, ppl=5.94, wps=451426, ups=1.04, wpb=432743, bsz=16618.8, num_updates=14300, lr=0.000528886, gnorm=0.236, clip=0, loss_scale=1, train_wall=94, gb_free=19.6, wall=14305 epoch 009: 812 / 1689 loss=4.188, nll_loss=2.572, ppl=5.94, wps=451426, ups=1.04, wpb=432743, bsz=16618.8, num_updates=14300, lr=0.000528886, gnorm=0.236, clip=0, loss_scale=1, train_wall=94, gb_free=19.6, wall=14305 epoch 009: 812 / 1689 loss=4.188, nll_loss=2.572, ppl=5.94, wps=451426, ups=1.04, wpb=432743, bsz=16618.8, num_updates=14300, lr=0.000528886, gnorm=0.236, clip=0, loss_scale=1, train_wall=94, gb_free=19.6, wall=14305 epoch 009: 812 / 1689 loss=4.188, nll_loss=2.572, ppl=5.94, wps=451426, ups=1.04, wpb=432743, bsz=16618.8, num_updates=14300, lr=0.000528886, gnorm=0.236, clip=0, loss_scale=1, train_wall=94, gb_free=19.6, wall=14305 epoch 009: 812 / 1689 loss=4.188, nll_loss=2.572, ppl=5.94, wps=451426, ups=1.04, wpb=432743, bsz=16618.8, num_updates=14300, lr=0.000528886, gnorm=0.236, clip=0, loss_scale=1, train_wall=94, gb_free=19.6, wall=14305 epoch 009: 812 / 1689 loss=4.188, nll_loss=2.572, ppl=5.94, wps=451426, ups=1.04, wpb=432743, bsz=16618.8, num_updates=14300, lr=0.000528886, gnorm=0.236, clip=0, loss_scale=1, train_wall=94, gb_free=19.6, wall=14305 epoch 009: 812 / 1689 loss=4.188, nll_loss=2.572, ppl=5.94, wps=451426, ups=1.04, wpb=432743, bsz=16618.8, num_updates=14300, lr=0.000528886, gnorm=0.236, clip=0, loss_scale=1, train_wall=94, gb_free=19.6, wall=14305 epoch 009: 812 / 1689 loss=4.188, nll_loss=2.572, ppl=5.94, wps=451426, ups=1.04, wpb=432743, bsz=16618.8, num_updates=14300, lr=0.000528886, gnorm=0.236, clip=0, loss_scale=1, train_wall=94, gb_free=19.6, wall=14305 epoch 009: 912 / 1689 loss=4.188, nll_loss=2.572, ppl=5.94, wps=457563, ups=1.05, wpb=433917, bsz=16627.3, num_updates=14400, lr=0.000527046, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=14400 epoch 009: 912 / 1689 loss=4.188, nll_loss=2.572, ppl=5.94, wps=457563, ups=1.05, wpb=433917, bsz=16627.3, num_updates=14400, lr=0.000527046, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=14400 epoch 009: 912 / 1689 loss=4.188, nll_loss=2.572, ppl=5.94, wps=457563, ups=1.05, wpb=433917, bsz=16627.3, num_updates=14400, lr=0.000527046, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=14400 epoch 009: 912 / 1689 loss=4.188, nll_loss=2.572, ppl=5.94, wps=457563, ups=1.05, wpb=433917, bsz=16627.3, num_updates=14400, lr=0.000527046, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=14400 epoch 009: 912 / 1689 loss=4.188, nll_loss=2.572, ppl=5.94, wps=457563, ups=1.05, wpb=433917, bsz=16627.3, num_updates=14400, lr=0.000527046, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=14400 epoch 009: 912 / 1689 loss=4.188, nll_loss=2.572, ppl=5.94, wps=457563, ups=1.05, wpb=433917, bsz=16627.3, num_updates=14400, lr=0.000527046, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=14400 epoch 009: 912 / 1689 loss=4.188, nll_loss=2.572, ppl=5.94, wps=457563, ups=1.05, wpb=433917, bsz=16627.3, num_updates=14400, lr=0.000527046, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=14400 epoch 009: 912 / 1689 loss=4.188, nll_loss=2.572, ppl=5.94, wps=457563, ups=1.05, wpb=433917, bsz=16627.3, num_updates=14400, lr=0.000527046, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=14400 epoch 009: 912 / 1689 loss=4.188, nll_loss=2.572, ppl=5.94, wps=457563, ups=1.05, wpb=433917, bsz=16627.3, num_updates=14400, lr=0.000527046, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=14400 epoch 009: 1012 / 1689 loss=4.179, nll_loss=2.562, ppl=5.91, wps=455926, ups=1.05, wpb=432730, bsz=16403.5, num_updates=14500, lr=0.000525226, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=14495 epoch 009: 1012 / 1689 loss=4.179, nll_loss=2.562, ppl=5.91, wps=455926, ups=1.05, wpb=432730, bsz=16403.5, num_updates=14500, lr=0.000525226, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=14495 epoch 009: 1012 / 1689 loss=4.179, nll_loss=2.562, ppl=5.91, wps=455926, ups=1.05, wpb=432730, bsz=16403.5, num_updates=14500, lr=0.000525226, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=14495 epoch 009: 1012 / 1689 loss=4.179, nll_loss=2.562, ppl=5.91, wps=455926, ups=1.05, wpb=432730, bsz=16403.5, num_updates=14500, lr=0.000525226, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=14495 epoch 009: 1012 / 1689 loss=4.179, nll_loss=2.562, ppl=5.91, wps=455926, ups=1.05, wpb=432730, bsz=16403.5, num_updates=14500, lr=0.000525226, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=14495 epoch 009: 1012 / 1689 loss=4.179, nll_loss=2.562, ppl=5.91, wps=455926, ups=1.05, wpb=432730, bsz=16403.5, num_updates=14500, lr=0.000525226, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=14495 epoch 009: 1012 / 1689 loss=4.179, nll_loss=2.562, ppl=5.91, wps=455926, ups=1.05, wpb=432730, bsz=16403.5, num_updates=14500, lr=0.000525226, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=14495 epoch 009: 1012 / 1689 loss=4.179, nll_loss=2.562, ppl=5.91, wps=455926, ups=1.05, wpb=432730, bsz=16403.5, num_updates=14500, lr=0.000525226, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=14495 epoch 009: 1012 / 1689 loss=4.179, nll_loss=2.562, ppl=5.91, wps=455926, ups=1.05, wpb=432730, bsz=16403.5, num_updates=14500, lr=0.000525226, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=14495 epoch 009: 1112 / 1689 loss=4.198, nll_loss=2.583, ppl=5.99, wps=459196, ups=1.06, wpb=434785, bsz=16629, num_updates=14600, lr=0.000523424, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=14590 epoch 009: 1112 / 1689 loss=4.198, nll_loss=2.583, ppl=5.99, wps=459196, ups=1.06, wpb=434785, bsz=16629, num_updates=14600, lr=0.000523424, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=14590 epoch 009: 1112 / 1689 loss=4.198, nll_loss=2.583, ppl=5.99, wps=459196, ups=1.06, wpb=434785, bsz=16629, num_updates=14600, lr=0.000523424, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=14590 epoch 009: 1112 / 1689 loss=4.198, nll_loss=2.583, ppl=5.99, wps=459196, ups=1.06, wpb=434785, bsz=16629, num_updates=14600, lr=0.000523424, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=14590 epoch 009: 1112 / 1689 loss=4.198, nll_loss=2.583, ppl=5.99, wps=459196, ups=1.06, wpb=434785, bsz=16629, num_updates=14600, lr=0.000523424, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=14590 epoch 009: 1112 / 1689 loss=4.198, nll_loss=2.583, ppl=5.99, wps=459196, ups=1.06, wpb=434785, bsz=16629, num_updates=14600, lr=0.000523424, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=14590 epoch 009: 1112 / 1689 loss=4.198, nll_loss=2.583, ppl=5.99, wps=459196, ups=1.06, wpb=434785, bsz=16629, num_updates=14600, lr=0.000523424, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=14590 epoch 009: 1112 / 1689 loss=4.198, nll_loss=2.583, ppl=5.99, wps=459196, ups=1.06, wpb=434785, bsz=16629, num_updates=14600, lr=0.000523424, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=14590 epoch 009: 1112 / 1689 loss=4.198, nll_loss=2.583, ppl=5.99, wps=459196, ups=1.06, wpb=434785, bsz=16629, num_updates=14600, lr=0.000523424, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=14590 epoch 009: 1212 / 1689 loss=4.182, nll_loss=2.565, ppl=5.92, wps=456262, ups=1.06, wpb=432385, bsz=16344.1, num_updates=14700, lr=0.000521641, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=14684 epoch 009: 1212 / 1689 loss=4.182, nll_loss=2.565, ppl=5.92, wps=456262, ups=1.06, wpb=432385, bsz=16344.1, num_updates=14700, lr=0.000521641, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=14684 epoch 009: 1212 / 1689 loss=4.182, nll_loss=2.565, ppl=5.92, wps=456262, ups=1.06, wpb=432385, bsz=16344.1, num_updates=14700, lr=0.000521641, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=14684 epoch 009: 1212 / 1689 loss=4.182, nll_loss=2.565, ppl=5.92, wps=456262, ups=1.06, wpb=432385, bsz=16344.1, num_updates=14700, lr=0.000521641, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=14684 epoch 009: 1212 / 1689 loss=4.182, nll_loss=2.565, ppl=5.92, wps=456262, ups=1.06, wpb=432385, bsz=16344.1, num_updates=14700, lr=0.000521641, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=14684 epoch 009: 1212 / 1689 loss=4.182, nll_loss=2.565, ppl=5.92, wps=456262, ups=1.06, wpb=432385, bsz=16344.1, num_updates=14700, lr=0.000521641, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=14684 epoch 009: 1212 / 1689 loss=4.182, nll_loss=2.565, ppl=5.92, wps=456262, ups=1.06, wpb=432385, bsz=16344.1, num_updates=14700, lr=0.000521641, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=14684 epoch 009: 1212 / 1689 loss=4.182, nll_loss=2.565, ppl=5.92, wps=456262, ups=1.06, wpb=432385, bsz=16344.1, num_updates=14700, lr=0.000521641, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=14684 epoch 009: 1212 / 1689 loss=4.182, nll_loss=2.565, ppl=5.92, wps=456262, ups=1.06, wpb=432385, bsz=16344.1, num_updates=14700, lr=0.000521641, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=14684 epoch 009: 1312 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=453561, ups=1.05, wpb=431928, bsz=16268.7, num_updates=14800, lr=0.000519875, gnorm=0.224, clip=0, loss_scale=2, train_wall=94, gb_free=19.5, wall=14780 epoch 009: 1312 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=453561, ups=1.05, wpb=431928, bsz=16268.7, num_updates=14800, lr=0.000519875, gnorm=0.224, clip=0, loss_scale=2, train_wall=94, gb_free=19.5, wall=14780 epoch 009: 1312 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=453561, ups=1.05, wpb=431928, bsz=16268.7, num_updates=14800, lr=0.000519875, gnorm=0.224, clip=0, loss_scale=2, train_wall=94, gb_free=19.5, wall=14780 epoch 009: 1312 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=453561, ups=1.05, wpb=431928, bsz=16268.7, num_updates=14800, lr=0.000519875, gnorm=0.224, clip=0, loss_scale=2, train_wall=94, gb_free=19.5, wall=14780 epoch 009: 1312 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=453561, ups=1.05, wpb=431928, bsz=16268.7, num_updates=14800, lr=0.000519875, gnorm=0.224, clip=0, loss_scale=2, train_wall=94, gb_free=19.5, wall=14780 epoch 009: 1312 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=453561, ups=1.05, wpb=431928, bsz=16268.7, num_updates=14800, lr=0.000519875, gnorm=0.224, clip=0, loss_scale=2, train_wall=94, gb_free=19.5, wall=14780 epoch 009: 1312 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=453561, ups=1.05, wpb=431928, bsz=16268.7, num_updates=14800, lr=0.000519875, gnorm=0.224, clip=0, loss_scale=2, train_wall=94, gb_free=19.5, wall=14780 epoch 009: 1312 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=453561, ups=1.05, wpb=431928, bsz=16268.7, num_updates=14800, lr=0.000519875, gnorm=0.224, clip=0, loss_scale=2, train_wall=94, gb_free=19.5, wall=14780 epoch 009: 1312 / 1689 loss=4.189, nll_loss=2.573, ppl=5.95, wps=453561, ups=1.05, wpb=431928, bsz=16268.7, num_updates=14800, lr=0.000519875, gnorm=0.224, clip=0, loss_scale=2, train_wall=94, gb_free=19.5, wall=14780 epoch 009: 1412 / 1689 loss=4.194, nll_loss=2.579, ppl=5.98, wps=454446, ups=1.05, wpb=434032, bsz=16623.4, num_updates=14900, lr=0.000518128, gnorm=0.227, clip=0, loss_scale=2, train_wall=94, gb_free=18, wall=14875 epoch 009: 1412 / 1689 loss=4.194, nll_loss=2.579, ppl=5.98, wps=454446, ups=1.05, wpb=434032, bsz=16623.4, num_updates=14900, lr=0.000518128, gnorm=0.227, clip=0, loss_scale=2, train_wall=94, gb_free=18, wall=14875 epoch 009: 1412 / 1689 loss=4.194, nll_loss=2.579, ppl=5.98, wps=454446, ups=1.05, wpb=434032, bsz=16623.4, num_updates=14900, lr=0.000518128, gnorm=0.227, clip=0, loss_scale=2, train_wall=94, gb_free=18, wall=14875 epoch 009: 1412 / 1689 loss=4.194, nll_loss=2.579, ppl=5.98, wps=454446, ups=1.05, wpb=434032, bsz=16623.4, num_updates=14900, lr=0.000518128, gnorm=0.227, clip=0, loss_scale=2, train_wall=94, gb_free=18, wall=14875 epoch 009: 1412 / 1689 loss=4.194, nll_loss=2.579, ppl=5.98, wps=454446, ups=1.05, wpb=434032, bsz=16623.4, num_updates=14900, lr=0.000518128, gnorm=0.227, clip=0, loss_scale=2, train_wall=94, gb_free=18, wall=14875 epoch 009: 1412 / 1689 loss=4.194, nll_loss=2.579, ppl=5.98, wps=454446, ups=1.05, wpb=434032, bsz=16623.4, num_updates=14900, lr=0.000518128, gnorm=0.227, clip=0, loss_scale=2, train_wall=94, gb_free=18, wall=14875 epoch 009: 1412 / 1689 loss=4.194, nll_loss=2.579, ppl=5.98, wps=454446, ups=1.05, wpb=434032, bsz=16623.4, num_updates=14900, lr=0.000518128, gnorm=0.227, clip=0, loss_scale=2, train_wall=94, gb_free=18, wall=14875 epoch 009: 1412 / 1689 loss=4.194, nll_loss=2.579, ppl=5.98, wps=454446, ups=1.05, wpb=434032, bsz=16623.4, num_updates=14900, lr=0.000518128, gnorm=0.227, clip=0, loss_scale=2, train_wall=94, gb_free=18, wall=14875 epoch 009: 1412 / 1689 loss=4.194, nll_loss=2.579, ppl=5.98, wps=454446, ups=1.05, wpb=434032, bsz=16623.4, num_updates=14900, lr=0.000518128, gnorm=0.227, clip=0, loss_scale=2, train_wall=94, gb_free=18, wall=14875 epoch 009: 1512 / 1689 loss=4.186, nll_loss=2.571, ppl=5.94, wps=457385, ups=1.06, wpb=433195, bsz=16575.8, num_updates=15000, lr=0.000516398, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=14970 epoch 009: 1512 / 1689 loss=4.186, nll_loss=2.571, ppl=5.94, wps=457385, ups=1.06, wpb=433195, bsz=16575.8, num_updates=15000, lr=0.000516398, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=14970 epoch 009: 1512 / 1689 loss=4.186, nll_loss=2.571, ppl=5.94, wps=457385, ups=1.06, wpb=433195, bsz=16575.8, num_updates=15000, lr=0.000516398, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=14970 epoch 009: 1512 / 1689 loss=4.186, nll_loss=2.571, ppl=5.94, wps=457385, ups=1.06, wpb=433195, bsz=16575.8, num_updates=15000, lr=0.000516398, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=14970 epoch 009: 1512 / 1689 loss=4.186, nll_loss=2.571, ppl=5.94, wps=457385, ups=1.06, wpb=433195, bsz=16575.8, num_updates=15000, lr=0.000516398, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=14970 epoch 009: 1512 / 1689 loss=4.186, nll_loss=2.571, ppl=5.94, wps=457385, ups=1.06, wpb=433195, bsz=16575.8, num_updates=15000, lr=0.000516398, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=14970 epoch 009: 1512 / 1689 loss=4.186, nll_loss=2.571, ppl=5.94, wps=457385, ups=1.06, wpb=433195, bsz=16575.8, num_updates=15000, lr=0.000516398, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=14970 epoch 009: 1512 / 1689 loss=4.186, nll_loss=2.571, ppl=5.94, wps=457385, ups=1.06, wpb=433195, bsz=16575.8, num_updates=15000, lr=0.000516398, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=14970 epoch 009: 1512 / 1689 loss=4.186, nll_loss=2.571, ppl=5.94, wps=457385, ups=1.06, wpb=433195, bsz=16575.8, num_updates=15000, lr=0.000516398, gnorm=0.226, clip=0, loss_scale=2, train_wall=93, gb_free=17.4, wall=14970 begin validation on "valid" subset epoch 009 | valid on 'valid' subset | loss 4.304 | nll_loss 2.684 | ppl 6.43 | wps 0 | wpb 42662 | bsz 2032 | num_updates 15000 | best_loss 4.304 epoch 009 | valid on 'valid' subset | loss 4.304 | nll_loss 2.684 | ppl 6.43 | wps 0 | wpb 42662 | bsz 2032 | num_updates 15000 | best_loss 4.304 epoch 009 | valid on 'valid' subset | loss 4.304 | nll_loss 2.684 | ppl 6.43 | wps 0 | wpb 42662 | bsz 2032 | num_updates 15000 | best_loss 4.304 epoch 009 | valid on 'valid' subset | loss 4.304 | nll_loss 2.684 | ppl 6.43 | wps 0 | wpb 42662 | bsz 2032 | num_updates 15000 | best_loss 4.304 epoch 009 | valid on 'valid' subset | loss 4.304 | nll_loss 2.684 | ppl 6.43 | wps 0 | wpb 42662 | bsz 2032 | num_updates 15000 | best_loss 4.304 epoch 009 | valid on 'valid' subset | loss 4.304 | nll_loss 2.684 | ppl 6.43 | wps 0 | wpb 42662 | bsz 2032 | num_updates 15000 | best_loss 4.304 epoch 009 | valid on 'valid' subset | loss 4.304 | nll_loss 2.684 | ppl 6.43 | wps 0 | wpb 42662 | bsz 2032 | num_updates 15000 | best_loss 4.304 epoch 009 | valid on 'valid' subset | loss 4.304 | nll_loss 2.684 | ppl 6.43 | wps 0 | wpb 42662 | bsz 2032 | num_updates 15000 | best_loss 4.304 epoch 009 | valid on 'valid' subset | loss 4.304 | nll_loss 2.684 | ppl 6.43 | wps 0 | wpb 42662 | bsz 2032 | num_updates 15000 | best_loss 4.304 epoch 009: 1612 / 1689 loss=4.208, nll_loss=2.595, ppl=6.04, wps=306534, ups=0.7, wpb=434870, bsz=16601.6, num_updates=15100, lr=0.000514685, gnorm=0.229, clip=0, loss_scale=2, train_wall=116, gb_free=18.8, wall=15112 epoch 009: 1612 / 1689 loss=4.208, nll_loss=2.595, ppl=6.04, wps=306534, ups=0.7, wpb=434870, bsz=16601.6, num_updates=15100, lr=0.000514685, gnorm=0.229, clip=0, loss_scale=2, train_wall=116, gb_free=18.8, wall=15112 epoch 009: 1612 / 1689 loss=4.208, nll_loss=2.595, ppl=6.04, wps=306534, ups=0.7, wpb=434870, bsz=16601.6, num_updates=15100, lr=0.000514685, gnorm=0.229, clip=0, loss_scale=2, train_wall=116, gb_free=18.8, wall=15112 epoch 009: 1612 / 1689 loss=4.208, nll_loss=2.595, ppl=6.04, wps=306534, ups=0.7, wpb=434870, bsz=16601.6, num_updates=15100, lr=0.000514685, gnorm=0.229, clip=0, loss_scale=2, train_wall=116, gb_free=18.8, wall=15112 epoch 009: 1612 / 1689 loss=4.208, nll_loss=2.595, ppl=6.04, wps=306534, ups=0.7, wpb=434870, bsz=16601.6, num_updates=15100, lr=0.000514685, gnorm=0.229, clip=0, loss_scale=2, train_wall=116, gb_free=18.8, wall=15112 epoch 009: 1612 / 1689 loss=4.208, nll_loss=2.595, ppl=6.04, wps=306534, ups=0.7, wpb=434870, bsz=16601.6, num_updates=15100, lr=0.000514685, gnorm=0.229, clip=0, loss_scale=2, train_wall=116, gb_free=18.8, wall=15112 epoch 009: 1612 / 1689 loss=4.208, nll_loss=2.595, ppl=6.04, wps=306534, ups=0.7, wpb=434870, bsz=16601.6, num_updates=15100, lr=0.000514685, gnorm=0.229, clip=0, loss_scale=2, train_wall=116, gb_free=18.8, wall=15112 epoch 009: 1612 / 1689 loss=4.208, nll_loss=2.595, ppl=6.04, wps=306534, ups=0.7, wpb=434870, bsz=16601.6, num_updates=15100, lr=0.000514685, gnorm=0.229, clip=0, loss_scale=2, train_wall=116, gb_free=18.8, wall=15112 epoch 009: 1612 / 1689 loss=4.208, nll_loss=2.595, ppl=6.04, wps=306534, ups=0.7, wpb=434870, bsz=16601.6, num_updates=15100, lr=0.000514685, gnorm=0.229, clip=0, loss_scale=2, train_wall=116, gb_free=18.8, wall=15112 end of epoch 9 (average epoch stats below) epoch 009 | loss 4.189 | nll_loss 2.572 | ppl 5.95 | wps 438269 | ups 1.01 | wpb 433540 | bsz 16501.9 | num_updates 15176 | lr 0.000513395 | gnorm 0.23 | clip 0 | loss_scale 1 | train_wall 1595 | gb_free 20.3 | wall 15183 epoch 009 | loss 4.189 | nll_loss 2.572 | ppl 5.95 | wps 438269 | ups 1.01 | wpb 433540 | bsz 16501.9 | num_updates 15176 | lr 0.000513395 | gnorm 0.23 | clip 0 | loss_scale 1 | train_wall 1595 | gb_free 20.3 | wall 15183 epoch 009 | loss 4.189 | nll_loss 2.572 | ppl 5.95 | wps 438269 | ups 1.01 | wpb 433540 | bsz 16501.9 | num_updates 15176 | lr 0.000513395 | gnorm 0.23 | clip 0 | loss_scale 1 | train_wall 1595 | gb_free 20.3 | wall 15183 epoch 009 | loss 4.189 | nll_loss 2.572 | ppl 5.95 | wps 438269 | ups 1.01 | wpb 433540 | bsz 16501.9 | num_updates 15176 | lr 0.000513395 | gnorm 0.23 | clip 0 | loss_scale 1 | train_wall 1595 | gb_free 20.3 | wall 15183 epoch 009 | loss 4.189 | nll_loss 2.572 | ppl 5.95 | wps 438269 | ups 1.01 | wpb 433540 | bsz 16501.9 | num_updates 15176 | lr 0.000513395 | gnorm 0.23 | clip 0 | loss_scale 1 | train_wall 1595 | gb_free 20.3 | wall 15183 epoch 009 | loss 4.189 | nll_loss 2.572 | ppl 5.95 | wps 438269 | ups 1.01 | wpb 433540 | bsz 16501.9 | num_updates 15176 | lr 0.000513395 | gnorm 0.23 | clip 0 | loss_scale 1 | train_wall 1595 | gb_free 20.3 | wall 15183 epoch 009 | loss 4.189 | nll_loss 2.572 | ppl 5.95 | wps 438269 | ups 1.01 | wpb 433540 | bsz 16501.9 | num_updates 15176 | lr 0.000513395 | gnorm 0.23 | clip 0 | loss_scale 1 | train_wall 1595 | gb_free 20.3 | wall 15183 epoch 009 | loss 4.189 | nll_loss 2.572 | ppl 5.95 | wps 438269 | ups 1.01 | wpb 433540 | bsz 16501.9 | num_updates 15176 | lr 0.000513395 | gnorm 0.23 | clip 0 | loss_scale 1 | train_wall 1595 | gb_free 20.3 | wall 15183 epoch 009 | loss 4.189 | nll_loss 2.572 | ppl 5.95 | wps 438269 | ups 1.01 | wpb 433540 | bsz 16501.9 | num_updates 15176 | lr 0.000513395 | gnorm 0.23 | clip 0 | loss_scale 1 | train_wall 1595 | gb_free 20.3 | wall 15183 Start iterating over samples epoch 010: 24 / 1689 loss=4.196, nll_loss=2.582, ppl=5.99, wps=452081, ups=1.05, wpb=431677, bsz=16667.3, num_updates=15200, lr=0.000512989, gnorm=0.244, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=15207 epoch 010: 24 / 1689 loss=4.196, nll_loss=2.582, ppl=5.99, wps=452081, ups=1.05, wpb=431677, bsz=16667.3, num_updates=15200, lr=0.000512989, gnorm=0.244, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=15207 epoch 010: 24 / 1689 loss=4.196, nll_loss=2.582, ppl=5.99, wps=452081, ups=1.05, wpb=431677, bsz=16667.3, num_updates=15200, lr=0.000512989, gnorm=0.244, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=15207 epoch 010: 24 / 1689 loss=4.196, nll_loss=2.582, ppl=5.99, wps=452081, ups=1.05, wpb=431677, bsz=16667.3, num_updates=15200, lr=0.000512989, gnorm=0.244, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=15207 epoch 010: 24 / 1689 loss=4.196, nll_loss=2.582, ppl=5.99, wps=452081, ups=1.05, wpb=431677, bsz=16667.3, num_updates=15200, lr=0.000512989, gnorm=0.244, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=15207 epoch 010: 24 / 1689 loss=4.196, nll_loss=2.582, ppl=5.99, wps=452081, ups=1.05, wpb=431677, bsz=16667.3, num_updates=15200, lr=0.000512989, gnorm=0.244, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=15207 epoch 010: 24 / 1689 loss=4.196, nll_loss=2.582, ppl=5.99, wps=452081, ups=1.05, wpb=431677, bsz=16667.3, num_updates=15200, lr=0.000512989, gnorm=0.244, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=15207 epoch 010: 24 / 1689 loss=4.196, nll_loss=2.582, ppl=5.99, wps=452081, ups=1.05, wpb=431677, bsz=16667.3, num_updates=15200, lr=0.000512989, gnorm=0.244, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=15207 epoch 010: 24 / 1689 loss=4.196, nll_loss=2.582, ppl=5.99, wps=452081, ups=1.05, wpb=431677, bsz=16667.3, num_updates=15200, lr=0.000512989, gnorm=0.244, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=15207 epoch 010: 24 / 1689 loss=4.196, nll_loss=2.582, ppl=5.99, wps=452081, ups=1.05, wpb=431677, bsz=16667.3, num_updates=15200, lr=0.000512989, gnorm=0.244, clip=0, loss_scale=1, train_wall=93, gb_free=18.5, wall=15207 epoch 010: 124 / 1689 loss=4.15, nll_loss=2.528, ppl=5.77, wps=458315, ups=1.06, wpb=434212, bsz=16468.2, num_updates=15300, lr=0.00051131, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=20, wall=15302 epoch 010: 124 / 1689 loss=4.15, nll_loss=2.528, ppl=5.77, wps=458315, ups=1.06, wpb=434212, bsz=16468.2, num_updates=15300, lr=0.00051131, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=20, wall=15302 epoch 010: 124 / 1689 loss=4.15, nll_loss=2.528, ppl=5.77, wps=458315, ups=1.06, wpb=434212, bsz=16468.2, num_updates=15300, lr=0.00051131, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=20, wall=15302 epoch 010: 124 / 1689 loss=4.15, nll_loss=2.528, ppl=5.77, wps=458315, ups=1.06, wpb=434212, bsz=16468.2, num_updates=15300, lr=0.00051131, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=20, wall=15302 epoch 010: 124 / 1689 loss=4.15, nll_loss=2.528, ppl=5.77, wps=458315, ups=1.06, wpb=434212, bsz=16468.2, num_updates=15300, lr=0.00051131, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=20, wall=15302 epoch 010: 124 / 1689 loss=4.15, nll_loss=2.528, ppl=5.77, wps=458315, ups=1.06, wpb=434212, bsz=16468.2, num_updates=15300, lr=0.00051131, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=20, wall=15302 epoch 010: 124 / 1689 loss=4.15, nll_loss=2.528, ppl=5.77, wps=458315, ups=1.06, wpb=434212, bsz=16468.2, num_updates=15300, lr=0.00051131, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=20, wall=15302 epoch 010: 124 / 1689 loss=4.15, nll_loss=2.528, ppl=5.77, wps=458315, ups=1.06, wpb=434212, bsz=16468.2, num_updates=15300, lr=0.00051131, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=20, wall=15302 epoch 010: 124 / 1689 loss=4.15, nll_loss=2.528, ppl=5.77, wps=458315, ups=1.06, wpb=434212, bsz=16468.2, num_updates=15300, lr=0.00051131, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=20, wall=15302 epoch 010: 124 / 1689 loss=4.15, nll_loss=2.528, ppl=5.77, wps=458315, ups=1.06, wpb=434212, bsz=16468.2, num_updates=15300, lr=0.00051131, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=20, wall=15302 epoch 010: 224 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=457105, ups=1.06, wpb=433221, bsz=16646.4, num_updates=15400, lr=0.000509647, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=20, wall=15397 epoch 010: 224 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=457105, ups=1.06, wpb=433221, bsz=16646.4, num_updates=15400, lr=0.000509647, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=20, wall=15397 epoch 010: 224 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=457105, ups=1.06, wpb=433221, bsz=16646.4, num_updates=15400, lr=0.000509647, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=20, wall=15397 epoch 010: 224 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=457105, ups=1.06, wpb=433221, bsz=16646.4, num_updates=15400, lr=0.000509647, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=20, wall=15397 epoch 010: 224 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=457105, ups=1.06, wpb=433221, bsz=16646.4, num_updates=15400, lr=0.000509647, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=20, wall=15397 epoch 010: 224 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=457105, ups=1.06, wpb=433221, bsz=16646.4, num_updates=15400, lr=0.000509647, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=20, wall=15397 epoch 010: 224 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=457105, ups=1.06, wpb=433221, bsz=16646.4, num_updates=15400, lr=0.000509647, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=20, wall=15397 epoch 010: 224 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=457105, ups=1.06, wpb=433221, bsz=16646.4, num_updates=15400, lr=0.000509647, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=20, wall=15397 epoch 010: 224 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=457105, ups=1.06, wpb=433221, bsz=16646.4, num_updates=15400, lr=0.000509647, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=20, wall=15397 epoch 010: 224 / 1689 loss=4.146, nll_loss=2.524, ppl=5.75, wps=457105, ups=1.06, wpb=433221, bsz=16646.4, num_updates=15400, lr=0.000509647, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=20, wall=15397 epoch 010: 324 / 1689 loss=4.154, nll_loss=2.533, ppl=5.79, wps=457517, ups=1.05, wpb=433709, bsz=16459.2, num_updates=15500, lr=0.000508001, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=15491 epoch 010: 324 / 1689 loss=4.154, nll_loss=2.533, ppl=5.79, wps=457517, ups=1.05, wpb=433709, bsz=16459.2, num_updates=15500, lr=0.000508001, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=15491 epoch 010: 324 / 1689 loss=4.154, nll_loss=2.533, ppl=5.79, wps=457517, ups=1.05, wpb=433709, bsz=16459.2, num_updates=15500, lr=0.000508001, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=15491 epoch 010: 324 / 1689 loss=4.154, nll_loss=2.533, ppl=5.79, wps=457517, ups=1.05, wpb=433709, bsz=16459.2, num_updates=15500, lr=0.000508001, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=15491 epoch 010: 324 / 1689 loss=4.154, nll_loss=2.533, ppl=5.79, wps=457517, ups=1.05, wpb=433709, bsz=16459.2, num_updates=15500, lr=0.000508001, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=15491 epoch 010: 324 / 1689 loss=4.154, nll_loss=2.533, ppl=5.79, wps=457517, ups=1.05, wpb=433709, bsz=16459.2, num_updates=15500, lr=0.000508001, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=15491 epoch 010: 324 / 1689 loss=4.154, nll_loss=2.533, ppl=5.79, wps=457517, ups=1.05, wpb=433709, bsz=16459.2, num_updates=15500, lr=0.000508001, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=15491 epoch 010: 324 / 1689 loss=4.154, nll_loss=2.533, ppl=5.79, wps=457517, ups=1.05, wpb=433709, bsz=16459.2, num_updates=15500, lr=0.000508001, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=15491 epoch 010: 324 / 1689 loss=4.154, nll_loss=2.533, ppl=5.79, wps=457517, ups=1.05, wpb=433709, bsz=16459.2, num_updates=15500, lr=0.000508001, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=15491 epoch 010: 324 / 1689 loss=4.154, nll_loss=2.533, ppl=5.79, wps=457517, ups=1.05, wpb=433709, bsz=16459.2, num_updates=15500, lr=0.000508001, gnorm=0.239, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=15491 epoch 010: 424 / 1689 loss=4.154, nll_loss=2.534, ppl=5.79, wps=457034, ups=1.05, wpb=434945, bsz=16640, num_updates=15600, lr=0.00050637, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=15587 epoch 010: 424 / 1689 loss=4.154, nll_loss=2.534, ppl=5.79, wps=457034, ups=1.05, wpb=434945, bsz=16640, num_updates=15600, lr=0.00050637, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=15587 epoch 010: 424 / 1689 loss=4.154, nll_loss=2.534, ppl=5.79, wps=457034, ups=1.05, wpb=434945, bsz=16640, num_updates=15600, lr=0.00050637, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=15587 epoch 010: 424 / 1689 loss=4.154, nll_loss=2.534, ppl=5.79, wps=457034, ups=1.05, wpb=434945, bsz=16640, num_updates=15600, lr=0.00050637, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=15587 epoch 010: 424 / 1689 loss=4.154, nll_loss=2.534, ppl=5.79, wps=457034, ups=1.05, wpb=434945, bsz=16640, num_updates=15600, lr=0.00050637, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=15587 epoch 010: 424 / 1689 loss=4.154, nll_loss=2.534, ppl=5.79, wps=457034, ups=1.05, wpb=434945, bsz=16640, num_updates=15600, lr=0.00050637, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=15587 epoch 010: 424 / 1689 loss=4.154, nll_loss=2.534, ppl=5.79, wps=457034, ups=1.05, wpb=434945, bsz=16640, num_updates=15600, lr=0.00050637, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=15587 epoch 010: 424 / 1689 loss=4.154, nll_loss=2.534, ppl=5.79, wps=457034, ups=1.05, wpb=434945, bsz=16640, num_updates=15600, lr=0.00050637, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=15587 epoch 010: 424 / 1689 loss=4.154, nll_loss=2.534, ppl=5.79, wps=457034, ups=1.05, wpb=434945, bsz=16640, num_updates=15600, lr=0.00050637, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=15587 epoch 010: 424 / 1689 loss=4.154, nll_loss=2.534, ppl=5.79, wps=457034, ups=1.05, wpb=434945, bsz=16640, num_updates=15600, lr=0.00050637, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=15587 epoch 010: 525 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=454208, ups=1.05, wpb=431939, bsz=16298.7, num_updates=15700, lr=0.000504754, gnorm=0.238, clip=1, loss_scale=1, train_wall=93, gb_free=19.4, wall=15682 epoch 010: 525 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=454208, ups=1.05, wpb=431939, bsz=16298.7, num_updates=15700, lr=0.000504754, gnorm=0.238, clip=1, loss_scale=1, train_wall=93, gb_free=19.4, wall=15682 epoch 010: 525 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=454208, ups=1.05, wpb=431939, bsz=16298.7, num_updates=15700, lr=0.000504754, gnorm=0.238, clip=1, loss_scale=1, train_wall=93, gb_free=19.4, wall=15682 epoch 010: 525 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=454208, ups=1.05, wpb=431939, bsz=16298.7, num_updates=15700, lr=0.000504754, gnorm=0.238, clip=1, loss_scale=1, train_wall=93, gb_free=19.4, wall=15682 epoch 010: 525 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=454208, ups=1.05, wpb=431939, bsz=16298.7, num_updates=15700, lr=0.000504754, gnorm=0.238, clip=1, loss_scale=1, train_wall=93, gb_free=19.4, wall=15682 epoch 010: 525 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=454208, ups=1.05, wpb=431939, bsz=16298.7, num_updates=15700, lr=0.000504754, gnorm=0.238, clip=1, loss_scale=1, train_wall=93, gb_free=19.4, wall=15682 epoch 010: 525 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=454208, ups=1.05, wpb=431939, bsz=16298.7, num_updates=15700, lr=0.000504754, gnorm=0.238, clip=1, loss_scale=1, train_wall=93, gb_free=19.4, wall=15682 epoch 010: 525 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=454208, ups=1.05, wpb=431939, bsz=16298.7, num_updates=15700, lr=0.000504754, gnorm=0.238, clip=1, loss_scale=1, train_wall=93, gb_free=19.4, wall=15682 epoch 010: 525 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=454208, ups=1.05, wpb=431939, bsz=16298.7, num_updates=15700, lr=0.000504754, gnorm=0.238, clip=1, loss_scale=1, train_wall=93, gb_free=19.4, wall=15682 epoch 010: 525 / 1689 loss=4.157, nll_loss=2.537, ppl=5.8, wps=454208, ups=1.05, wpb=431939, bsz=16298.7, num_updates=15700, lr=0.000504754, gnorm=0.238, clip=1, loss_scale=1, train_wall=93, gb_free=19.4, wall=15682 epoch 010: 625 / 1689 loss=4.163, nll_loss=2.543, ppl=5.83, wps=455342, ups=1.05, wpb=432415, bsz=16846.2, num_updates=15800, lr=0.000503155, gnorm=0.235, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=15777 epoch 010: 625 / 1689 loss=4.163, nll_loss=2.543, ppl=5.83, wps=455342, ups=1.05, wpb=432415, bsz=16846.2, num_updates=15800, lr=0.000503155, gnorm=0.235, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=15777 epoch 010: 625 / 1689 loss=4.163, nll_loss=2.543, ppl=5.83, wps=455342, ups=1.05, wpb=432415, bsz=16846.2, num_updates=15800, lr=0.000503155, gnorm=0.235, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=15777 epoch 010: 625 / 1689 loss=4.163, nll_loss=2.543, ppl=5.83, wps=455342, ups=1.05, wpb=432415, bsz=16846.2, num_updates=15800, lr=0.000503155, gnorm=0.235, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=15777 epoch 010: 625 / 1689 loss=4.163, nll_loss=2.543, ppl=5.83, wps=455342, ups=1.05, wpb=432415, bsz=16846.2, num_updates=15800, lr=0.000503155, gnorm=0.235, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=15777 epoch 010: 625 / 1689 loss=4.163, nll_loss=2.543, ppl=5.83, wps=455342, ups=1.05, wpb=432415, bsz=16846.2, num_updates=15800, lr=0.000503155, gnorm=0.235, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=15777 epoch 010: 625 / 1689 loss=4.163, nll_loss=2.543, ppl=5.83, wps=455342, ups=1.05, wpb=432415, bsz=16846.2, num_updates=15800, lr=0.000503155, gnorm=0.235, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=15777 epoch 010: 625 / 1689 loss=4.163, nll_loss=2.543, ppl=5.83, wps=455342, ups=1.05, wpb=432415, bsz=16846.2, num_updates=15800, lr=0.000503155, gnorm=0.235, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=15777 epoch 010: 625 / 1689 loss=4.163, nll_loss=2.543, ppl=5.83, wps=455342, ups=1.05, wpb=432415, bsz=16846.2, num_updates=15800, lr=0.000503155, gnorm=0.235, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=15777 epoch 010: 625 / 1689 loss=4.163, nll_loss=2.543, ppl=5.83, wps=455342, ups=1.05, wpb=432415, bsz=16846.2, num_updates=15800, lr=0.000503155, gnorm=0.235, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=15777 epoch 010: 725 / 1689 loss=4.158, nll_loss=2.538, ppl=5.81, wps=457885, ups=1.06, wpb=433586, bsz=16478.2, num_updates=15900, lr=0.00050157, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=15871 epoch 010: 725 / 1689 loss=4.158, nll_loss=2.538, ppl=5.81, wps=457885, ups=1.06, wpb=433586, bsz=16478.2, num_updates=15900, lr=0.00050157, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=15871 epoch 010: 725 / 1689 loss=4.158, nll_loss=2.538, ppl=5.81, wps=457885, ups=1.06, wpb=433586, bsz=16478.2, num_updates=15900, lr=0.00050157, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=15871 epoch 010: 725 / 1689 loss=4.158, nll_loss=2.538, ppl=5.81, wps=457885, ups=1.06, wpb=433586, bsz=16478.2, num_updates=15900, lr=0.00050157, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=15871 epoch 010: 725 / 1689 loss=4.158, nll_loss=2.538, ppl=5.81, wps=457885, ups=1.06, wpb=433586, bsz=16478.2, num_updates=15900, lr=0.00050157, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=15871 epoch 010: 725 / 1689 loss=4.158, nll_loss=2.538, ppl=5.81, wps=457885, ups=1.06, wpb=433586, bsz=16478.2, num_updates=15900, lr=0.00050157, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=15871 epoch 010: 725 / 1689 loss=4.158, nll_loss=2.538, ppl=5.81, wps=457885, ups=1.06, wpb=433586, bsz=16478.2, num_updates=15900, lr=0.00050157, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=15871 epoch 010: 725 / 1689 loss=4.158, nll_loss=2.538, ppl=5.81, wps=457885, ups=1.06, wpb=433586, bsz=16478.2, num_updates=15900, lr=0.00050157, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=15871 epoch 010: 725 / 1689 loss=4.158, nll_loss=2.538, ppl=5.81, wps=457885, ups=1.06, wpb=433586, bsz=16478.2, num_updates=15900, lr=0.00050157, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=15871 epoch 010: 725 / 1689 loss=4.158, nll_loss=2.538, ppl=5.81, wps=457885, ups=1.06, wpb=433586, bsz=16478.2, num_updates=15900, lr=0.00050157, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=15871 epoch 010: 825 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=456492, ups=1.05, wpb=435202, bsz=16412.7, num_updates=16000, lr=0.0005, gnorm=0.22, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=15967 epoch 010: 825 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=456492, ups=1.05, wpb=435202, bsz=16412.7, num_updates=16000, lr=0.0005, gnorm=0.22, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=15967 epoch 010: 825 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=456492, ups=1.05, wpb=435202, bsz=16412.7, num_updates=16000, lr=0.0005, gnorm=0.22, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=15967 epoch 010: 825 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=456492, ups=1.05, wpb=435202, bsz=16412.7, num_updates=16000, lr=0.0005, gnorm=0.22, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=15967 epoch 010: 825 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=456492, ups=1.05, wpb=435202, bsz=16412.7, num_updates=16000, lr=0.0005, gnorm=0.22, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=15967 epoch 010: 825 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=456492, ups=1.05, wpb=435202, bsz=16412.7, num_updates=16000, lr=0.0005, gnorm=0.22, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=15967 epoch 010: 825 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=456492, ups=1.05, wpb=435202, bsz=16412.7, num_updates=16000, lr=0.0005, gnorm=0.22, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=15967 epoch 010: 825 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=456492, ups=1.05, wpb=435202, bsz=16412.7, num_updates=16000, lr=0.0005, gnorm=0.22, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=15967 epoch 010: 825 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=456492, ups=1.05, wpb=435202, bsz=16412.7, num_updates=16000, lr=0.0005, gnorm=0.22, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=15967 epoch 010: 825 / 1689 loss=4.159, nll_loss=2.539, ppl=5.81, wps=456492, ups=1.05, wpb=435202, bsz=16412.7, num_updates=16000, lr=0.0005, gnorm=0.22, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=15967 begin validation on "valid" subset epoch 010 | valid on 'valid' subset | loss 4.301 | nll_loss 2.68 | ppl 6.41 | wps 0 | wpb 42662 | bsz 2032 | num_updates 16000 | best_loss 4.301 epoch 010 | valid on 'valid' subset | loss 4.301 | nll_loss 2.68 | ppl 6.41 | wps 0 | wpb 42662 | bsz 2032 | num_updates 16000 | best_loss 4.301 epoch 010 | valid on 'valid' subset | loss 4.301 | nll_loss 2.68 | ppl 6.41 | wps 0 | wpb 42662 | bsz 2032 | num_updates 16000 | best_loss 4.301 epoch 010 | valid on 'valid' subset | loss 4.301 | nll_loss 2.68 | ppl 6.41 | wps 0 | wpb 42662 | bsz 2032 | num_updates 16000 | best_loss 4.301 epoch 010 | valid on 'valid' subset | loss 4.301 | nll_loss 2.68 | ppl 6.41 | wps 0 | wpb 42662 | bsz 2032 | num_updates 16000 | best_loss 4.301 epoch 010 | valid on 'valid' subset | loss 4.301 | nll_loss 2.68 | ppl 6.41 | wps 0 | wpb 42662 | bsz 2032 | num_updates 16000 | best_loss 4.301 epoch 010 | valid on 'valid' subset | loss 4.301 | nll_loss 2.68 | ppl 6.41 | wps 0 | wpb 42662 | bsz 2032 | num_updates 16000 | best_loss 4.301 epoch 010 | valid on 'valid' subset | loss 4.301 | nll_loss 2.68 | ppl 6.41 | wps 0 | wpb 42662 | bsz 2032 | num_updates 16000 | best_loss 4.301 epoch 010 | valid on 'valid' subset | loss 4.301 | nll_loss 2.68 | ppl 6.41 | wps 0 | wpb 42662 | bsz 2032 | num_updates 16000 | best_loss 4.301 epoch 010 | valid on 'valid' subset | loss 4.301 | nll_loss 2.68 | ppl 6.41 | wps 0 | wpb 42662 | bsz 2032 | num_updates 16000 | best_loss 4.301 epoch 010: 926 / 1689 loss=4.172, nll_loss=2.555, ppl=5.88, wps=373550, ups=0.86, wpb=436258, bsz=16749.2, num_updates=16100, lr=0.000498445, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=16083 epoch 010: 926 / 1689 loss=4.172, nll_loss=2.555, ppl=5.88, wps=373550, ups=0.86, wpb=436258, bsz=16749.2, num_updates=16100, lr=0.000498445, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=16083 epoch 010: 926 / 1689 loss=4.172, nll_loss=2.555, ppl=5.88, wps=373550, ups=0.86, wpb=436258, bsz=16749.2, num_updates=16100, lr=0.000498445, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=16083 epoch 010: 926 / 1689 loss=4.172, nll_loss=2.555, ppl=5.88, wps=373550, ups=0.86, wpb=436258, bsz=16749.2, num_updates=16100, lr=0.000498445, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=16083 epoch 010: 926 / 1689 loss=4.172, nll_loss=2.555, ppl=5.88, wps=373550, ups=0.86, wpb=436258, bsz=16749.2, num_updates=16100, lr=0.000498445, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=16083 epoch 010: 926 / 1689 loss=4.172, nll_loss=2.555, ppl=5.88, wps=373550, ups=0.86, wpb=436258, bsz=16749.2, num_updates=16100, lr=0.000498445, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=16083 epoch 010: 926 / 1689 loss=4.172, nll_loss=2.555, ppl=5.88, wps=373550, ups=0.86, wpb=436258, bsz=16749.2, num_updates=16100, lr=0.000498445, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=16083 epoch 010: 926 / 1689 loss=4.172, nll_loss=2.555, ppl=5.88, wps=373550, ups=0.86, wpb=436258, bsz=16749.2, num_updates=16100, lr=0.000498445, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=16083 epoch 010: 926 / 1689 loss=4.172, nll_loss=2.555, ppl=5.88, wps=373550, ups=0.86, wpb=436258, bsz=16749.2, num_updates=16100, lr=0.000498445, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=16083 epoch 010: 926 / 1689 loss=4.172, nll_loss=2.555, ppl=5.88, wps=373550, ups=0.86, wpb=436258, bsz=16749.2, num_updates=16100, lr=0.000498445, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=16083 epoch 010: 1026 / 1689 loss=4.159, nll_loss=2.54, ppl=5.82, wps=461571, ups=1.07, wpb=433321, bsz=16603.8, num_updates=16200, lr=0.000496904, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=20, wall=16177 epoch 010: 1026 / 1689 loss=4.159, nll_loss=2.54, ppl=5.82, wps=461571, ups=1.07, wpb=433321, bsz=16603.8, num_updates=16200, lr=0.000496904, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=20, wall=16177 epoch 010: 1026 / 1689 loss=4.159, nll_loss=2.54, ppl=5.82, wps=461571, ups=1.07, wpb=433321, bsz=16603.8, num_updates=16200, lr=0.000496904, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=20, wall=16177 epoch 010: 1026 / 1689 loss=4.159, nll_loss=2.54, ppl=5.82, wps=461571, ups=1.07, wpb=433321, bsz=16603.8, num_updates=16200, lr=0.000496904, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=20, wall=16177 epoch 010: 1026 / 1689 loss=4.159, nll_loss=2.54, ppl=5.82, wps=461571, ups=1.07, wpb=433321, bsz=16603.8, num_updates=16200, lr=0.000496904, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=20, wall=16177 epoch 010: 1026 / 1689 loss=4.159, nll_loss=2.54, ppl=5.82, wps=461571, ups=1.07, wpb=433321, bsz=16603.8, num_updates=16200, lr=0.000496904, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=20, wall=16177 epoch 010: 1026 / 1689 loss=4.159, nll_loss=2.54, ppl=5.82, wps=461571, ups=1.07, wpb=433321, bsz=16603.8, num_updates=16200, lr=0.000496904, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=20, wall=16177 epoch 010: 1026 / 1689 loss=4.159, nll_loss=2.54, ppl=5.82, wps=461571, ups=1.07, wpb=433321, bsz=16603.8, num_updates=16200, lr=0.000496904, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=20, wall=16177 epoch 010: 1026 / 1689 loss=4.159, nll_loss=2.54, ppl=5.82, wps=461571, ups=1.07, wpb=433321, bsz=16603.8, num_updates=16200, lr=0.000496904, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=20, wall=16177 epoch 010: 1026 / 1689 loss=4.159, nll_loss=2.54, ppl=5.82, wps=461571, ups=1.07, wpb=433321, bsz=16603.8, num_updates=16200, lr=0.000496904, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=92, gb_free=20, wall=16177 epoch 010: 1126 / 1689 loss=4.161, nll_loss=2.543, ppl=5.83, wps=459839, ups=1.07, wpb=431622, bsz=16502.6, num_updates=16300, lr=0.000495377, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=16271 epoch 010: 1126 / 1689 loss=4.161, nll_loss=2.543, ppl=5.83, wps=459839, ups=1.07, wpb=431622, bsz=16502.6, num_updates=16300, lr=0.000495377, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=16271 epoch 010: 1126 / 1689 loss=4.161, nll_loss=2.543, ppl=5.83, wps=459839, ups=1.07, wpb=431622, bsz=16502.6, num_updates=16300, lr=0.000495377, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=16271 epoch 010: 1126 / 1689 loss=4.161, nll_loss=2.543, ppl=5.83, wps=459839, ups=1.07, wpb=431622, bsz=16502.6, num_updates=16300, lr=0.000495377, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=16271 epoch 010: 1126 / 1689 loss=4.161, nll_loss=2.543, ppl=5.83, wps=459839, ups=1.07, wpb=431622, bsz=16502.6, num_updates=16300, lr=0.000495377, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=16271 epoch 010: 1126 / 1689 loss=4.161, nll_loss=2.543, ppl=5.83, wps=459839, ups=1.07, wpb=431622, bsz=16502.6, num_updates=16300, lr=0.000495377, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=16271 epoch 010: 1126 / 1689 loss=4.161, nll_loss=2.543, ppl=5.83, wps=459839, ups=1.07, wpb=431622, bsz=16502.6, num_updates=16300, lr=0.000495377, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=16271 epoch 010: 1126 / 1689 loss=4.161, nll_loss=2.543, ppl=5.83, wps=459839, ups=1.07, wpb=431622, bsz=16502.6, num_updates=16300, lr=0.000495377, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=16271 epoch 010: 1126 / 1689 loss=4.161, nll_loss=2.543, ppl=5.83, wps=459839, ups=1.07, wpb=431622, bsz=16502.6, num_updates=16300, lr=0.000495377, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=16271 epoch 010: 1126 / 1689 loss=4.161, nll_loss=2.543, ppl=5.83, wps=459839, ups=1.07, wpb=431622, bsz=16502.6, num_updates=16300, lr=0.000495377, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=16271 epoch 010: 1226 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=461383, ups=1.06, wpb=433434, bsz=16477.4, num_updates=16400, lr=0.000493865, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=16365 epoch 010: 1226 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=461383, ups=1.06, wpb=433434, bsz=16477.4, num_updates=16400, lr=0.000493865, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=16365 epoch 010: 1226 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=461383, ups=1.06, wpb=433434, bsz=16477.4, num_updates=16400, lr=0.000493865, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=16365 epoch 010: 1226 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=461383, ups=1.06, wpb=433434, bsz=16477.4, num_updates=16400, lr=0.000493865, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=16365 epoch 010: 1226 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=461383, ups=1.06, wpb=433434, bsz=16477.4, num_updates=16400, lr=0.000493865, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=16365 epoch 010: 1226 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=461383, ups=1.06, wpb=433434, bsz=16477.4, num_updates=16400, lr=0.000493865, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=16365 epoch 010: 1226 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=461383, ups=1.06, wpb=433434, bsz=16477.4, num_updates=16400, lr=0.000493865, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=16365 epoch 010: 1226 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=461383, ups=1.06, wpb=433434, bsz=16477.4, num_updates=16400, lr=0.000493865, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=16365 epoch 010: 1226 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=461383, ups=1.06, wpb=433434, bsz=16477.4, num_updates=16400, lr=0.000493865, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=16365 epoch 010: 1226 / 1689 loss=4.169, nll_loss=2.551, ppl=5.86, wps=461383, ups=1.06, wpb=433434, bsz=16477.4, num_updates=16400, lr=0.000493865, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=16365 epoch 010: 1326 / 1689 loss=4.159, nll_loss=2.541, ppl=5.82, wps=457576, ups=1.05, wpb=433845, bsz=16723, num_updates=16500, lr=0.000492366, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=16460 epoch 010: 1326 / 1689 loss=4.159, nll_loss=2.541, ppl=5.82, wps=457576, ups=1.05, wpb=433845, bsz=16723, num_updates=16500, lr=0.000492366, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=16460 epoch 010: 1326 / 1689 loss=4.159, nll_loss=2.541, ppl=5.82, wps=457576, ups=1.05, wpb=433845, bsz=16723, num_updates=16500, lr=0.000492366, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=16460 epoch 010: 1326 / 1689 loss=4.159, nll_loss=2.541, ppl=5.82, wps=457576, ups=1.05, wpb=433845, bsz=16723, num_updates=16500, lr=0.000492366, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=16460 epoch 010: 1326 / 1689 loss=4.159, nll_loss=2.541, ppl=5.82, wps=457576, ups=1.05, wpb=433845, bsz=16723, num_updates=16500, lr=0.000492366, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=16460 epoch 010: 1326 / 1689 loss=4.159, nll_loss=2.541, ppl=5.82, wps=457576, ups=1.05, wpb=433845, bsz=16723, num_updates=16500, lr=0.000492366, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=16460 epoch 010: 1326 / 1689 loss=4.159, nll_loss=2.541, ppl=5.82, wps=457576, ups=1.05, wpb=433845, bsz=16723, num_updates=16500, lr=0.000492366, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=16460 epoch 010: 1326 / 1689 loss=4.159, nll_loss=2.541, ppl=5.82, wps=457576, ups=1.05, wpb=433845, bsz=16723, num_updates=16500, lr=0.000492366, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=16460 epoch 010: 1326 / 1689 loss=4.159, nll_loss=2.541, ppl=5.82, wps=457576, ups=1.05, wpb=433845, bsz=16723, num_updates=16500, lr=0.000492366, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=16460 epoch 010: 1326 / 1689 loss=4.159, nll_loss=2.541, ppl=5.82, wps=457576, ups=1.05, wpb=433845, bsz=16723, num_updates=16500, lr=0.000492366, gnorm=0.234, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=16460 epoch 010: 1426 / 1689 loss=4.181, nll_loss=2.566, ppl=5.92, wps=458745, ups=1.06, wpb=433874, bsz=16512.6, num_updates=16600, lr=0.000490881, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=20.5, wall=16555 epoch 010: 1426 / 1689 loss=4.181, nll_loss=2.566, ppl=5.92, wps=458745, ups=1.06, wpb=433874, bsz=16512.6, num_updates=16600, lr=0.000490881, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=20.5, wall=16555 epoch 010: 1426 / 1689 loss=4.181, nll_loss=2.566, ppl=5.92, wps=458745, ups=1.06, wpb=433874, bsz=16512.6, num_updates=16600, lr=0.000490881, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=20.5, wall=16555 epoch 010: 1426 / 1689 loss=4.181, nll_loss=2.566, ppl=5.92, wps=458745, ups=1.06, wpb=433874, bsz=16512.6, num_updates=16600, lr=0.000490881, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=20.5, wall=16555 epoch 010: 1426 / 1689 loss=4.181, nll_loss=2.566, ppl=5.92, wps=458745, ups=1.06, wpb=433874, bsz=16512.6, num_updates=16600, lr=0.000490881, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=20.5, wall=16555 epoch 010: 1426 / 1689 loss=4.181, nll_loss=2.566, ppl=5.92, wps=458745, ups=1.06, wpb=433874, bsz=16512.6, num_updates=16600, lr=0.000490881, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=20.5, wall=16555 epoch 010: 1426 / 1689 loss=4.181, nll_loss=2.566, ppl=5.92, wps=458745, ups=1.06, wpb=433874, bsz=16512.6, num_updates=16600, lr=0.000490881, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=20.5, wall=16555 epoch 010: 1426 / 1689 loss=4.181, nll_loss=2.566, ppl=5.92, wps=458745, ups=1.06, wpb=433874, bsz=16512.6, num_updates=16600, lr=0.000490881, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=20.5, wall=16555 epoch 010: 1426 / 1689 loss=4.181, nll_loss=2.566, ppl=5.92, wps=458745, ups=1.06, wpb=433874, bsz=16512.6, num_updates=16600, lr=0.000490881, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=20.5, wall=16555 epoch 010: 1426 / 1689 loss=4.181, nll_loss=2.566, ppl=5.92, wps=458745, ups=1.06, wpb=433874, bsz=16512.6, num_updates=16600, lr=0.000490881, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=20.5, wall=16555 epoch 010: 1526 / 1689 loss=4.173, nll_loss=2.556, ppl=5.88, wps=457096, ups=1.06, wpb=432572, bsz=16288.2, num_updates=16700, lr=0.000489409, gnorm=0.226, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=16649 epoch 010: 1526 / 1689 loss=4.173, nll_loss=2.556, ppl=5.88, wps=457096, ups=1.06, wpb=432572, bsz=16288.2, num_updates=16700, lr=0.000489409, gnorm=0.226, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=16649 epoch 010: 1526 / 1689 loss=4.173, nll_loss=2.556, ppl=5.88, wps=457096, ups=1.06, wpb=432572, bsz=16288.2, num_updates=16700, lr=0.000489409, gnorm=0.226, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=16649 epoch 010: 1526 / 1689 loss=4.173, nll_loss=2.556, ppl=5.88, wps=457096, ups=1.06, wpb=432572, bsz=16288.2, num_updates=16700, lr=0.000489409, gnorm=0.226, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=16649 epoch 010: 1526 / 1689 loss=4.173, nll_loss=2.556, ppl=5.88, wps=457096, ups=1.06, wpb=432572, bsz=16288.2, num_updates=16700, lr=0.000489409, gnorm=0.226, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=16649 epoch 010: 1526 / 1689 loss=4.173, nll_loss=2.556, ppl=5.88, wps=457096, ups=1.06, wpb=432572, bsz=16288.2, num_updates=16700, lr=0.000489409, gnorm=0.226, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=16649 epoch 010: 1526 / 1689 loss=4.173, nll_loss=2.556, ppl=5.88, wps=457096, ups=1.06, wpb=432572, bsz=16288.2, num_updates=16700, lr=0.000489409, gnorm=0.226, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=16649 epoch 010: 1526 / 1689 loss=4.173, nll_loss=2.556, ppl=5.88, wps=457096, ups=1.06, wpb=432572, bsz=16288.2, num_updates=16700, lr=0.000489409, gnorm=0.226, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=16649 epoch 010: 1526 / 1689 loss=4.173, nll_loss=2.556, ppl=5.88, wps=457096, ups=1.06, wpb=432572, bsz=16288.2, num_updates=16700, lr=0.000489409, gnorm=0.226, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=16649 epoch 010: 1526 / 1689 loss=4.173, nll_loss=2.556, ppl=5.88, wps=457096, ups=1.06, wpb=432572, bsz=16288.2, num_updates=16700, lr=0.000489409, gnorm=0.226, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=16649 epoch 010: 1626 / 1689 loss=4.175, nll_loss=2.559, ppl=5.89, wps=455939, ups=1.05, wpb=434826, bsz=16347, num_updates=16800, lr=0.00048795, gnorm=0.236, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=16745 epoch 010: 1626 / 1689 loss=4.175, nll_loss=2.559, ppl=5.89, wps=455939, ups=1.05, wpb=434826, bsz=16347, num_updates=16800, lr=0.00048795, gnorm=0.236, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=16745 epoch 010: 1626 / 1689 loss=4.175, nll_loss=2.559, ppl=5.89, wps=455939, ups=1.05, wpb=434826, bsz=16347, num_updates=16800, lr=0.00048795, gnorm=0.236, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=16745 epoch 010: 1626 / 1689 loss=4.175, nll_loss=2.559, ppl=5.89, wps=455939, ups=1.05, wpb=434826, bsz=16347, num_updates=16800, lr=0.00048795, gnorm=0.236, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=16745 epoch 010: 1626 / 1689 loss=4.175, nll_loss=2.559, ppl=5.89, wps=455939, ups=1.05, wpb=434826, bsz=16347, num_updates=16800, lr=0.00048795, gnorm=0.236, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=16745 epoch 010: 1626 / 1689 loss=4.175, nll_loss=2.559, ppl=5.89, wps=455939, ups=1.05, wpb=434826, bsz=16347, num_updates=16800, lr=0.00048795, gnorm=0.236, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=16745 epoch 010: 1626 / 1689 loss=4.175, nll_loss=2.559, ppl=5.89, wps=455939, ups=1.05, wpb=434826, bsz=16347, num_updates=16800, lr=0.00048795, gnorm=0.236, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=16745 epoch 010: 1626 / 1689 loss=4.175, nll_loss=2.559, ppl=5.89, wps=455939, ups=1.05, wpb=434826, bsz=16347, num_updates=16800, lr=0.00048795, gnorm=0.236, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=16745 epoch 010: 1626 / 1689 loss=4.175, nll_loss=2.559, ppl=5.89, wps=455939, ups=1.05, wpb=434826, bsz=16347, num_updates=16800, lr=0.00048795, gnorm=0.236, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=16745 epoch 010: 1626 / 1689 loss=4.175, nll_loss=2.559, ppl=5.89, wps=455939, ups=1.05, wpb=434826, bsz=16347, num_updates=16800, lr=0.00048795, gnorm=0.236, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=16745 end of epoch 10 (average epoch stats below) epoch 010 | loss 4.162 | nll_loss 2.543 | ppl 5.83 | wps 451729 | ups 1.04 | wpb 433534 | bsz 16508.4 | num_updates 16863 | lr 0.000487038 | gnorm 0.228 | clip 0.1 | loss_scale 1 | train_wall 1568 | gb_free 21.2 | wall 16802 epoch 010 | loss 4.162 | nll_loss 2.543 | ppl 5.83 | wps 451729 | ups 1.04 | wpb 433534 | bsz 16508.4 | num_updates 16863 | lr 0.000487038 | gnorm 0.228 | clip 0.1 | loss_scale 1 | train_wall 1568 | gb_free 21.2 | wall 16802 epoch 010 | loss 4.162 | nll_loss 2.543 | ppl 5.83 | wps 451729 | ups 1.04 | wpb 433534 | bsz 16508.4 | num_updates 16863 | lr 0.000487038 | gnorm 0.228 | clip 0.1 | loss_scale 1 | train_wall 1568 | gb_free 21.2 | wall 16802 epoch 010 | loss 4.162 | nll_loss 2.543 | ppl 5.83 | wps 451729 | ups 1.04 | wpb 433534 | bsz 16508.4 | num_updates 16863 | lr 0.000487038 | gnorm 0.228 | clip 0.1 | loss_scale 1 | train_wall 1568 | gb_free 21.2 | wall 16802 epoch 010 | loss 4.162 | nll_loss 2.543 | ppl 5.83 | wps 451729 | ups 1.04 | wpb 433534 | bsz 16508.4 | num_updates 16863 | lr 0.000487038 | gnorm 0.228 | clip 0.1 | loss_scale 1 | train_wall 1568 | gb_free 21.2 | wall 16802 epoch 010 | loss 4.162 | nll_loss 2.543 | ppl 5.83 | wps 451729 | ups 1.04 | wpb 433534 | bsz 16508.4 | num_updates 16863 | lr 0.000487038 | gnorm 0.228 | clip 0.1 | loss_scale 1 | train_wall 1568 | gb_free 21.2 | wall 16802 epoch 010 | loss 4.162 | nll_loss 2.543 | ppl 5.83 | wps 451729 | ups 1.04 | wpb 433534 | bsz 16508.4 | num_updates 16863 | lr 0.000487038 | gnorm 0.228 | clip 0.1 | loss_scale 1 | train_wall 1568 | gb_free 21.2 | wall 16802 epoch 010 | loss 4.162 | nll_loss 2.543 | ppl 5.83 | wps 451729 | ups 1.04 | wpb 433534 | bsz 16508.4 | num_updates 16863 | lr 0.000487038 | gnorm 0.228 | clip 0.1 | loss_scale 1 | train_wall 1568 | gb_free 21.2 | wall 16802 epoch 010 | loss 4.162 | nll_loss 2.543 | ppl 5.83 | wps 451729 | ups 1.04 | wpb 433534 | bsz 16508.4 | num_updates 16863 | lr 0.000487038 | gnorm 0.228 | clip 0.1 | loss_scale 1 | train_wall 1568 | gb_free 21.2 | wall 16802 epoch 010 | loss 4.162 | nll_loss 2.543 | ppl 5.83 | wps 451729 | ups 1.04 | wpb 433534 | bsz 16508.4 | num_updates 16863 | lr 0.000487038 | gnorm 0.228 | clip 0.1 | loss_scale 1 | train_wall 1568 | gb_free 21.2 | wall 16802 Start iterating over samples epoch 011: 37 / 1689 loss=4.155, nll_loss=2.535, ppl=5.8, wps=464166, ups=1.07, wpb=432577, bsz=16000.9, num_updates=16900, lr=0.000486504, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=16838 epoch 011: 37 / 1689 loss=4.155, nll_loss=2.535, ppl=5.8, wps=464166, ups=1.07, wpb=432577, bsz=16000.9, num_updates=16900, lr=0.000486504, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=16838 epoch 011: 37 / 1689 loss=4.155, nll_loss=2.535, ppl=5.8, wps=464166, ups=1.07, wpb=432577, bsz=16000.9, num_updates=16900, lr=0.000486504, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=16838 epoch 011: 37 / 1689 loss=4.155, nll_loss=2.535, ppl=5.8, wps=464166, ups=1.07, wpb=432577, bsz=16000.9, num_updates=16900, lr=0.000486504, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=16838 epoch 011: 37 / 1689 loss=4.155, nll_loss=2.535, ppl=5.8, wps=464166, ups=1.07, wpb=432577, bsz=16000.9, num_updates=16900, lr=0.000486504, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=16838 epoch 011: 37 / 1689 loss=4.155, nll_loss=2.535, ppl=5.8, wps=464166, ups=1.07, wpb=432577, bsz=16000.9, num_updates=16900, lr=0.000486504, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=16838 epoch 011: 37 / 1689 loss=4.155, nll_loss=2.535, ppl=5.8, wps=464166, ups=1.07, wpb=432577, bsz=16000.9, num_updates=16900, lr=0.000486504, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=16838 epoch 011: 37 / 1689 loss=4.155, nll_loss=2.535, ppl=5.8, wps=464166, ups=1.07, wpb=432577, bsz=16000.9, num_updates=16900, lr=0.000486504, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=16838 epoch 011: 37 / 1689 loss=4.155, nll_loss=2.535, ppl=5.8, wps=464166, ups=1.07, wpb=432577, bsz=16000.9, num_updates=16900, lr=0.000486504, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=16838 epoch 011: 37 / 1689 loss=4.155, nll_loss=2.535, ppl=5.8, wps=464166, ups=1.07, wpb=432577, bsz=16000.9, num_updates=16900, lr=0.000486504, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=16838 epoch 011: 37 / 1689 loss=4.155, nll_loss=2.535, ppl=5.8, wps=464166, ups=1.07, wpb=432577, bsz=16000.9, num_updates=16900, lr=0.000486504, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.3, wall=16838 epoch 011: 137 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=462290, ups=1.07, wpb=434064, bsz=16778.5, num_updates=17000, lr=0.000485071, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=16932 epoch 011: 137 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=462290, ups=1.07, wpb=434064, bsz=16778.5, num_updates=17000, lr=0.000485071, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=16932 epoch 011: 137 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=462290, ups=1.07, wpb=434064, bsz=16778.5, num_updates=17000, lr=0.000485071, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=16932 epoch 011: 137 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=462290, ups=1.07, wpb=434064, bsz=16778.5, num_updates=17000, lr=0.000485071, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=16932 epoch 011: 137 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=462290, ups=1.07, wpb=434064, bsz=16778.5, num_updates=17000, lr=0.000485071, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=16932 epoch 011: 137 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=462290, ups=1.07, wpb=434064, bsz=16778.5, num_updates=17000, lr=0.000485071, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=16932 epoch 011: 137 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=462290, ups=1.07, wpb=434064, bsz=16778.5, num_updates=17000, lr=0.000485071, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=16932 epoch 011: 137 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=462290, ups=1.07, wpb=434064, bsz=16778.5, num_updates=17000, lr=0.000485071, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=16932 epoch 011: 137 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=462290, ups=1.07, wpb=434064, bsz=16778.5, num_updates=17000, lr=0.000485071, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=16932 epoch 011: 137 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=462290, ups=1.07, wpb=434064, bsz=16778.5, num_updates=17000, lr=0.000485071, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=16932 epoch 011: 137 / 1689 loss=4.116, nll_loss=2.491, ppl=5.62, wps=462290, ups=1.07, wpb=434064, bsz=16778.5, num_updates=17000, lr=0.000485071, gnorm=0.22, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=16932 begin validation on "valid" subset epoch 011 | valid on 'valid' subset | loss 4.304 | nll_loss 2.689 | ppl 6.45 | wps 0 | wpb 42662 | bsz 2032 | num_updates 17000 | best_loss 4.301 epoch 011 | valid on 'valid' subset | loss 4.304 | nll_loss 2.689 | ppl 6.45 | wps 0 | wpb 42662 | bsz 2032 | num_updates 17000 | best_loss 4.301 epoch 011 | valid on 'valid' subset | loss 4.304 | nll_loss 2.689 | ppl 6.45 | wps 0 | wpb 42662 | bsz 2032 | num_updates 17000 | best_loss 4.301 epoch 011 | valid on 'valid' subset | loss 4.304 | nll_loss 2.689 | ppl 6.45 | wps 0 | wpb 42662 | bsz 2032 | num_updates 17000 | best_loss 4.301 epoch 011 | valid on 'valid' subset | loss 4.304 | nll_loss 2.689 | ppl 6.45 | wps 0 | wpb 42662 | bsz 2032 | num_updates 17000 | best_loss 4.301 epoch 011 | valid on 'valid' subset | loss 4.304 | nll_loss 2.689 | ppl 6.45 | wps 0 | wpb 42662 | bsz 2032 | num_updates 17000 | best_loss 4.301 epoch 011 | valid on 'valid' subset | loss 4.304 | nll_loss 2.689 | ppl 6.45 | wps 0 | wpb 42662 | bsz 2032 | num_updates 17000 | best_loss 4.301 epoch 011 | valid on 'valid' subset | loss 4.304 | nll_loss 2.689 | ppl 6.45 | wps 0 | wpb 42662 | bsz 2032 | num_updates 17000 | best_loss 4.301 epoch 011 | valid on 'valid' subset | loss 4.304 | nll_loss 2.689 | ppl 6.45 | wps 0 | wpb 42662 | bsz 2032 | num_updates 17000 | best_loss 4.301 epoch 011 | valid on 'valid' subset | loss 4.304 | nll_loss 2.689 | ppl 6.45 | wps 0 | wpb 42662 | bsz 2032 | num_updates 17000 | best_loss 4.301 epoch 011 | valid on 'valid' subset | loss 4.304 | nll_loss 2.689 | ppl 6.45 | wps 0 | wpb 42662 | bsz 2032 | num_updates 17000 | best_loss 4.301 epoch 011: 237 / 1689 loss=4.123, nll_loss=2.499, ppl=5.65, wps=407369, ups=0.95, wpb=430908, bsz=16463.3, num_updates=17100, lr=0.000483651, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=17037 epoch 011: 237 / 1689 loss=4.123, nll_loss=2.499, ppl=5.65, wps=407369, ups=0.95, wpb=430908, bsz=16463.3, num_updates=17100, lr=0.000483651, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=17037 epoch 011: 237 / 1689 loss=4.123, nll_loss=2.499, ppl=5.65, wps=407369, ups=0.95, wpb=430908, bsz=16463.3, num_updates=17100, lr=0.000483651, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=17037 epoch 011: 237 / 1689 loss=4.123, nll_loss=2.499, ppl=5.65, wps=407369, ups=0.95, wpb=430908, bsz=16463.3, num_updates=17100, lr=0.000483651, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=17037 epoch 011: 237 / 1689 loss=4.123, nll_loss=2.499, ppl=5.65, wps=407369, ups=0.95, wpb=430908, bsz=16463.3, num_updates=17100, lr=0.000483651, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=17037 epoch 011: 237 / 1689 loss=4.123, nll_loss=2.499, ppl=5.65, wps=407369, ups=0.95, wpb=430908, bsz=16463.3, num_updates=17100, lr=0.000483651, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=17037 epoch 011: 237 / 1689 loss=4.123, nll_loss=2.499, ppl=5.65, wps=407369, ups=0.95, wpb=430908, bsz=16463.3, num_updates=17100, lr=0.000483651, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=17037 epoch 011: 237 / 1689 loss=4.123, nll_loss=2.499, ppl=5.65, wps=407369, ups=0.95, wpb=430908, bsz=16463.3, num_updates=17100, lr=0.000483651, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=17037 epoch 011: 237 / 1689 loss=4.123, nll_loss=2.499, ppl=5.65, wps=407369, ups=0.95, wpb=430908, bsz=16463.3, num_updates=17100, lr=0.000483651, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=17037 epoch 011: 237 / 1689 loss=4.123, nll_loss=2.499, ppl=5.65, wps=407369, ups=0.95, wpb=430908, bsz=16463.3, num_updates=17100, lr=0.000483651, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=17037 epoch 011: 237 / 1689 loss=4.123, nll_loss=2.499, ppl=5.65, wps=407369, ups=0.95, wpb=430908, bsz=16463.3, num_updates=17100, lr=0.000483651, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=20.2, wall=17037 epoch 011: 337 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=462882, ups=1.07, wpb=433334, bsz=16961.5, num_updates=17200, lr=0.000482243, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=16.2, wall=17131 epoch 011: 337 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=462882, ups=1.07, wpb=433334, bsz=16961.5, num_updates=17200, lr=0.000482243, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=16.2, wall=17131 epoch 011: 337 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=462882, ups=1.07, wpb=433334, bsz=16961.5, num_updates=17200, lr=0.000482243, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=16.2, wall=17131 epoch 011: 337 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=462882, ups=1.07, wpb=433334, bsz=16961.5, num_updates=17200, lr=0.000482243, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=16.2, wall=17131 epoch 011: 337 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=462882, ups=1.07, wpb=433334, bsz=16961.5, num_updates=17200, lr=0.000482243, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=16.2, wall=17131 epoch 011: 337 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=462882, ups=1.07, wpb=433334, bsz=16961.5, num_updates=17200, lr=0.000482243, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=16.2, wall=17131 epoch 011: 337 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=462882, ups=1.07, wpb=433334, bsz=16961.5, num_updates=17200, lr=0.000482243, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=16.2, wall=17131 epoch 011: 337 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=462882, ups=1.07, wpb=433334, bsz=16961.5, num_updates=17200, lr=0.000482243, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=16.2, wall=17131 epoch 011: 337 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=462882, ups=1.07, wpb=433334, bsz=16961.5, num_updates=17200, lr=0.000482243, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=16.2, wall=17131 epoch 011: 337 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=462882, ups=1.07, wpb=433334, bsz=16961.5, num_updates=17200, lr=0.000482243, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=16.2, wall=17131 epoch 011: 337 / 1689 loss=4.13, nll_loss=2.506, ppl=5.68, wps=462882, ups=1.07, wpb=433334, bsz=16961.5, num_updates=17200, lr=0.000482243, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=16.2, wall=17131 epoch 011: 438 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=456309, ups=1.05, wpb=434771, bsz=16272.7, num_updates=17300, lr=0.000480847, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=17226 epoch 011: 438 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=456309, ups=1.05, wpb=434771, bsz=16272.7, num_updates=17300, lr=0.000480847, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=17226 epoch 011: 438 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=456309, ups=1.05, wpb=434771, bsz=16272.7, num_updates=17300, lr=0.000480847, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=17226 epoch 011: 438 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=456309, ups=1.05, wpb=434771, bsz=16272.7, num_updates=17300, lr=0.000480847, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=17226 epoch 011: 438 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=456309, ups=1.05, wpb=434771, bsz=16272.7, num_updates=17300, lr=0.000480847, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=17226 epoch 011: 438 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=456309, ups=1.05, wpb=434771, bsz=16272.7, num_updates=17300, lr=0.000480847, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=17226 epoch 011: 438 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=456309, ups=1.05, wpb=434771, bsz=16272.7, num_updates=17300, lr=0.000480847, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=17226 epoch 011: 438 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=456309, ups=1.05, wpb=434771, bsz=16272.7, num_updates=17300, lr=0.000480847, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=17226 epoch 011: 438 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=456309, ups=1.05, wpb=434771, bsz=16272.7, num_updates=17300, lr=0.000480847, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=17226 epoch 011: 438 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=456309, ups=1.05, wpb=434771, bsz=16272.7, num_updates=17300, lr=0.000480847, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=17226 epoch 011: 438 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=456309, ups=1.05, wpb=434771, bsz=16272.7, num_updates=17300, lr=0.000480847, gnorm=0.232, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=17226 epoch 011: 538 / 1689 loss=4.142, nll_loss=2.52, ppl=5.74, wps=461041, ups=1.06, wpb=436126, bsz=16657.2, num_updates=17400, lr=0.000479463, gnorm=0.224, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=17321 epoch 011: 538 / 1689 loss=4.142, nll_loss=2.52, ppl=5.74, wps=461041, ups=1.06, wpb=436126, bsz=16657.2, num_updates=17400, lr=0.000479463, gnorm=0.224, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=17321 epoch 011: 538 / 1689 loss=4.142, nll_loss=2.52, ppl=5.74, wps=461041, ups=1.06, wpb=436126, bsz=16657.2, num_updates=17400, lr=0.000479463, gnorm=0.224, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=17321 epoch 011: 538 / 1689 loss=4.142, nll_loss=2.52, ppl=5.74, wps=461041, ups=1.06, wpb=436126, bsz=16657.2, num_updates=17400, lr=0.000479463, gnorm=0.224, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=17321 epoch 011: 538 / 1689 loss=4.142, nll_loss=2.52, ppl=5.74, wps=461041, ups=1.06, wpb=436126, bsz=16657.2, num_updates=17400, lr=0.000479463, gnorm=0.224, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=17321 epoch 011: 538 / 1689 loss=4.142, nll_loss=2.52, ppl=5.74, wps=461041, ups=1.06, wpb=436126, bsz=16657.2, num_updates=17400, lr=0.000479463, gnorm=0.224, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=17321 epoch 011: 538 / 1689 loss=4.142, nll_loss=2.52, ppl=5.74, wps=461041, ups=1.06, wpb=436126, bsz=16657.2, num_updates=17400, lr=0.000479463, gnorm=0.224, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=17321 epoch 011: 538 / 1689 loss=4.142, nll_loss=2.52, ppl=5.74, wps=461041, ups=1.06, wpb=436126, bsz=16657.2, num_updates=17400, lr=0.000479463, gnorm=0.224, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=17321 epoch 011: 538 / 1689 loss=4.142, nll_loss=2.52, ppl=5.74, wps=461041, ups=1.06, wpb=436126, bsz=16657.2, num_updates=17400, lr=0.000479463, gnorm=0.224, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=17321 epoch 011: 538 / 1689 loss=4.142, nll_loss=2.52, ppl=5.74, wps=461041, ups=1.06, wpb=436126, bsz=16657.2, num_updates=17400, lr=0.000479463, gnorm=0.224, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=17321 epoch 011: 538 / 1689 loss=4.142, nll_loss=2.52, ppl=5.74, wps=461041, ups=1.06, wpb=436126, bsz=16657.2, num_updates=17400, lr=0.000479463, gnorm=0.224, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=17321 epoch 011: 638 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=463793, ups=1.06, wpb=435633, bsz=16258.3, num_updates=17500, lr=0.000478091, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=17415 epoch 011: 638 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=463793, ups=1.06, wpb=435633, bsz=16258.3, num_updates=17500, lr=0.000478091, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=17415 epoch 011: 638 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=463793, ups=1.06, wpb=435633, bsz=16258.3, num_updates=17500, lr=0.000478091, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=17415 epoch 011: 638 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=463793, ups=1.06, wpb=435633, bsz=16258.3, num_updates=17500, lr=0.000478091, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=17415 epoch 011: 638 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=463793, ups=1.06, wpb=435633, bsz=16258.3, num_updates=17500, lr=0.000478091, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=17415 epoch 011: 638 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=463793, ups=1.06, wpb=435633, bsz=16258.3, num_updates=17500, lr=0.000478091, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=17415 epoch 011: 638 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=463793, ups=1.06, wpb=435633, bsz=16258.3, num_updates=17500, lr=0.000478091, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=17415 epoch 011: 638 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=463793, ups=1.06, wpb=435633, bsz=16258.3, num_updates=17500, lr=0.000478091, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=17415 epoch 011: 638 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=463793, ups=1.06, wpb=435633, bsz=16258.3, num_updates=17500, lr=0.000478091, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=17415 epoch 011: 638 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=463793, ups=1.06, wpb=435633, bsz=16258.3, num_updates=17500, lr=0.000478091, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=17415 epoch 011: 638 / 1689 loss=4.14, nll_loss=2.518, ppl=5.73, wps=463793, ups=1.06, wpb=435633, bsz=16258.3, num_updates=17500, lr=0.000478091, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=17415 epoch 011: 738 / 1689 loss=4.149, nll_loss=2.529, ppl=5.77, wps=459017, ups=1.06, wpb=433250, bsz=16437.8, num_updates=17600, lr=0.000476731, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=17509 epoch 011: 738 / 1689 loss=4.149, nll_loss=2.529, ppl=5.77, wps=459017, ups=1.06, wpb=433250, bsz=16437.8, num_updates=17600, lr=0.000476731, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=17509 epoch 011: 738 / 1689 loss=4.149, nll_loss=2.529, ppl=5.77, wps=459017, ups=1.06, wpb=433250, bsz=16437.8, num_updates=17600, lr=0.000476731, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=17509 epoch 011: 738 / 1689 loss=4.149, nll_loss=2.529, ppl=5.77, wps=459017, ups=1.06, wpb=433250, bsz=16437.8, num_updates=17600, lr=0.000476731, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=17509 epoch 011: 738 / 1689 loss=4.149, nll_loss=2.529, ppl=5.77, wps=459017, ups=1.06, wpb=433250, bsz=16437.8, num_updates=17600, lr=0.000476731, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=17509 epoch 011: 738 / 1689 loss=4.149, nll_loss=2.529, ppl=5.77, wps=459017, ups=1.06, wpb=433250, bsz=16437.8, num_updates=17600, lr=0.000476731, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=17509 epoch 011: 738 / 1689 loss=4.149, nll_loss=2.529, ppl=5.77, wps=459017, ups=1.06, wpb=433250, bsz=16437.8, num_updates=17600, lr=0.000476731, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=17509 epoch 011: 738 / 1689 loss=4.149, nll_loss=2.529, ppl=5.77, wps=459017, ups=1.06, wpb=433250, bsz=16437.8, num_updates=17600, lr=0.000476731, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=17509 epoch 011: 738 / 1689 loss=4.149, nll_loss=2.529, ppl=5.77, wps=459017, ups=1.06, wpb=433250, bsz=16437.8, num_updates=17600, lr=0.000476731, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=17509 epoch 011: 738 / 1689 loss=4.149, nll_loss=2.529, ppl=5.77, wps=459017, ups=1.06, wpb=433250, bsz=16437.8, num_updates=17600, lr=0.000476731, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=17509 epoch 011: 738 / 1689 loss=4.149, nll_loss=2.529, ppl=5.77, wps=459017, ups=1.06, wpb=433250, bsz=16437.8, num_updates=17600, lr=0.000476731, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=17509 epoch 011: 838 / 1689 loss=4.138, nll_loss=2.517, ppl=5.72, wps=457688, ups=1.05, wpb=434853, bsz=16651.3, num_updates=17700, lr=0.000475383, gnorm=0.216, clip=0, loss_scale=1, train_wall=94, gb_free=17.4, wall=17604 epoch 011: 838 / 1689 loss=4.138, nll_loss=2.517, ppl=5.72, wps=457688, ups=1.05, wpb=434853, bsz=16651.3, num_updates=17700, lr=0.000475383, gnorm=0.216, clip=0, loss_scale=1, train_wall=94, gb_free=17.4, wall=17604 epoch 011: 838 / 1689 loss=4.138, nll_loss=2.517, ppl=5.72, wps=457688, ups=1.05, wpb=434853, bsz=16651.3, num_updates=17700, lr=0.000475383, gnorm=0.216, clip=0, loss_scale=1, train_wall=94, gb_free=17.4, wall=17604 epoch 011: 838 / 1689 loss=4.138, nll_loss=2.517, ppl=5.72, wps=457688, ups=1.05, wpb=434853, bsz=16651.3, num_updates=17700, lr=0.000475383, gnorm=0.216, clip=0, loss_scale=1, train_wall=94, gb_free=17.4, wall=17604 epoch 011: 838 / 1689 loss=4.138, nll_loss=2.517, ppl=5.72, wps=457688, ups=1.05, wpb=434853, bsz=16651.3, num_updates=17700, lr=0.000475383, gnorm=0.216, clip=0, loss_scale=1, train_wall=94, gb_free=17.4, wall=17604 epoch 011: 838 / 1689 loss=4.138, nll_loss=2.517, ppl=5.72, wps=457688, ups=1.05, wpb=434853, bsz=16651.3, num_updates=17700, lr=0.000475383, gnorm=0.216, clip=0, loss_scale=1, train_wall=94, gb_free=17.4, wall=17604 epoch 011: 838 / 1689 loss=4.138, nll_loss=2.517, ppl=5.72, wps=457688, ups=1.05, wpb=434853, bsz=16651.3, num_updates=17700, lr=0.000475383, gnorm=0.216, clip=0, loss_scale=1, train_wall=94, gb_free=17.4, wall=17604 epoch 011: 838 / 1689 loss=4.138, nll_loss=2.517, ppl=5.72, wps=457688, ups=1.05, wpb=434853, bsz=16651.3, num_updates=17700, lr=0.000475383, gnorm=0.216, clip=0, loss_scale=1, train_wall=94, gb_free=17.4, wall=17604 epoch 011: 838 / 1689 loss=4.138, nll_loss=2.517, ppl=5.72, wps=457688, ups=1.05, wpb=434853, bsz=16651.3, num_updates=17700, lr=0.000475383, gnorm=0.216, clip=0, loss_scale=1, train_wall=94, gb_free=17.4, wall=17604 epoch 011: 838 / 1689 loss=4.138, nll_loss=2.517, ppl=5.72, wps=457688, ups=1.05, wpb=434853, bsz=16651.3, num_updates=17700, lr=0.000475383, gnorm=0.216, clip=0, loss_scale=1, train_wall=94, gb_free=17.4, wall=17604 epoch 011: 838 / 1689 loss=4.138, nll_loss=2.517, ppl=5.72, wps=457688, ups=1.05, wpb=434853, bsz=16651.3, num_updates=17700, lr=0.000475383, gnorm=0.216, clip=0, loss_scale=1, train_wall=94, gb_free=17.4, wall=17604 epoch 011: 939 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=449999, ups=1.04, wpb=430788, bsz=16416.8, num_updates=17800, lr=0.000474045, gnorm=0.225, clip=0, loss_scale=1, train_wall=94, gb_free=21.1, wall=17700 epoch 011: 939 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=449999, ups=1.04, wpb=430788, bsz=16416.8, num_updates=17800, lr=0.000474045, gnorm=0.225, clip=0, loss_scale=1, train_wall=94, gb_free=21.1, wall=17700 epoch 011: 939 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=449999, ups=1.04, wpb=430788, bsz=16416.8, num_updates=17800, lr=0.000474045, gnorm=0.225, clip=0, loss_scale=1, train_wall=94, gb_free=21.1, wall=17700 epoch 011: 939 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=449999, ups=1.04, wpb=430788, bsz=16416.8, num_updates=17800, lr=0.000474045, gnorm=0.225, clip=0, loss_scale=1, train_wall=94, gb_free=21.1, wall=17700 epoch 011: 939 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=449999, ups=1.04, wpb=430788, bsz=16416.8, num_updates=17800, lr=0.000474045, gnorm=0.225, clip=0, loss_scale=1, train_wall=94, gb_free=21.1, wall=17700 epoch 011: 939 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=449999, ups=1.04, wpb=430788, bsz=16416.8, num_updates=17800, lr=0.000474045, gnorm=0.225, clip=0, loss_scale=1, train_wall=94, gb_free=21.1, wall=17700 epoch 011: 939 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=449999, ups=1.04, wpb=430788, bsz=16416.8, num_updates=17800, lr=0.000474045, gnorm=0.225, clip=0, loss_scale=1, train_wall=94, gb_free=21.1, wall=17700 epoch 011: 939 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=449999, ups=1.04, wpb=430788, bsz=16416.8, num_updates=17800, lr=0.000474045, gnorm=0.225, clip=0, loss_scale=1, train_wall=94, gb_free=21.1, wall=17700 epoch 011: 939 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=449999, ups=1.04, wpb=430788, bsz=16416.8, num_updates=17800, lr=0.000474045, gnorm=0.225, clip=0, loss_scale=1, train_wall=94, gb_free=21.1, wall=17700 epoch 011: 939 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=449999, ups=1.04, wpb=430788, bsz=16416.8, num_updates=17800, lr=0.000474045, gnorm=0.225, clip=0, loss_scale=1, train_wall=94, gb_free=21.1, wall=17700 epoch 011: 939 / 1689 loss=4.137, nll_loss=2.515, ppl=5.72, wps=449999, ups=1.04, wpb=430788, bsz=16416.8, num_updates=17800, lr=0.000474045, gnorm=0.225, clip=0, loss_scale=1, train_wall=94, gb_free=21.1, wall=17700 epoch 011: 1039 / 1689 loss=4.155, nll_loss=2.536, ppl=5.8, wps=457847, ups=1.05, wpb=434489, bsz=16716.7, num_updates=17900, lr=0.000472719, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=17795 epoch 011: 1039 / 1689 loss=4.155, nll_loss=2.536, ppl=5.8, wps=457847, ups=1.05, wpb=434489, bsz=16716.7, num_updates=17900, lr=0.000472719, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=17795 epoch 011: 1039 / 1689 loss=4.155, nll_loss=2.536, ppl=5.8, wps=457847, ups=1.05, wpb=434489, bsz=16716.7, num_updates=17900, lr=0.000472719, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=17795 epoch 011: 1039 / 1689 loss=4.155, nll_loss=2.536, ppl=5.8, wps=457847, ups=1.05, wpb=434489, bsz=16716.7, num_updates=17900, lr=0.000472719, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=17795 epoch 011: 1039 / 1689 loss=4.155, nll_loss=2.536, ppl=5.8, wps=457847, ups=1.05, wpb=434489, bsz=16716.7, num_updates=17900, lr=0.000472719, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=17795 epoch 011: 1039 / 1689 loss=4.155, nll_loss=2.536, ppl=5.8, wps=457847, ups=1.05, wpb=434489, bsz=16716.7, num_updates=17900, lr=0.000472719, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=17795 epoch 011: 1039 / 1689 loss=4.155, nll_loss=2.536, ppl=5.8, wps=457847, ups=1.05, wpb=434489, bsz=16716.7, num_updates=17900, lr=0.000472719, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=17795 epoch 011: 1039 / 1689 loss=4.155, nll_loss=2.536, ppl=5.8, wps=457847, ups=1.05, wpb=434489, bsz=16716.7, num_updates=17900, lr=0.000472719, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=17795 epoch 011: 1039 / 1689 loss=4.155, nll_loss=2.536, ppl=5.8, wps=457847, ups=1.05, wpb=434489, bsz=16716.7, num_updates=17900, lr=0.000472719, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=17795 epoch 011: 1039 / 1689 loss=4.155, nll_loss=2.536, ppl=5.8, wps=457847, ups=1.05, wpb=434489, bsz=16716.7, num_updates=17900, lr=0.000472719, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=17795 epoch 011: 1039 / 1689 loss=4.155, nll_loss=2.536, ppl=5.8, wps=457847, ups=1.05, wpb=434489, bsz=16716.7, num_updates=17900, lr=0.000472719, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=17795 epoch 011: 1140 / 1689 loss=4.142, nll_loss=2.521, ppl=5.74, wps=454927, ups=1.05, wpb=433558, bsz=16584.6, num_updates=18000, lr=0.000471405, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=17890 epoch 011: 1140 / 1689 loss=4.142, nll_loss=2.521, ppl=5.74, wps=454927, ups=1.05, wpb=433558, bsz=16584.6, num_updates=18000, lr=0.000471405, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=17890 epoch 011: 1140 / 1689 loss=4.142, nll_loss=2.521, ppl=5.74, wps=454927, ups=1.05, wpb=433558, bsz=16584.6, num_updates=18000, lr=0.000471405, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=17890 epoch 011: 1140 / 1689 loss=4.142, nll_loss=2.521, ppl=5.74, wps=454927, ups=1.05, wpb=433558, bsz=16584.6, num_updates=18000, lr=0.000471405, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=17890 epoch 011: 1140 / 1689 loss=4.142, nll_loss=2.521, ppl=5.74, wps=454927, ups=1.05, wpb=433558, bsz=16584.6, num_updates=18000, lr=0.000471405, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=17890 epoch 011: 1140 / 1689 loss=4.142, nll_loss=2.521, ppl=5.74, wps=454927, ups=1.05, wpb=433558, bsz=16584.6, num_updates=18000, lr=0.000471405, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=17890 epoch 011: 1140 / 1689 loss=4.142, nll_loss=2.521, ppl=5.74, wps=454927, ups=1.05, wpb=433558, bsz=16584.6, num_updates=18000, lr=0.000471405, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=17890 epoch 011: 1140 / 1689 loss=4.142, nll_loss=2.521, ppl=5.74, wps=454927, ups=1.05, wpb=433558, bsz=16584.6, num_updates=18000, lr=0.000471405, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=17890 epoch 011: 1140 / 1689 loss=4.142, nll_loss=2.521, ppl=5.74, wps=454927, ups=1.05, wpb=433558, bsz=16584.6, num_updates=18000, lr=0.000471405, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=17890 epoch 011: 1140 / 1689 loss=4.142, nll_loss=2.521, ppl=5.74, wps=454927, ups=1.05, wpb=433558, bsz=16584.6, num_updates=18000, lr=0.000471405, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=17890 epoch 011: 1140 / 1689 loss=4.142, nll_loss=2.521, ppl=5.74, wps=454927, ups=1.05, wpb=433558, bsz=16584.6, num_updates=18000, lr=0.000471405, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=17890 begin validation on "valid" subset epoch 011 | valid on 'valid' subset | loss 4.294 | nll_loss 2.679 | ppl 6.4 | wps 0 | wpb 42662 | bsz 2032 | num_updates 18000 | best_loss 4.294 epoch 011 | valid on 'valid' subset | loss 4.294 | nll_loss 2.679 | ppl 6.4 | wps 0 | wpb 42662 | bsz 2032 | num_updates 18000 | best_loss 4.294 epoch 011 | valid on 'valid' subset | loss 4.294 | nll_loss 2.679 | ppl 6.4 | wps 0 | wpb 42662 | bsz 2032 | num_updates 18000 | best_loss 4.294 epoch 011 | valid on 'valid' subset | loss 4.294 | nll_loss 2.679 | ppl 6.4 | wps 0 | wpb 42662 | bsz 2032 | num_updates 18000 | best_loss 4.294 epoch 011 | valid on 'valid' subset | loss 4.294 | nll_loss 2.679 | ppl 6.4 | wps 0 | wpb 42662 | bsz 2032 | num_updates 18000 | best_loss 4.294 epoch 011 | valid on 'valid' subset | loss 4.294 | nll_loss 2.679 | ppl 6.4 | wps 0 | wpb 42662 | bsz 2032 | num_updates 18000 | best_loss 4.294 epoch 011 | valid on 'valid' subset | loss 4.294 | nll_loss 2.679 | ppl 6.4 | wps 0 | wpb 42662 | bsz 2032 | num_updates 18000 | best_loss 4.294 epoch 011 | valid on 'valid' subset | loss 4.294 | nll_loss 2.679 | ppl 6.4 | wps 0 | wpb 42662 | bsz 2032 | num_updates 18000 | best_loss 4.294 epoch 011 | valid on 'valid' subset | loss 4.294 | nll_loss 2.679 | ppl 6.4 | wps 0 | wpb 42662 | bsz 2032 | num_updates 18000 | best_loss 4.294 epoch 011 | valid on 'valid' subset | loss 4.294 | nll_loss 2.679 | ppl 6.4 | wps 0 | wpb 42662 | bsz 2032 | num_updates 18000 | best_loss 4.294 epoch 011 | valid on 'valid' subset | loss 4.294 | nll_loss 2.679 | ppl 6.4 | wps 0 | wpb 42662 | bsz 2032 | num_updates 18000 | best_loss 4.294 epoch 011: 1240 / 1689 loss=4.136, nll_loss=2.514, ppl=5.71, wps=380189, ups=0.88, wpb=432680, bsz=16793, num_updates=18100, lr=0.0004701, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=18004 epoch 011: 1240 / 1689 loss=4.136, nll_loss=2.514, ppl=5.71, wps=380189, ups=0.88, wpb=432680, bsz=16793, num_updates=18100, lr=0.0004701, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=18004 epoch 011: 1240 / 1689 loss=4.136, nll_loss=2.514, ppl=5.71, wps=380189, ups=0.88, wpb=432680, bsz=16793, num_updates=18100, lr=0.0004701, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=18004 epoch 011: 1240 / 1689 loss=4.136, nll_loss=2.514, ppl=5.71, wps=380189, ups=0.88, wpb=432680, bsz=16793, num_updates=18100, lr=0.0004701, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=18004 epoch 011: 1240 / 1689 loss=4.136, nll_loss=2.514, ppl=5.71, wps=380189, ups=0.88, wpb=432680, bsz=16793, num_updates=18100, lr=0.0004701, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=18004 epoch 011: 1240 / 1689 loss=4.136, nll_loss=2.514, ppl=5.71, wps=380189, ups=0.88, wpb=432680, bsz=16793, num_updates=18100, lr=0.0004701, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=18004 epoch 011: 1240 / 1689 loss=4.136, nll_loss=2.514, ppl=5.71, wps=380189, ups=0.88, wpb=432680, bsz=16793, num_updates=18100, lr=0.0004701, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=18004 epoch 011: 1240 / 1689 loss=4.136, nll_loss=2.514, ppl=5.71, wps=380189, ups=0.88, wpb=432680, bsz=16793, num_updates=18100, lr=0.0004701, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=18004 epoch 011: 1240 / 1689 loss=4.136, nll_loss=2.514, ppl=5.71, wps=380189, ups=0.88, wpb=432680, bsz=16793, num_updates=18100, lr=0.0004701, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=18004 epoch 011: 1240 / 1689 loss=4.136, nll_loss=2.514, ppl=5.71, wps=380189, ups=0.88, wpb=432680, bsz=16793, num_updates=18100, lr=0.0004701, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=18004 epoch 011: 1240 / 1689 loss=4.136, nll_loss=2.514, ppl=5.71, wps=380189, ups=0.88, wpb=432680, bsz=16793, num_updates=18100, lr=0.0004701, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=18004 epoch 011: 1340 / 1689 loss=4.142, nll_loss=2.522, ppl=5.74, wps=458196, ups=1.06, wpb=433368, bsz=16455.6, num_updates=18200, lr=0.000468807, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=18099 epoch 011: 1340 / 1689 loss=4.142, nll_loss=2.522, ppl=5.74, wps=458196, ups=1.06, wpb=433368, bsz=16455.6, num_updates=18200, lr=0.000468807, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=18099 epoch 011: 1340 / 1689 loss=4.142, nll_loss=2.522, ppl=5.74, wps=458196, ups=1.06, wpb=433368, bsz=16455.6, num_updates=18200, lr=0.000468807, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=18099 epoch 011: 1340 / 1689 loss=4.142, nll_loss=2.522, ppl=5.74, wps=458196, ups=1.06, wpb=433368, bsz=16455.6, num_updates=18200, lr=0.000468807, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=18099 epoch 011: 1340 / 1689 loss=4.142, nll_loss=2.522, ppl=5.74, wps=458196, ups=1.06, wpb=433368, bsz=16455.6, num_updates=18200, lr=0.000468807, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=18099 epoch 011: 1340 / 1689 loss=4.142, nll_loss=2.522, ppl=5.74, wps=458196, ups=1.06, wpb=433368, bsz=16455.6, num_updates=18200, lr=0.000468807, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=18099 epoch 011: 1340 / 1689 loss=4.142, nll_loss=2.522, ppl=5.74, wps=458196, ups=1.06, wpb=433368, bsz=16455.6, num_updates=18200, lr=0.000468807, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=18099 epoch 011: 1340 / 1689 loss=4.142, nll_loss=2.522, ppl=5.74, wps=458196, ups=1.06, wpb=433368, bsz=16455.6, num_updates=18200, lr=0.000468807, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=18099 epoch 011: 1340 / 1689 loss=4.142, nll_loss=2.522, ppl=5.74, wps=458196, ups=1.06, wpb=433368, bsz=16455.6, num_updates=18200, lr=0.000468807, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=18099 epoch 011: 1340 / 1689 loss=4.142, nll_loss=2.522, ppl=5.74, wps=458196, ups=1.06, wpb=433368, bsz=16455.6, num_updates=18200, lr=0.000468807, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=18099 epoch 011: 1340 / 1689 loss=4.142, nll_loss=2.522, ppl=5.74, wps=458196, ups=1.06, wpb=433368, bsz=16455.6, num_updates=18200, lr=0.000468807, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=18099 epoch 011: 1440 / 1689 loss=4.151, nll_loss=2.531, ppl=5.78, wps=459227, ups=1.06, wpb=433421, bsz=16098.2, num_updates=18300, lr=0.000467525, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=18193 epoch 011: 1440 / 1689 loss=4.151, nll_loss=2.531, ppl=5.78, wps=459227, ups=1.06, wpb=433421, bsz=16098.2, num_updates=18300, lr=0.000467525, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=18193 epoch 011: 1440 / 1689 loss=4.151, nll_loss=2.531, ppl=5.78, wps=459227, ups=1.06, wpb=433421, bsz=16098.2, num_updates=18300, lr=0.000467525, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=18193 epoch 011: 1440 / 1689 loss=4.151, nll_loss=2.531, ppl=5.78, wps=459227, ups=1.06, wpb=433421, bsz=16098.2, num_updates=18300, lr=0.000467525, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=18193 epoch 011: 1440 / 1689 loss=4.151, nll_loss=2.531, ppl=5.78, wps=459227, ups=1.06, wpb=433421, bsz=16098.2, num_updates=18300, lr=0.000467525, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=18193 epoch 011: 1440 / 1689 loss=4.151, nll_loss=2.531, ppl=5.78, wps=459227, ups=1.06, wpb=433421, bsz=16098.2, num_updates=18300, lr=0.000467525, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=18193 epoch 011: 1440 / 1689 loss=4.151, nll_loss=2.531, ppl=5.78, wps=459227, ups=1.06, wpb=433421, bsz=16098.2, num_updates=18300, lr=0.000467525, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=18193 epoch 011: 1440 / 1689 loss=4.151, nll_loss=2.531, ppl=5.78, wps=459227, ups=1.06, wpb=433421, bsz=16098.2, num_updates=18300, lr=0.000467525, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=18193 epoch 011: 1440 / 1689 loss=4.151, nll_loss=2.531, ppl=5.78, wps=459227, ups=1.06, wpb=433421, bsz=16098.2, num_updates=18300, lr=0.000467525, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=18193 epoch 011: 1440 / 1689 loss=4.151, nll_loss=2.531, ppl=5.78, wps=459227, ups=1.06, wpb=433421, bsz=16098.2, num_updates=18300, lr=0.000467525, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=18193 epoch 011: 1440 / 1689 loss=4.151, nll_loss=2.531, ppl=5.78, wps=459227, ups=1.06, wpb=433421, bsz=16098.2, num_updates=18300, lr=0.000467525, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=18193 epoch 011: 1540 / 1689 loss=4.148, nll_loss=2.528, ppl=5.77, wps=457554, ups=1.05, wpb=434528, bsz=16072.9, num_updates=18400, lr=0.000466252, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=18288 epoch 011: 1540 / 1689 loss=4.148, nll_loss=2.528, ppl=5.77, wps=457554, ups=1.05, wpb=434528, bsz=16072.9, num_updates=18400, lr=0.000466252, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=18288 epoch 011: 1540 / 1689 loss=4.148, nll_loss=2.528, ppl=5.77, wps=457554, ups=1.05, wpb=434528, bsz=16072.9, num_updates=18400, lr=0.000466252, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=18288 epoch 011: 1540 / 1689 loss=4.148, nll_loss=2.528, ppl=5.77, wps=457554, ups=1.05, wpb=434528, bsz=16072.9, num_updates=18400, lr=0.000466252, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=18288 epoch 011: 1540 / 1689 loss=4.148, nll_loss=2.528, ppl=5.77, wps=457554, ups=1.05, wpb=434528, bsz=16072.9, num_updates=18400, lr=0.000466252, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=18288 epoch 011: 1540 / 1689 loss=4.148, nll_loss=2.528, ppl=5.77, wps=457554, ups=1.05, wpb=434528, bsz=16072.9, num_updates=18400, lr=0.000466252, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=18288 epoch 011: 1540 / 1689 loss=4.148, nll_loss=2.528, ppl=5.77, wps=457554, ups=1.05, wpb=434528, bsz=16072.9, num_updates=18400, lr=0.000466252, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=18288 epoch 011: 1540 / 1689 loss=4.148, nll_loss=2.528, ppl=5.77, wps=457554, ups=1.05, wpb=434528, bsz=16072.9, num_updates=18400, lr=0.000466252, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=18288 epoch 011: 1540 / 1689 loss=4.148, nll_loss=2.528, ppl=5.77, wps=457554, ups=1.05, wpb=434528, bsz=16072.9, num_updates=18400, lr=0.000466252, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=18288 epoch 011: 1540 / 1689 loss=4.148, nll_loss=2.528, ppl=5.77, wps=457554, ups=1.05, wpb=434528, bsz=16072.9, num_updates=18400, lr=0.000466252, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=18288 epoch 011: 1540 / 1689 loss=4.148, nll_loss=2.528, ppl=5.77, wps=457554, ups=1.05, wpb=434528, bsz=16072.9, num_updates=18400, lr=0.000466252, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=18288 epoch 011: 1640 / 1689 loss=4.147, nll_loss=2.527, ppl=5.77, wps=455669, ups=1.06, wpb=431871, bsz=16475.8, num_updates=18500, lr=0.000464991, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=18383 epoch 011: 1640 / 1689 loss=4.147, nll_loss=2.527, ppl=5.77, wps=455669, ups=1.06, wpb=431871, bsz=16475.8, num_updates=18500, lr=0.000464991, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=18383 epoch 011: 1640 / 1689 loss=4.147, nll_loss=2.527, ppl=5.77, wps=455669, ups=1.06, wpb=431871, bsz=16475.8, num_updates=18500, lr=0.000464991, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=18383 epoch 011: 1640 / 1689 loss=4.147, nll_loss=2.527, ppl=5.77, wps=455669, ups=1.06, wpb=431871, bsz=16475.8, num_updates=18500, lr=0.000464991, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=18383 epoch 011: 1640 / 1689 loss=4.147, nll_loss=2.527, ppl=5.77, wps=455669, ups=1.06, wpb=431871, bsz=16475.8, num_updates=18500, lr=0.000464991, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=18383 epoch 011: 1640 / 1689 loss=4.147, nll_loss=2.527, ppl=5.77, wps=455669, ups=1.06, wpb=431871, bsz=16475.8, num_updates=18500, lr=0.000464991, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=18383 epoch 011: 1640 / 1689 loss=4.147, nll_loss=2.527, ppl=5.77, wps=455669, ups=1.06, wpb=431871, bsz=16475.8, num_updates=18500, lr=0.000464991, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=18383 epoch 011: 1640 / 1689 loss=4.147, nll_loss=2.527, ppl=5.77, wps=455669, ups=1.06, wpb=431871, bsz=16475.8, num_updates=18500, lr=0.000464991, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=18383 epoch 011: 1640 / 1689 loss=4.147, nll_loss=2.527, ppl=5.77, wps=455669, ups=1.06, wpb=431871, bsz=16475.8, num_updates=18500, lr=0.000464991, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=18383 epoch 011: 1640 / 1689 loss=4.147, nll_loss=2.527, ppl=5.77, wps=455669, ups=1.06, wpb=431871, bsz=16475.8, num_updates=18500, lr=0.000464991, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=18383 epoch 011: 1640 / 1689 loss=4.147, nll_loss=2.527, ppl=5.77, wps=455669, ups=1.06, wpb=431871, bsz=16475.8, num_updates=18500, lr=0.000464991, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=18383 end of epoch 11 (average epoch stats below) epoch 011 | loss 4.14 | nll_loss 2.519 | ppl 5.73 | wps 449518 | ups 1.04 | wpb 433513 | bsz 16504.8 | num_updates 18549 | lr 0.000464376 | gnorm 0.222 | clip 0 | loss_scale 1 | train_wall 1572 | gb_free 20.8 | wall 18428 epoch 011 | loss 4.14 | nll_loss 2.519 | ppl 5.73 | wps 449518 | ups 1.04 | wpb 433513 | bsz 16504.8 | num_updates 18549 | lr 0.000464376 | gnorm 0.222 | clip 0 | loss_scale 1 | train_wall 1572 | gb_free 20.8 | wall 18428 epoch 011 | loss 4.14 | nll_loss 2.519 | ppl 5.73 | wps 449518 | ups 1.04 | wpb 433513 | bsz 16504.8 | num_updates 18549 | lr 0.000464376 | gnorm 0.222 | clip 0 | loss_scale 1 | train_wall 1572 | gb_free 20.8 | wall 18428 epoch 011 | loss 4.14 | nll_loss 2.519 | ppl 5.73 | wps 449518 | ups 1.04 | wpb 433513 | bsz 16504.8 | num_updates 18549 | lr 0.000464376 | gnorm 0.222 | clip 0 | loss_scale 1 | train_wall 1572 | gb_free 20.8 | wall 18428 epoch 011 | loss 4.14 | nll_loss 2.519 | ppl 5.73 | wps 449518 | ups 1.04 | wpb 433513 | bsz 16504.8 | num_updates 18549 | lr 0.000464376 | gnorm 0.222 | clip 0 | loss_scale 1 | train_wall 1572 | gb_free 20.8 | wall 18428 epoch 011 | loss 4.14 | nll_loss 2.519 | ppl 5.73 | wps 449518 | ups 1.04 | wpb 433513 | bsz 16504.8 | num_updates 18549 | lr 0.000464376 | gnorm 0.222 | clip 0 | loss_scale 1 | train_wall 1572 | gb_free 20.8 | wall 18428 epoch 011 | loss 4.14 | nll_loss 2.519 | ppl 5.73 | wps 449518 | ups 1.04 | wpb 433513 | bsz 16504.8 | num_updates 18549 | lr 0.000464376 | gnorm 0.222 | clip 0 | loss_scale 1 | train_wall 1572 | gb_free 20.8 | wall 18428 epoch 011 | loss 4.14 | nll_loss 2.519 | ppl 5.73 | wps 449518 | ups 1.04 | wpb 433513 | bsz 16504.8 | num_updates 18549 | lr 0.000464376 | gnorm 0.222 | clip 0 | loss_scale 1 | train_wall 1572 | gb_free 20.8 | wall 18428 epoch 011 | loss 4.14 | nll_loss 2.519 | ppl 5.73 | wps 449518 | ups 1.04 | wpb 433513 | bsz 16504.8 | num_updates 18549 | lr 0.000464376 | gnorm 0.222 | clip 0 | loss_scale 1 | train_wall 1572 | gb_free 20.8 | wall 18428 epoch 011 | loss 4.14 | nll_loss 2.519 | ppl 5.73 | wps 449518 | ups 1.04 | wpb 433513 | bsz 16504.8 | num_updates 18549 | lr 0.000464376 | gnorm 0.222 | clip 0 | loss_scale 1 | train_wall 1572 | gb_free 20.8 | wall 18428 epoch 011 | loss 4.14 | nll_loss 2.519 | ppl 5.73 | wps 449518 | ups 1.04 | wpb 433513 | bsz 16504.8 | num_updates 18549 | lr 0.000464376 | gnorm 0.222 | clip 0 | loss_scale 1 | train_wall 1572 | gb_free 20.8 | wall 18428 Start iterating over samples epoch 012: 51 / 1689 loss=4.123, nll_loss=2.5, ppl=5.66, wps=449512, ups=1.05, wpb=429453, bsz=16349.7, num_updates=18600, lr=0.000463739, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=18478 epoch 012: 51 / 1689 loss=4.123, nll_loss=2.5, ppl=5.66, wps=449512, ups=1.05, wpb=429453, bsz=16349.7, num_updates=18600, lr=0.000463739, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=18478 epoch 012: 51 / 1689 loss=4.123, nll_loss=2.5, ppl=5.66, wps=449512, ups=1.05, wpb=429453, bsz=16349.7, num_updates=18600, lr=0.000463739, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=18478 epoch 012: 51 / 1689 loss=4.123, nll_loss=2.5, ppl=5.66, wps=449512, ups=1.05, wpb=429453, bsz=16349.7, num_updates=18600, lr=0.000463739, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=18478 epoch 012: 51 / 1689 loss=4.123, nll_loss=2.5, ppl=5.66, wps=449512, ups=1.05, wpb=429453, bsz=16349.7, num_updates=18600, lr=0.000463739, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=18478 epoch 012: 51 / 1689 loss=4.123, nll_loss=2.5, ppl=5.66, wps=449512, ups=1.05, wpb=429453, bsz=16349.7, num_updates=18600, lr=0.000463739, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=18478 epoch 012: 51 / 1689 loss=4.123, nll_loss=2.5, ppl=5.66, wps=449512, ups=1.05, wpb=429453, bsz=16349.7, num_updates=18600, lr=0.000463739, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=18478 epoch 012: 51 / 1689 loss=4.123, nll_loss=2.5, ppl=5.66, wps=449512, ups=1.05, wpb=429453, bsz=16349.7, num_updates=18600, lr=0.000463739, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=18478 epoch 012: 51 / 1689 loss=4.123, nll_loss=2.5, ppl=5.66, wps=449512, ups=1.05, wpb=429453, bsz=16349.7, num_updates=18600, lr=0.000463739, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=18478 epoch 012: 51 / 1689 loss=4.123, nll_loss=2.5, ppl=5.66, wps=449512, ups=1.05, wpb=429453, bsz=16349.7, num_updates=18600, lr=0.000463739, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=18478 epoch 012: 51 / 1689 loss=4.123, nll_loss=2.5, ppl=5.66, wps=449512, ups=1.05, wpb=429453, bsz=16349.7, num_updates=18600, lr=0.000463739, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=18478 epoch 012: 51 / 1689 loss=4.123, nll_loss=2.5, ppl=5.66, wps=449512, ups=1.05, wpb=429453, bsz=16349.7, num_updates=18600, lr=0.000463739, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=18478 epoch 012: 151 / 1689 loss=4.097, nll_loss=2.47, ppl=5.54, wps=458491, ups=1.06, wpb=432769, bsz=16217.6, num_updates=18700, lr=0.000462497, gnorm=0.217, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=18573 epoch 012: 151 / 1689 loss=4.097, nll_loss=2.47, ppl=5.54, wps=458491, ups=1.06, wpb=432769, bsz=16217.6, num_updates=18700, lr=0.000462497, gnorm=0.217, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=18573 epoch 012: 151 / 1689 loss=4.097, nll_loss=2.47, ppl=5.54, wps=458491, ups=1.06, wpb=432769, bsz=16217.6, num_updates=18700, lr=0.000462497, gnorm=0.217, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=18573 epoch 012: 151 / 1689 loss=4.097, nll_loss=2.47, ppl=5.54, wps=458491, ups=1.06, wpb=432769, bsz=16217.6, num_updates=18700, lr=0.000462497, gnorm=0.217, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=18573 epoch 012: 151 / 1689 loss=4.097, nll_loss=2.47, ppl=5.54, wps=458491, ups=1.06, wpb=432769, bsz=16217.6, num_updates=18700, lr=0.000462497, gnorm=0.217, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=18573 epoch 012: 151 / 1689 loss=4.097, nll_loss=2.47, ppl=5.54, wps=458491, ups=1.06, wpb=432769, bsz=16217.6, num_updates=18700, lr=0.000462497, gnorm=0.217, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=18573 epoch 012: 151 / 1689 loss=4.097, nll_loss=2.47, ppl=5.54, wps=458491, ups=1.06, wpb=432769, bsz=16217.6, num_updates=18700, lr=0.000462497, gnorm=0.217, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=18573 epoch 012: 151 / 1689 loss=4.097, nll_loss=2.47, ppl=5.54, wps=458491, ups=1.06, wpb=432769, bsz=16217.6, num_updates=18700, lr=0.000462497, gnorm=0.217, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=18573 epoch 012: 151 / 1689 loss=4.097, nll_loss=2.47, ppl=5.54, wps=458491, ups=1.06, wpb=432769, bsz=16217.6, num_updates=18700, lr=0.000462497, gnorm=0.217, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=18573 epoch 012: 151 / 1689 loss=4.097, nll_loss=2.47, ppl=5.54, wps=458491, ups=1.06, wpb=432769, bsz=16217.6, num_updates=18700, lr=0.000462497, gnorm=0.217, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=18573 epoch 012: 151 / 1689 loss=4.097, nll_loss=2.47, ppl=5.54, wps=458491, ups=1.06, wpb=432769, bsz=16217.6, num_updates=18700, lr=0.000462497, gnorm=0.217, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=18573 epoch 012: 151 / 1689 loss=4.097, nll_loss=2.47, ppl=5.54, wps=458491, ups=1.06, wpb=432769, bsz=16217.6, num_updates=18700, lr=0.000462497, gnorm=0.217, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=18573 epoch 012: 251 / 1689 loss=4.106, nll_loss=2.48, ppl=5.58, wps=458536, ups=1.06, wpb=433180, bsz=16767, num_updates=18800, lr=0.000461266, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=18667 epoch 012: 251 / 1689 loss=4.106, nll_loss=2.48, ppl=5.58, wps=458536, ups=1.06, wpb=433180, bsz=16767, num_updates=18800, lr=0.000461266, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=18667 epoch 012: 251 / 1689 loss=4.106, nll_loss=2.48, ppl=5.58, wps=458536, ups=1.06, wpb=433180, bsz=16767, num_updates=18800, lr=0.000461266, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=18667 epoch 012: 251 / 1689 loss=4.106, nll_loss=2.48, ppl=5.58, wps=458536, ups=1.06, wpb=433180, bsz=16767, num_updates=18800, lr=0.000461266, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=18667 epoch 012: 251 / 1689 loss=4.106, nll_loss=2.48, ppl=5.58, wps=458536, ups=1.06, wpb=433180, bsz=16767, num_updates=18800, lr=0.000461266, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=18667 epoch 012: 251 / 1689 loss=4.106, nll_loss=2.48, ppl=5.58, wps=458536, ups=1.06, wpb=433180, bsz=16767, num_updates=18800, lr=0.000461266, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=18667 epoch 012: 251 / 1689 loss=4.106, nll_loss=2.48, ppl=5.58, wps=458536, ups=1.06, wpb=433180, bsz=16767, num_updates=18800, lr=0.000461266, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=18667 epoch 012: 251 / 1689 loss=4.106, nll_loss=2.48, ppl=5.58, wps=458536, ups=1.06, wpb=433180, bsz=16767, num_updates=18800, lr=0.000461266, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=18667 epoch 012: 251 / 1689 loss=4.106, nll_loss=2.48, ppl=5.58, wps=458536, ups=1.06, wpb=433180, bsz=16767, num_updates=18800, lr=0.000461266, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=18667 epoch 012: 251 / 1689 loss=4.106, nll_loss=2.48, ppl=5.58, wps=458536, ups=1.06, wpb=433180, bsz=16767, num_updates=18800, lr=0.000461266, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=18667 epoch 012: 251 / 1689 loss=4.106, nll_loss=2.48, ppl=5.58, wps=458536, ups=1.06, wpb=433180, bsz=16767, num_updates=18800, lr=0.000461266, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=18667 epoch 012: 251 / 1689 loss=4.106, nll_loss=2.48, ppl=5.58, wps=458536, ups=1.06, wpb=433180, bsz=16767, num_updates=18800, lr=0.000461266, gnorm=0.232, clip=0, loss_scale=1, train_wall=93, gb_free=18.4, wall=18667 epoch 012: 351 / 1689 loss=4.112, nll_loss=2.487, ppl=5.6, wps=458284, ups=1.06, wpb=433829, bsz=16244.6, num_updates=18900, lr=0.000460044, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=18762 epoch 012: 351 / 1689 loss=4.112, nll_loss=2.487, ppl=5.6, wps=458284, ups=1.06, wpb=433829, bsz=16244.6, num_updates=18900, lr=0.000460044, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=18762 epoch 012: 351 / 1689 loss=4.112, nll_loss=2.487, ppl=5.6, wps=458284, ups=1.06, wpb=433829, bsz=16244.6, num_updates=18900, lr=0.000460044, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=18762 epoch 012: 351 / 1689 loss=4.112, nll_loss=2.487, ppl=5.6, wps=458284, ups=1.06, wpb=433829, bsz=16244.6, num_updates=18900, lr=0.000460044, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=18762 epoch 012: 351 / 1689 loss=4.112, nll_loss=2.487, ppl=5.6, wps=458284, ups=1.06, wpb=433829, bsz=16244.6, num_updates=18900, lr=0.000460044, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=18762 epoch 012: 351 / 1689 loss=4.112, nll_loss=2.487, ppl=5.6, wps=458284, ups=1.06, wpb=433829, bsz=16244.6, num_updates=18900, lr=0.000460044, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=18762 epoch 012: 351 / 1689 loss=4.112, nll_loss=2.487, ppl=5.6, wps=458284, ups=1.06, wpb=433829, bsz=16244.6, num_updates=18900, lr=0.000460044, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=18762 epoch 012: 351 / 1689 loss=4.112, nll_loss=2.487, ppl=5.6, wps=458284, ups=1.06, wpb=433829, bsz=16244.6, num_updates=18900, lr=0.000460044, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=18762 epoch 012: 351 / 1689 loss=4.112, nll_loss=2.487, ppl=5.6, wps=458284, ups=1.06, wpb=433829, bsz=16244.6, num_updates=18900, lr=0.000460044, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=18762 epoch 012: 351 / 1689 loss=4.112, nll_loss=2.487, ppl=5.6, wps=458284, ups=1.06, wpb=433829, bsz=16244.6, num_updates=18900, lr=0.000460044, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=18762 epoch 012: 351 / 1689 loss=4.112, nll_loss=2.487, ppl=5.6, wps=458284, ups=1.06, wpb=433829, bsz=16244.6, num_updates=18900, lr=0.000460044, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=18762 epoch 012: 351 / 1689 loss=4.112, nll_loss=2.487, ppl=5.6, wps=458284, ups=1.06, wpb=433829, bsz=16244.6, num_updates=18900, lr=0.000460044, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=18762 epoch 012: 451 / 1689 loss=4.116, nll_loss=2.492, ppl=5.62, wps=459902, ups=1.06, wpb=433731, bsz=16358.1, num_updates=19000, lr=0.000458831, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=18856 epoch 012: 451 / 1689 loss=4.116, nll_loss=2.492, ppl=5.62, wps=459902, ups=1.06, wpb=433731, bsz=16358.1, num_updates=19000, lr=0.000458831, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=18856 epoch 012: 451 / 1689 loss=4.116, nll_loss=2.492, ppl=5.62, wps=459902, ups=1.06, wpb=433731, bsz=16358.1, num_updates=19000, lr=0.000458831, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=18856 epoch 012: 451 / 1689 loss=4.116, nll_loss=2.492, ppl=5.62, wps=459902, ups=1.06, wpb=433731, bsz=16358.1, num_updates=19000, lr=0.000458831, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=18856 epoch 012: 451 / 1689 loss=4.116, nll_loss=2.492, ppl=5.62, wps=459902, ups=1.06, wpb=433731, bsz=16358.1, num_updates=19000, lr=0.000458831, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=18856 epoch 012: 451 / 1689 loss=4.116, nll_loss=2.492, ppl=5.62, wps=459902, ups=1.06, wpb=433731, bsz=16358.1, num_updates=19000, lr=0.000458831, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=18856 epoch 012: 451 / 1689 loss=4.116, nll_loss=2.492, ppl=5.62, wps=459902, ups=1.06, wpb=433731, bsz=16358.1, num_updates=19000, lr=0.000458831, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=18856 epoch 012: 451 / 1689 loss=4.116, nll_loss=2.492, ppl=5.62, wps=459902, ups=1.06, wpb=433731, bsz=16358.1, num_updates=19000, lr=0.000458831, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=18856 epoch 012: 451 / 1689 loss=4.116, nll_loss=2.492, ppl=5.62, wps=459902, ups=1.06, wpb=433731, bsz=16358.1, num_updates=19000, lr=0.000458831, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=18856 epoch 012: 451 / 1689 loss=4.116, nll_loss=2.492, ppl=5.62, wps=459902, ups=1.06, wpb=433731, bsz=16358.1, num_updates=19000, lr=0.000458831, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=18856 epoch 012: 451 / 1689 loss=4.116, nll_loss=2.492, ppl=5.62, wps=459902, ups=1.06, wpb=433731, bsz=16358.1, num_updates=19000, lr=0.000458831, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=18856 epoch 012: 451 / 1689 loss=4.116, nll_loss=2.492, ppl=5.62, wps=459902, ups=1.06, wpb=433731, bsz=16358.1, num_updates=19000, lr=0.000458831, gnorm=0.223, clip=0, loss_scale=2, train_wall=93, gb_free=19.4, wall=18856 begin validation on "valid" subset epoch 012 | valid on 'valid' subset | loss 4.285 | nll_loss 2.667 | ppl 6.35 | wps 0 | wpb 42662 | bsz 2032 | num_updates 19000 | best_loss 4.285 epoch 012 | valid on 'valid' subset | loss 4.285 | nll_loss 2.667 | ppl 6.35 | wps 0 | wpb 42662 | bsz 2032 | num_updates 19000 | best_loss 4.285 epoch 012 | valid on 'valid' subset | loss 4.285 | nll_loss 2.667 | ppl 6.35 | wps 0 | wpb 42662 | bsz 2032 | num_updates 19000 | best_loss 4.285 epoch 012 | valid on 'valid' subset | loss 4.285 | nll_loss 2.667 | ppl 6.35 | wps 0 | wpb 42662 | bsz 2032 | num_updates 19000 | best_loss 4.285 epoch 012 | valid on 'valid' subset | loss 4.285 | nll_loss 2.667 | ppl 6.35 | wps 0 | wpb 42662 | bsz 2032 | num_updates 19000 | best_loss 4.285 epoch 012 | valid on 'valid' subset | loss 4.285 | nll_loss 2.667 | ppl 6.35 | wps 0 | wpb 42662 | bsz 2032 | num_updates 19000 | best_loss 4.285 epoch 012 | valid on 'valid' subset | loss 4.285 | nll_loss 2.667 | ppl 6.35 | wps 0 | wpb 42662 | bsz 2032 | num_updates 19000 | best_loss 4.285 epoch 012 | valid on 'valid' subset | loss 4.285 | nll_loss 2.667 | ppl 6.35 | wps 0 | wpb 42662 | bsz 2032 | num_updates 19000 | best_loss 4.285 epoch 012 | valid on 'valid' subset | loss 4.285 | nll_loss 2.667 | ppl 6.35 | wps 0 | wpb 42662 | bsz 2032 | num_updates 19000 | best_loss 4.285 epoch 012 | valid on 'valid' subset | loss 4.285 | nll_loss 2.667 | ppl 6.35 | wps 0 | wpb 42662 | bsz 2032 | num_updates 19000 | best_loss 4.285 epoch 012 | valid on 'valid' subset | loss 4.285 | nll_loss 2.667 | ppl 6.35 | wps 0 | wpb 42662 | bsz 2032 | num_updates 19000 | best_loss 4.285 epoch 012 | valid on 'valid' subset | loss 4.285 | nll_loss 2.667 | ppl 6.35 | wps 0 | wpb 42662 | bsz 2032 | num_updates 19000 | best_loss 4.285 epoch 012: 551 / 1689 loss=4.115, nll_loss=2.49, ppl=5.62, wps=312420, ups=0.72, wpb=431479, bsz=16609.1, num_updates=19100, lr=0.000457629, gnorm=0.219, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=18994 epoch 012: 551 / 1689 loss=4.115, nll_loss=2.49, ppl=5.62, wps=312420, ups=0.72, wpb=431479, bsz=16609.1, num_updates=19100, lr=0.000457629, gnorm=0.219, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=18994 epoch 012: 551 / 1689 loss=4.115, nll_loss=2.49, ppl=5.62, wps=312420, ups=0.72, wpb=431479, bsz=16609.1, num_updates=19100, lr=0.000457629, gnorm=0.219, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=18994 epoch 012: 551 / 1689 loss=4.115, nll_loss=2.49, ppl=5.62, wps=312420, ups=0.72, wpb=431479, bsz=16609.1, num_updates=19100, lr=0.000457629, gnorm=0.219, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=18994 epoch 012: 551 / 1689 loss=4.115, nll_loss=2.49, ppl=5.62, wps=312420, ups=0.72, wpb=431479, bsz=16609.1, num_updates=19100, lr=0.000457629, gnorm=0.219, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=18994 epoch 012: 551 / 1689 loss=4.115, nll_loss=2.49, ppl=5.62, wps=312420, ups=0.72, wpb=431479, bsz=16609.1, num_updates=19100, lr=0.000457629, gnorm=0.219, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=18994 epoch 012: 551 / 1689 loss=4.115, nll_loss=2.49, ppl=5.62, wps=312420, ups=0.72, wpb=431479, bsz=16609.1, num_updates=19100, lr=0.000457629, gnorm=0.219, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=18994 epoch 012: 551 / 1689 loss=4.115, nll_loss=2.49, ppl=5.62, wps=312420, ups=0.72, wpb=431479, bsz=16609.1, num_updates=19100, lr=0.000457629, gnorm=0.219, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=18994 epoch 012: 551 / 1689 loss=4.115, nll_loss=2.49, ppl=5.62, wps=312420, ups=0.72, wpb=431479, bsz=16609.1, num_updates=19100, lr=0.000457629, gnorm=0.219, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=18994 epoch 012: 551 / 1689 loss=4.115, nll_loss=2.49, ppl=5.62, wps=312420, ups=0.72, wpb=431479, bsz=16609.1, num_updates=19100, lr=0.000457629, gnorm=0.219, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=18994 epoch 012: 551 / 1689 loss=4.115, nll_loss=2.49, ppl=5.62, wps=312420, ups=0.72, wpb=431479, bsz=16609.1, num_updates=19100, lr=0.000457629, gnorm=0.219, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=18994 epoch 012: 551 / 1689 loss=4.115, nll_loss=2.49, ppl=5.62, wps=312420, ups=0.72, wpb=431479, bsz=16609.1, num_updates=19100, lr=0.000457629, gnorm=0.219, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=18994 epoch 012: 652 / 1689 loss=4.125, nll_loss=2.502, ppl=5.66, wps=464465, ups=1.07, wpb=434369, bsz=16629.9, num_updates=19200, lr=0.000456435, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=19088 epoch 012: 652 / 1689 loss=4.125, nll_loss=2.502, ppl=5.66, wps=464465, ups=1.07, wpb=434369, bsz=16629.9, num_updates=19200, lr=0.000456435, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=19088 epoch 012: 652 / 1689 loss=4.125, nll_loss=2.502, ppl=5.66, wps=464465, ups=1.07, wpb=434369, bsz=16629.9, num_updates=19200, lr=0.000456435, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=19088 epoch 012: 652 / 1689 loss=4.125, nll_loss=2.502, ppl=5.66, wps=464465, ups=1.07, wpb=434369, bsz=16629.9, num_updates=19200, lr=0.000456435, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=19088 epoch 012: 652 / 1689 loss=4.125, nll_loss=2.502, ppl=5.66, wps=464465, ups=1.07, wpb=434369, bsz=16629.9, num_updates=19200, lr=0.000456435, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=19088 epoch 012: 652 / 1689 loss=4.125, nll_loss=2.502, ppl=5.66, wps=464465, ups=1.07, wpb=434369, bsz=16629.9, num_updates=19200, lr=0.000456435, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=19088 epoch 012: 652 / 1689 loss=4.125, nll_loss=2.502, ppl=5.66, wps=464465, ups=1.07, wpb=434369, bsz=16629.9, num_updates=19200, lr=0.000456435, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=19088 epoch 012: 652 / 1689 loss=4.125, nll_loss=2.502, ppl=5.66, wps=464465, ups=1.07, wpb=434369, bsz=16629.9, num_updates=19200, lr=0.000456435, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=19088 epoch 012: 652 / 1689 loss=4.125, nll_loss=2.502, ppl=5.66, wps=464465, ups=1.07, wpb=434369, bsz=16629.9, num_updates=19200, lr=0.000456435, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=19088 epoch 012: 652 / 1689 loss=4.125, nll_loss=2.502, ppl=5.66, wps=464465, ups=1.07, wpb=434369, bsz=16629.9, num_updates=19200, lr=0.000456435, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=19088 epoch 012: 652 / 1689 loss=4.125, nll_loss=2.502, ppl=5.66, wps=464465, ups=1.07, wpb=434369, bsz=16629.9, num_updates=19200, lr=0.000456435, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=19088 epoch 012: 652 / 1689 loss=4.125, nll_loss=2.502, ppl=5.66, wps=464465, ups=1.07, wpb=434369, bsz=16629.9, num_updates=19200, lr=0.000456435, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=19088 epoch 012: 752 / 1689 loss=4.12, nll_loss=2.496, ppl=5.64, wps=465793, ups=1.08, wpb=432662, bsz=16682.6, num_updates=19300, lr=0.000455251, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=19181 epoch 012: 752 / 1689 loss=4.12, nll_loss=2.496, ppl=5.64, wps=465793, ups=1.08, wpb=432662, bsz=16682.6, num_updates=19300, lr=0.000455251, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=19181 epoch 012: 752 / 1689 loss=4.12, nll_loss=2.496, ppl=5.64, wps=465793, ups=1.08, wpb=432662, bsz=16682.6, num_updates=19300, lr=0.000455251, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=19181 epoch 012: 752 / 1689 loss=4.12, nll_loss=2.496, ppl=5.64, wps=465793, ups=1.08, wpb=432662, bsz=16682.6, num_updates=19300, lr=0.000455251, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=19181 epoch 012: 752 / 1689 loss=4.12, nll_loss=2.496, ppl=5.64, wps=465793, ups=1.08, wpb=432662, bsz=16682.6, num_updates=19300, lr=0.000455251, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=19181 epoch 012: 752 / 1689 loss=4.12, nll_loss=2.496, ppl=5.64, wps=465793, ups=1.08, wpb=432662, bsz=16682.6, num_updates=19300, lr=0.000455251, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=19181 epoch 012: 752 / 1689 loss=4.12, nll_loss=2.496, ppl=5.64, wps=465793, ups=1.08, wpb=432662, bsz=16682.6, num_updates=19300, lr=0.000455251, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=19181 epoch 012: 752 / 1689 loss=4.12, nll_loss=2.496, ppl=5.64, wps=465793, ups=1.08, wpb=432662, bsz=16682.6, num_updates=19300, lr=0.000455251, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=19181 epoch 012: 752 / 1689 loss=4.12, nll_loss=2.496, ppl=5.64, wps=465793, ups=1.08, wpb=432662, bsz=16682.6, num_updates=19300, lr=0.000455251, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=19181 epoch 012: 752 / 1689 loss=4.12, nll_loss=2.496, ppl=5.64, wps=465793, ups=1.08, wpb=432662, bsz=16682.6, num_updates=19300, lr=0.000455251, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=19181 epoch 012: 752 / 1689 loss=4.12, nll_loss=2.496, ppl=5.64, wps=465793, ups=1.08, wpb=432662, bsz=16682.6, num_updates=19300, lr=0.000455251, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=19181 epoch 012: 752 / 1689 loss=4.12, nll_loss=2.496, ppl=5.64, wps=465793, ups=1.08, wpb=432662, bsz=16682.6, num_updates=19300, lr=0.000455251, gnorm=0.23, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=19181 epoch 012: 852 / 1689 loss=4.123, nll_loss=2.5, ppl=5.66, wps=464103, ups=1.07, wpb=433866, bsz=16764.6, num_updates=19400, lr=0.000454077, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=19274 epoch 012: 852 / 1689 loss=4.123, nll_loss=2.5, ppl=5.66, wps=464103, ups=1.07, wpb=433866, bsz=16764.6, num_updates=19400, lr=0.000454077, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=19274 epoch 012: 852 / 1689 loss=4.123, nll_loss=2.5, ppl=5.66, wps=464103, ups=1.07, wpb=433866, bsz=16764.6, num_updates=19400, lr=0.000454077, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=19274 epoch 012: 852 / 1689 loss=4.123, nll_loss=2.5, ppl=5.66, wps=464103, ups=1.07, wpb=433866, bsz=16764.6, num_updates=19400, lr=0.000454077, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=19274 epoch 012: 852 / 1689 loss=4.123, nll_loss=2.5, ppl=5.66, wps=464103, ups=1.07, wpb=433866, bsz=16764.6, num_updates=19400, lr=0.000454077, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=19274 epoch 012: 852 / 1689 loss=4.123, nll_loss=2.5, ppl=5.66, wps=464103, ups=1.07, wpb=433866, bsz=16764.6, num_updates=19400, lr=0.000454077, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=19274 epoch 012: 852 / 1689 loss=4.123, nll_loss=2.5, ppl=5.66, wps=464103, ups=1.07, wpb=433866, bsz=16764.6, num_updates=19400, lr=0.000454077, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=19274 epoch 012: 852 / 1689 loss=4.123, nll_loss=2.5, ppl=5.66, wps=464103, ups=1.07, wpb=433866, bsz=16764.6, num_updates=19400, lr=0.000454077, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=19274 epoch 012: 852 / 1689 loss=4.123, nll_loss=2.5, ppl=5.66, wps=464103, ups=1.07, wpb=433866, bsz=16764.6, num_updates=19400, lr=0.000454077, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=19274 epoch 012: 852 / 1689 loss=4.123, nll_loss=2.5, ppl=5.66, wps=464103, ups=1.07, wpb=433866, bsz=16764.6, num_updates=19400, lr=0.000454077, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=19274 epoch 012: 852 / 1689 loss=4.123, nll_loss=2.5, ppl=5.66, wps=464103, ups=1.07, wpb=433866, bsz=16764.6, num_updates=19400, lr=0.000454077, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=19274 epoch 012: 852 / 1689 loss=4.123, nll_loss=2.5, ppl=5.66, wps=464103, ups=1.07, wpb=433866, bsz=16764.6, num_updates=19400, lr=0.000454077, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=19274 epoch 012: 952 / 1689 loss=4.115, nll_loss=2.492, ppl=5.62, wps=465406, ups=1.06, wpb=437497, bsz=16662.6, num_updates=19500, lr=0.000452911, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=19368 epoch 012: 952 / 1689 loss=4.115, nll_loss=2.492, ppl=5.62, wps=465406, ups=1.06, wpb=437497, bsz=16662.6, num_updates=19500, lr=0.000452911, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=19368 epoch 012: 952 / 1689 loss=4.115, nll_loss=2.492, ppl=5.62, wps=465406, ups=1.06, wpb=437497, bsz=16662.6, num_updates=19500, lr=0.000452911, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=19368 epoch 012: 952 / 1689 loss=4.115, nll_loss=2.492, ppl=5.62, wps=465406, ups=1.06, wpb=437497, bsz=16662.6, num_updates=19500, lr=0.000452911, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=19368 epoch 012: 952 / 1689 loss=4.115, nll_loss=2.492, ppl=5.62, wps=465406, ups=1.06, wpb=437497, bsz=16662.6, num_updates=19500, lr=0.000452911, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=19368 epoch 012: 952 / 1689 loss=4.115, nll_loss=2.492, ppl=5.62, wps=465406, ups=1.06, wpb=437497, bsz=16662.6, num_updates=19500, lr=0.000452911, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=19368 epoch 012: 952 / 1689 loss=4.115, nll_loss=2.492, ppl=5.62, wps=465406, ups=1.06, wpb=437497, bsz=16662.6, num_updates=19500, lr=0.000452911, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=19368 epoch 012: 952 / 1689 loss=4.115, nll_loss=2.492, ppl=5.62, wps=465406, ups=1.06, wpb=437497, bsz=16662.6, num_updates=19500, lr=0.000452911, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=19368 epoch 012: 952 / 1689 loss=4.115, nll_loss=2.492, ppl=5.62, wps=465406, ups=1.06, wpb=437497, bsz=16662.6, num_updates=19500, lr=0.000452911, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=19368 epoch 012: 952 / 1689 loss=4.115, nll_loss=2.492, ppl=5.62, wps=465406, ups=1.06, wpb=437497, bsz=16662.6, num_updates=19500, lr=0.000452911, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=19368 epoch 012: 952 / 1689 loss=4.115, nll_loss=2.492, ppl=5.62, wps=465406, ups=1.06, wpb=437497, bsz=16662.6, num_updates=19500, lr=0.000452911, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=19368 epoch 012: 952 / 1689 loss=4.115, nll_loss=2.492, ppl=5.62, wps=465406, ups=1.06, wpb=437497, bsz=16662.6, num_updates=19500, lr=0.000452911, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=19368 epoch 012: 1052 / 1689 loss=4.119, nll_loss=2.496, ppl=5.64, wps=463852, ups=1.07, wpb=433131, bsz=16088.7, num_updates=19600, lr=0.000451754, gnorm=0.208, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=19462 epoch 012: 1052 / 1689 loss=4.119, nll_loss=2.496, ppl=5.64, wps=463852, ups=1.07, wpb=433131, bsz=16088.7, num_updates=19600, lr=0.000451754, gnorm=0.208, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=19462 epoch 012: 1052 / 1689 loss=4.119, nll_loss=2.496, ppl=5.64, wps=463852, ups=1.07, wpb=433131, bsz=16088.7, num_updates=19600, lr=0.000451754, gnorm=0.208, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=19462 epoch 012: 1052 / 1689 loss=4.119, nll_loss=2.496, ppl=5.64, wps=463852, ups=1.07, wpb=433131, bsz=16088.7, num_updates=19600, lr=0.000451754, gnorm=0.208, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=19462 epoch 012: 1052 / 1689 loss=4.119, nll_loss=2.496, ppl=5.64, wps=463852, ups=1.07, wpb=433131, bsz=16088.7, num_updates=19600, lr=0.000451754, gnorm=0.208, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=19462 epoch 012: 1052 / 1689 loss=4.119, nll_loss=2.496, ppl=5.64, wps=463852, ups=1.07, wpb=433131, bsz=16088.7, num_updates=19600, lr=0.000451754, gnorm=0.208, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=19462 epoch 012: 1052 / 1689 loss=4.119, nll_loss=2.496, ppl=5.64, wps=463852, ups=1.07, wpb=433131, bsz=16088.7, num_updates=19600, lr=0.000451754, gnorm=0.208, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=19462 epoch 012: 1052 / 1689 loss=4.119, nll_loss=2.496, ppl=5.64, wps=463852, ups=1.07, wpb=433131, bsz=16088.7, num_updates=19600, lr=0.000451754, gnorm=0.208, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=19462 epoch 012: 1052 / 1689 loss=4.119, nll_loss=2.496, ppl=5.64, wps=463852, ups=1.07, wpb=433131, bsz=16088.7, num_updates=19600, lr=0.000451754, gnorm=0.208, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=19462 epoch 012: 1052 / 1689 loss=4.119, nll_loss=2.496, ppl=5.64, wps=463852, ups=1.07, wpb=433131, bsz=16088.7, num_updates=19600, lr=0.000451754, gnorm=0.208, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=19462 epoch 012: 1052 / 1689 loss=4.119, nll_loss=2.496, ppl=5.64, wps=463852, ups=1.07, wpb=433131, bsz=16088.7, num_updates=19600, lr=0.000451754, gnorm=0.208, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=19462 epoch 012: 1052 / 1689 loss=4.119, nll_loss=2.496, ppl=5.64, wps=463852, ups=1.07, wpb=433131, bsz=16088.7, num_updates=19600, lr=0.000451754, gnorm=0.208, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=19462 epoch 012: 1152 / 1689 loss=4.126, nll_loss=2.503, ppl=5.67, wps=460244, ups=1.06, wpb=433596, bsz=16556.3, num_updates=19700, lr=0.000450606, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=19556 epoch 012: 1152 / 1689 loss=4.126, nll_loss=2.503, ppl=5.67, wps=460244, ups=1.06, wpb=433596, bsz=16556.3, num_updates=19700, lr=0.000450606, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=19556 epoch 012: 1152 / 1689 loss=4.126, nll_loss=2.503, ppl=5.67, wps=460244, ups=1.06, wpb=433596, bsz=16556.3, num_updates=19700, lr=0.000450606, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=19556 epoch 012: 1152 / 1689 loss=4.126, nll_loss=2.503, ppl=5.67, wps=460244, ups=1.06, wpb=433596, bsz=16556.3, num_updates=19700, lr=0.000450606, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=19556 epoch 012: 1152 / 1689 loss=4.126, nll_loss=2.503, ppl=5.67, wps=460244, ups=1.06, wpb=433596, bsz=16556.3, num_updates=19700, lr=0.000450606, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=19556 epoch 012: 1152 / 1689 loss=4.126, nll_loss=2.503, ppl=5.67, wps=460244, ups=1.06, wpb=433596, bsz=16556.3, num_updates=19700, lr=0.000450606, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=19556 epoch 012: 1152 / 1689 loss=4.126, nll_loss=2.503, ppl=5.67, wps=460244, ups=1.06, wpb=433596, bsz=16556.3, num_updates=19700, lr=0.000450606, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=19556 epoch 012: 1152 / 1689 loss=4.126, nll_loss=2.503, ppl=5.67, wps=460244, ups=1.06, wpb=433596, bsz=16556.3, num_updates=19700, lr=0.000450606, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=19556 epoch 012: 1152 / 1689 loss=4.126, nll_loss=2.503, ppl=5.67, wps=460244, ups=1.06, wpb=433596, bsz=16556.3, num_updates=19700, lr=0.000450606, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=19556 epoch 012: 1152 / 1689 loss=4.126, nll_loss=2.503, ppl=5.67, wps=460244, ups=1.06, wpb=433596, bsz=16556.3, num_updates=19700, lr=0.000450606, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=19556 epoch 012: 1152 / 1689 loss=4.126, nll_loss=2.503, ppl=5.67, wps=460244, ups=1.06, wpb=433596, bsz=16556.3, num_updates=19700, lr=0.000450606, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=19556 epoch 012: 1152 / 1689 loss=4.126, nll_loss=2.503, ppl=5.67, wps=460244, ups=1.06, wpb=433596, bsz=16556.3, num_updates=19700, lr=0.000450606, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=18.5, wall=19556 epoch 012: 1253 / 1689 loss=4.133, nll_loss=2.512, ppl=5.7, wps=453093, ups=1.04, wpb=434494, bsz=16664.5, num_updates=19800, lr=0.000449467, gnorm=0.215, clip=0, loss_scale=1, train_wall=95, gb_free=19.6, wall=19652 epoch 012: 1253 / 1689 loss=4.133, nll_loss=2.512, ppl=5.7, wps=453093, ups=1.04, wpb=434494, bsz=16664.5, num_updates=19800, lr=0.000449467, gnorm=0.215, clip=0, loss_scale=1, train_wall=95, gb_free=19.6, wall=19652 epoch 012: 1253 / 1689 loss=4.133, nll_loss=2.512, ppl=5.7, wps=453093, ups=1.04, wpb=434494, bsz=16664.5, num_updates=19800, lr=0.000449467, gnorm=0.215, clip=0, loss_scale=1, train_wall=95, gb_free=19.6, wall=19652 epoch 012: 1253 / 1689 loss=4.133, nll_loss=2.512, ppl=5.7, wps=453093, ups=1.04, wpb=434494, bsz=16664.5, num_updates=19800, lr=0.000449467, gnorm=0.215, clip=0, loss_scale=1, train_wall=95, gb_free=19.6, wall=19652 epoch 012: 1253 / 1689 loss=4.133, nll_loss=2.512, ppl=5.7, wps=453093, ups=1.04, wpb=434494, bsz=16664.5, num_updates=19800, lr=0.000449467, gnorm=0.215, clip=0, loss_scale=1, train_wall=95, gb_free=19.6, wall=19652 epoch 012: 1253 / 1689 loss=4.133, nll_loss=2.512, ppl=5.7, wps=453093, ups=1.04, wpb=434494, bsz=16664.5, num_updates=19800, lr=0.000449467, gnorm=0.215, clip=0, loss_scale=1, train_wall=95, gb_free=19.6, wall=19652 epoch 012: 1253 / 1689 loss=4.133, nll_loss=2.512, ppl=5.7, wps=453093, ups=1.04, wpb=434494, bsz=16664.5, num_updates=19800, lr=0.000449467, gnorm=0.215, clip=0, loss_scale=1, train_wall=95, gb_free=19.6, wall=19652 epoch 012: 1253 / 1689 loss=4.133, nll_loss=2.512, ppl=5.7, wps=453093, ups=1.04, wpb=434494, bsz=16664.5, num_updates=19800, lr=0.000449467, gnorm=0.215, clip=0, loss_scale=1, train_wall=95, gb_free=19.6, wall=19652 epoch 012: 1253 / 1689 loss=4.133, nll_loss=2.512, ppl=5.7, wps=453093, ups=1.04, wpb=434494, bsz=16664.5, num_updates=19800, lr=0.000449467, gnorm=0.215, clip=0, loss_scale=1, train_wall=95, gb_free=19.6, wall=19652 epoch 012: 1253 / 1689 loss=4.133, nll_loss=2.512, ppl=5.7, wps=453093, ups=1.04, wpb=434494, bsz=16664.5, num_updates=19800, lr=0.000449467, gnorm=0.215, clip=0, loss_scale=1, train_wall=95, gb_free=19.6, wall=19652 epoch 012: 1253 / 1689 loss=4.133, nll_loss=2.512, ppl=5.7, wps=453093, ups=1.04, wpb=434494, bsz=16664.5, num_updates=19800, lr=0.000449467, gnorm=0.215, clip=0, loss_scale=1, train_wall=95, gb_free=19.6, wall=19652 epoch 012: 1253 / 1689 loss=4.133, nll_loss=2.512, ppl=5.7, wps=453093, ups=1.04, wpb=434494, bsz=16664.5, num_updates=19800, lr=0.000449467, gnorm=0.215, clip=0, loss_scale=1, train_wall=95, gb_free=19.6, wall=19652 epoch 012: 1353 / 1689 loss=4.133, nll_loss=2.512, ppl=5.7, wps=456542, ups=1.05, wpb=435051, bsz=16905.8, num_updates=19900, lr=0.000448336, gnorm=0.208, clip=0, loss_scale=1, train_wall=94, gb_free=20.8, wall=19747 epoch 012: 1353 / 1689 loss=4.133, nll_loss=2.512, ppl=5.7, wps=456542, ups=1.05, wpb=435051, bsz=16905.8, num_updates=19900, lr=0.000448336, gnorm=0.208, clip=0, loss_scale=1, train_wall=94, gb_free=20.8, wall=19747 epoch 012: 1353 / 1689 loss=4.133, nll_loss=2.512, ppl=5.7, wps=456542, ups=1.05, wpb=435051, bsz=16905.8, num_updates=19900, lr=0.000448336, gnorm=0.208, clip=0, loss_scale=1, train_wall=94, gb_free=20.8, wall=19747 epoch 012: 1353 / 1689 loss=4.133, nll_loss=2.512, ppl=5.7, wps=456542, ups=1.05, wpb=435051, bsz=16905.8, num_updates=19900, lr=0.000448336, gnorm=0.208, clip=0, loss_scale=1, train_wall=94, gb_free=20.8, wall=19747 epoch 012: 1353 / 1689 loss=4.133, nll_loss=2.512, ppl=5.7, wps=456542, ups=1.05, wpb=435051, bsz=16905.8, num_updates=19900, lr=0.000448336, gnorm=0.208, clip=0, loss_scale=1, train_wall=94, gb_free=20.8, wall=19747 epoch 012: 1353 / 1689 loss=4.133, nll_loss=2.512, ppl=5.7, wps=456542, ups=1.05, wpb=435051, bsz=16905.8, num_updates=19900, lr=0.000448336, gnorm=0.208, clip=0, loss_scale=1, train_wall=94, gb_free=20.8, wall=19747 epoch 012: 1353 / 1689 loss=4.133, nll_loss=2.512, ppl=5.7, wps=456542, ups=1.05, wpb=435051, bsz=16905.8, num_updates=19900, lr=0.000448336, gnorm=0.208, clip=0, loss_scale=1, train_wall=94, gb_free=20.8, wall=19747 epoch 012: 1353 / 1689 loss=4.133, nll_loss=2.512, ppl=5.7, wps=456542, ups=1.05, wpb=435051, bsz=16905.8, num_updates=19900, lr=0.000448336, gnorm=0.208, clip=0, loss_scale=1, train_wall=94, gb_free=20.8, wall=19747 epoch 012: 1353 / 1689 loss=4.133, nll_loss=2.512, ppl=5.7, wps=456542, ups=1.05, wpb=435051, bsz=16905.8, num_updates=19900, lr=0.000448336, gnorm=0.208, clip=0, loss_scale=1, train_wall=94, gb_free=20.8, wall=19747 epoch 012: 1353 / 1689 loss=4.133, nll_loss=2.512, ppl=5.7, wps=456542, ups=1.05, wpb=435051, bsz=16905.8, num_updates=19900, lr=0.000448336, gnorm=0.208, clip=0, loss_scale=1, train_wall=94, gb_free=20.8, wall=19747 epoch 012: 1353 / 1689 loss=4.133, nll_loss=2.512, ppl=5.7, wps=456542, ups=1.05, wpb=435051, bsz=16905.8, num_updates=19900, lr=0.000448336, gnorm=0.208, clip=0, loss_scale=1, train_wall=94, gb_free=20.8, wall=19747 epoch 012: 1353 / 1689 loss=4.133, nll_loss=2.512, ppl=5.7, wps=456542, ups=1.05, wpb=435051, bsz=16905.8, num_updates=19900, lr=0.000448336, gnorm=0.208, clip=0, loss_scale=1, train_wall=94, gb_free=20.8, wall=19747 epoch 012: 1454 / 1689 loss=4.136, nll_loss=2.515, ppl=5.72, wps=454395, ups=1.04, wpb=435369, bsz=16246.9, num_updates=20000, lr=0.000447214, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=94, gb_free=19, wall=19843 epoch 012: 1454 / 1689 loss=4.136, nll_loss=2.515, ppl=5.72, wps=454395, ups=1.04, wpb=435369, bsz=16246.9, num_updates=20000, lr=0.000447214, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=94, gb_free=19, wall=19843 epoch 012: 1454 / 1689 loss=4.136, nll_loss=2.515, ppl=5.72, wps=454395, ups=1.04, wpb=435369, bsz=16246.9, num_updates=20000, lr=0.000447214, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=94, gb_free=19, wall=19843 epoch 012: 1454 / 1689 loss=4.136, nll_loss=2.515, ppl=5.72, wps=454395, ups=1.04, wpb=435369, bsz=16246.9, num_updates=20000, lr=0.000447214, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=94, gb_free=19, wall=19843 epoch 012: 1454 / 1689 loss=4.136, nll_loss=2.515, ppl=5.72, wps=454395, ups=1.04, wpb=435369, bsz=16246.9, num_updates=20000, lr=0.000447214, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=94, gb_free=19, wall=19843 epoch 012: 1454 / 1689 loss=4.136, nll_loss=2.515, ppl=5.72, wps=454395, ups=1.04, wpb=435369, bsz=16246.9, num_updates=20000, lr=0.000447214, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=94, gb_free=19, wall=19843 epoch 012: 1454 / 1689 loss=4.136, nll_loss=2.515, ppl=5.72, wps=454395, ups=1.04, wpb=435369, bsz=16246.9, num_updates=20000, lr=0.000447214, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=94, gb_free=19, wall=19843 epoch 012: 1454 / 1689 loss=4.136, nll_loss=2.515, ppl=5.72, wps=454395, ups=1.04, wpb=435369, bsz=16246.9, num_updates=20000, lr=0.000447214, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=94, gb_free=19, wall=19843 epoch 012: 1454 / 1689 loss=4.136, nll_loss=2.515, ppl=5.72, wps=454395, ups=1.04, wpb=435369, bsz=16246.9, num_updates=20000, lr=0.000447214, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=94, gb_free=19, wall=19843 epoch 012: 1454 / 1689 loss=4.136, nll_loss=2.515, ppl=5.72, wps=454395, ups=1.04, wpb=435369, bsz=16246.9, num_updates=20000, lr=0.000447214, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=94, gb_free=19, wall=19843 epoch 012: 1454 / 1689 loss=4.136, nll_loss=2.515, ppl=5.72, wps=454395, ups=1.04, wpb=435369, bsz=16246.9, num_updates=20000, lr=0.000447214, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=94, gb_free=19, wall=19843 epoch 012: 1454 / 1689 loss=4.136, nll_loss=2.515, ppl=5.72, wps=454395, ups=1.04, wpb=435369, bsz=16246.9, num_updates=20000, lr=0.000447214, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=94, gb_free=19, wall=19843 begin validation on "valid" subset epoch 012 | valid on 'valid' subset | loss 4.289 | nll_loss 2.67 | ppl 6.36 | wps 0 | wpb 42662 | bsz 2032 | num_updates 20000 | best_loss 4.285 epoch 012 | valid on 'valid' subset | loss 4.289 | nll_loss 2.67 | ppl 6.36 | wps 0 | wpb 42662 | bsz 2032 | num_updates 20000 | best_loss 4.285 epoch 012 | valid on 'valid' subset | loss 4.289 | nll_loss 2.67 | ppl 6.36 | wps 0 | wpb 42662 | bsz 2032 | num_updates 20000 | best_loss 4.285 epoch 012 | valid on 'valid' subset | loss 4.289 | nll_loss 2.67 | ppl 6.36 | wps 0 | wpb 42662 | bsz 2032 | num_updates 20000 | best_loss 4.285 epoch 012 | valid on 'valid' subset | loss 4.289 | nll_loss 2.67 | ppl 6.36 | wps 0 | wpb 42662 | bsz 2032 | num_updates 20000 | best_loss 4.285 epoch 012 | valid on 'valid' subset | loss 4.289 | nll_loss 2.67 | ppl 6.36 | wps 0 | wpb 42662 | bsz 2032 | num_updates 20000 | best_loss 4.285 epoch 012 | valid on 'valid' subset | loss 4.289 | nll_loss 2.67 | ppl 6.36 | wps 0 | wpb 42662 | bsz 2032 | num_updates 20000 | best_loss 4.285 epoch 012 | valid on 'valid' subset | loss 4.289 | nll_loss 2.67 | ppl 6.36 | wps 0 | wpb 42662 | bsz 2032 | num_updates 20000 | best_loss 4.285 epoch 012 | valid on 'valid' subset | loss 4.289 | nll_loss 2.67 | ppl 6.36 | wps 0 | wpb 42662 | bsz 2032 | num_updates 20000 | best_loss 4.285 epoch 012 | valid on 'valid' subset | loss 4.289 | nll_loss 2.67 | ppl 6.36 | wps 0 | wpb 42662 | bsz 2032 | num_updates 20000 | best_loss 4.285 epoch 012 | valid on 'valid' subset | loss 4.289 | nll_loss 2.67 | ppl 6.36 | wps 0 | wpb 42662 | bsz 2032 | num_updates 20000 | best_loss 4.285 epoch 012 | valid on 'valid' subset | loss 4.289 | nll_loss 2.67 | ppl 6.36 | wps 0 | wpb 42662 | bsz 2032 | num_updates 20000 | best_loss 4.285 epoch 012: 1554 / 1689 loss=4.138, nll_loss=2.518, ppl=5.73, wps=306991, ups=0.71, wpb=434228, bsz=16533.5, num_updates=20100, lr=0.0004461, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=124, gb_free=18.9, wall=19984 epoch 012: 1554 / 1689 loss=4.138, nll_loss=2.518, ppl=5.73, wps=306991, ups=0.71, wpb=434228, bsz=16533.5, num_updates=20100, lr=0.0004461, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=124, gb_free=18.9, wall=19984 epoch 012: 1554 / 1689 loss=4.138, nll_loss=2.518, ppl=5.73, wps=306991, ups=0.71, wpb=434228, bsz=16533.5, num_updates=20100, lr=0.0004461, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=124, gb_free=18.9, wall=19984 epoch 012: 1554 / 1689 loss=4.138, nll_loss=2.518, ppl=5.73, wps=306991, ups=0.71, wpb=434228, bsz=16533.5, num_updates=20100, lr=0.0004461, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=124, gb_free=18.9, wall=19984 epoch 012: 1554 / 1689 loss=4.138, nll_loss=2.518, ppl=5.73, wps=306991, ups=0.71, wpb=434228, bsz=16533.5, num_updates=20100, lr=0.0004461, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=124, gb_free=18.9, wall=19984 epoch 012: 1554 / 1689 loss=4.138, nll_loss=2.518, ppl=5.73, wps=306991, ups=0.71, wpb=434228, bsz=16533.5, num_updates=20100, lr=0.0004461, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=124, gb_free=18.9, wall=19984 epoch 012: 1554 / 1689 loss=4.138, nll_loss=2.518, ppl=5.73, wps=306991, ups=0.71, wpb=434228, bsz=16533.5, num_updates=20100, lr=0.0004461, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=124, gb_free=18.9, wall=19984 epoch 012: 1554 / 1689 loss=4.138, nll_loss=2.518, ppl=5.73, wps=306991, ups=0.71, wpb=434228, bsz=16533.5, num_updates=20100, lr=0.0004461, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=124, gb_free=18.9, wall=19984 epoch 012: 1554 / 1689 loss=4.138, nll_loss=2.518, ppl=5.73, wps=306991, ups=0.71, wpb=434228, bsz=16533.5, num_updates=20100, lr=0.0004461, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=124, gb_free=18.9, wall=19984 epoch 012: 1554 / 1689 loss=4.138, nll_loss=2.518, ppl=5.73, wps=306991, ups=0.71, wpb=434228, bsz=16533.5, num_updates=20100, lr=0.0004461, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=124, gb_free=18.9, wall=19984 epoch 012: 1554 / 1689 loss=4.138, nll_loss=2.518, ppl=5.73, wps=306991, ups=0.71, wpb=434228, bsz=16533.5, num_updates=20100, lr=0.0004461, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=124, gb_free=18.9, wall=19984 epoch 012: 1554 / 1689 loss=4.138, nll_loss=2.518, ppl=5.73, wps=306991, ups=0.71, wpb=434228, bsz=16533.5, num_updates=20100, lr=0.0004461, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=124, gb_free=18.9, wall=19984 epoch 012: 1654 / 1689 loss=4.134, nll_loss=2.513, ppl=5.71, wps=459727, ups=1.06, wpb=432759, bsz=16497.9, num_updates=20200, lr=0.000444994, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=20078 epoch 012: 1654 / 1689 loss=4.134, nll_loss=2.513, ppl=5.71, wps=459727, ups=1.06, wpb=432759, bsz=16497.9, num_updates=20200, lr=0.000444994, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=20078 epoch 012: 1654 / 1689 loss=4.134, nll_loss=2.513, ppl=5.71, wps=459727, ups=1.06, wpb=432759, bsz=16497.9, num_updates=20200, lr=0.000444994, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=20078 epoch 012: 1654 / 1689 loss=4.134, nll_loss=2.513, ppl=5.71, wps=459727, ups=1.06, wpb=432759, bsz=16497.9, num_updates=20200, lr=0.000444994, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=20078 epoch 012: 1654 / 1689 loss=4.134, nll_loss=2.513, ppl=5.71, wps=459727, ups=1.06, wpb=432759, bsz=16497.9, num_updates=20200, lr=0.000444994, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=20078 epoch 012: 1654 / 1689 loss=4.134, nll_loss=2.513, ppl=5.71, wps=459727, ups=1.06, wpb=432759, bsz=16497.9, num_updates=20200, lr=0.000444994, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=20078 epoch 012: 1654 / 1689 loss=4.134, nll_loss=2.513, ppl=5.71, wps=459727, ups=1.06, wpb=432759, bsz=16497.9, num_updates=20200, lr=0.000444994, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=20078 epoch 012: 1654 / 1689 loss=4.134, nll_loss=2.513, ppl=5.71, wps=459727, ups=1.06, wpb=432759, bsz=16497.9, num_updates=20200, lr=0.000444994, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=20078 epoch 012: 1654 / 1689 loss=4.134, nll_loss=2.513, ppl=5.71, wps=459727, ups=1.06, wpb=432759, bsz=16497.9, num_updates=20200, lr=0.000444994, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=20078 epoch 012: 1654 / 1689 loss=4.134, nll_loss=2.513, ppl=5.71, wps=459727, ups=1.06, wpb=432759, bsz=16497.9, num_updates=20200, lr=0.000444994, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=20078 epoch 012: 1654 / 1689 loss=4.134, nll_loss=2.513, ppl=5.71, wps=459727, ups=1.06, wpb=432759, bsz=16497.9, num_updates=20200, lr=0.000444994, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=20078 epoch 012: 1654 / 1689 loss=4.134, nll_loss=2.513, ppl=5.71, wps=459727, ups=1.06, wpb=432759, bsz=16497.9, num_updates=20200, lr=0.000444994, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=20078 end of epoch 12 (average epoch stats below) epoch 012 | loss 4.121 | nll_loss 2.498 | ppl 5.65 | wps 434552 | ups 1 | wpb 433536 | bsz 16507.9 | num_updates 20235 | lr 0.000444609 | gnorm 0.219 | clip 0 | loss_scale 0.5 | train_wall 1598 | gb_free 20.7 | wall 20111 epoch 012 | loss 4.121 | nll_loss 2.498 | ppl 5.65 | wps 434552 | ups 1 | wpb 433536 | bsz 16507.9 | num_updates 20235 | lr 0.000444609 | gnorm 0.219 | clip 0 | loss_scale 0.5 | train_wall 1598 | gb_free 20.7 | wall 20111 epoch 012 | loss 4.121 | nll_loss 2.498 | ppl 5.65 | wps 434552 | ups 1 | wpb 433536 | bsz 16507.9 | num_updates 20235 | lr 0.000444609 | gnorm 0.219 | clip 0 | loss_scale 0.5 | train_wall 1598 | gb_free 20.7 | wall 20111 epoch 012 | loss 4.121 | nll_loss 2.498 | ppl 5.65 | wps 434552 | ups 1 | wpb 433536 | bsz 16507.9 | num_updates 20235 | lr 0.000444609 | gnorm 0.219 | clip 0 | loss_scale 0.5 | train_wall 1598 | gb_free 20.7 | wall 20111 epoch 012 | loss 4.121 | nll_loss 2.498 | ppl 5.65 | wps 434552 | ups 1 | wpb 433536 | bsz 16507.9 | num_updates 20235 | lr 0.000444609 | gnorm 0.219 | clip 0 | loss_scale 0.5 | train_wall 1598 | gb_free 20.7 | wall 20111 epoch 012 | loss 4.121 | nll_loss 2.498 | ppl 5.65 | wps 434552 | ups 1 | wpb 433536 | bsz 16507.9 | num_updates 20235 | lr 0.000444609 | gnorm 0.219 | clip 0 | loss_scale 0.5 | train_wall 1598 | gb_free 20.7 | wall 20111 epoch 012 | loss 4.121 | nll_loss 2.498 | ppl 5.65 | wps 434552 | ups 1 | wpb 433536 | bsz 16507.9 | num_updates 20235 | lr 0.000444609 | gnorm 0.219 | clip 0 | loss_scale 0.5 | train_wall 1598 | gb_free 20.7 | wall 20111 epoch 012 | loss 4.121 | nll_loss 2.498 | ppl 5.65 | wps 434552 | ups 1 | wpb 433536 | bsz 16507.9 | num_updates 20235 | lr 0.000444609 | gnorm 0.219 | clip 0 | loss_scale 0.5 | train_wall 1598 | gb_free 20.7 | wall 20111 epoch 012 | loss 4.121 | nll_loss 2.498 | ppl 5.65 | wps 434552 | ups 1 | wpb 433536 | bsz 16507.9 | num_updates 20235 | lr 0.000444609 | gnorm 0.219 | clip 0 | loss_scale 0.5 | train_wall 1598 | gb_free 20.7 | wall 20111 epoch 012 | loss 4.121 | nll_loss 2.498 | ppl 5.65 | wps 434552 | ups 1 | wpb 433536 | bsz 16507.9 | num_updates 20235 | lr 0.000444609 | gnorm 0.219 | clip 0 | loss_scale 0.5 | train_wall 1598 | gb_free 20.7 | wall 20111 epoch 012 | loss 4.121 | nll_loss 2.498 | ppl 5.65 | wps 434552 | ups 1 | wpb 433536 | bsz 16507.9 | num_updates 20235 | lr 0.000444609 | gnorm 0.219 | clip 0 | loss_scale 0.5 | train_wall 1598 | gb_free 20.7 | wall 20111 epoch 012 | loss 4.121 | nll_loss 2.498 | ppl 5.65 | wps 434552 | ups 1 | wpb 433536 | bsz 16507.9 | num_updates 20235 | lr 0.000444609 | gnorm 0.219 | clip 0 | loss_scale 0.5 | train_wall 1598 | gb_free 20.7 | wall 20111 Start iterating over samples epoch 013: 65 / 1689 loss=4.093, nll_loss=2.465, ppl=5.52, wps=456683, ups=1.06, wpb=429637, bsz=16192.2, num_updates=20300, lr=0.000443897, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=20172 epoch 013: 65 / 1689 loss=4.093, nll_loss=2.465, ppl=5.52, wps=456683, ups=1.06, wpb=429637, bsz=16192.2, num_updates=20300, lr=0.000443897, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=20172 epoch 013: 65 / 1689 loss=4.093, nll_loss=2.465, ppl=5.52, wps=456683, ups=1.06, wpb=429637, bsz=16192.2, num_updates=20300, lr=0.000443897, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=20172 epoch 013: 65 / 1689 loss=4.093, nll_loss=2.465, ppl=5.52, wps=456683, ups=1.06, wpb=429637, bsz=16192.2, num_updates=20300, lr=0.000443897, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=20172 epoch 013: 65 / 1689 loss=4.093, nll_loss=2.465, ppl=5.52, wps=456683, ups=1.06, wpb=429637, bsz=16192.2, num_updates=20300, lr=0.000443897, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=20172 epoch 013: 65 / 1689 loss=4.093, nll_loss=2.465, ppl=5.52, wps=456683, ups=1.06, wpb=429637, bsz=16192.2, num_updates=20300, lr=0.000443897, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=20172 epoch 013: 65 / 1689 loss=4.093, nll_loss=2.465, ppl=5.52, wps=456683, ups=1.06, wpb=429637, bsz=16192.2, num_updates=20300, lr=0.000443897, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=20172 epoch 013: 65 / 1689 loss=4.093, nll_loss=2.465, ppl=5.52, wps=456683, ups=1.06, wpb=429637, bsz=16192.2, num_updates=20300, lr=0.000443897, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=20172 epoch 013: 65 / 1689 loss=4.093, nll_loss=2.465, ppl=5.52, wps=456683, ups=1.06, wpb=429637, bsz=16192.2, num_updates=20300, lr=0.000443897, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=20172 epoch 013: 65 / 1689 loss=4.093, nll_loss=2.465, ppl=5.52, wps=456683, ups=1.06, wpb=429637, bsz=16192.2, num_updates=20300, lr=0.000443897, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=20172 epoch 013: 65 / 1689 loss=4.093, nll_loss=2.465, ppl=5.52, wps=456683, ups=1.06, wpb=429637, bsz=16192.2, num_updates=20300, lr=0.000443897, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=20172 epoch 013: 65 / 1689 loss=4.093, nll_loss=2.465, ppl=5.52, wps=456683, ups=1.06, wpb=429637, bsz=16192.2, num_updates=20300, lr=0.000443897, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=20172 epoch 013: 65 / 1689 loss=4.093, nll_loss=2.465, ppl=5.52, wps=456683, ups=1.06, wpb=429637, bsz=16192.2, num_updates=20300, lr=0.000443897, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=20172 epoch 013: 165 / 1689 loss=4.082, nll_loss=2.452, ppl=5.47, wps=462028, ups=1.06, wpb=434286, bsz=16400.6, num_updates=20400, lr=0.000442807, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=20266 epoch 013: 165 / 1689 loss=4.082, nll_loss=2.452, ppl=5.47, wps=462028, ups=1.06, wpb=434286, bsz=16400.6, num_updates=20400, lr=0.000442807, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=20266 epoch 013: 165 / 1689 loss=4.082, nll_loss=2.452, ppl=5.47, wps=462028, ups=1.06, wpb=434286, bsz=16400.6, num_updates=20400, lr=0.000442807, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=20266 epoch 013: 165 / 1689 loss=4.082, nll_loss=2.452, ppl=5.47, wps=462028, ups=1.06, wpb=434286, bsz=16400.6, num_updates=20400, lr=0.000442807, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=20266 epoch 013: 165 / 1689 loss=4.082, nll_loss=2.452, ppl=5.47, wps=462028, ups=1.06, wpb=434286, bsz=16400.6, num_updates=20400, lr=0.000442807, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=20266 epoch 013: 165 / 1689 loss=4.082, nll_loss=2.452, ppl=5.47, wps=462028, ups=1.06, wpb=434286, bsz=16400.6, num_updates=20400, lr=0.000442807, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=20266 epoch 013: 165 / 1689 loss=4.082, nll_loss=2.452, ppl=5.47, wps=462028, ups=1.06, wpb=434286, bsz=16400.6, num_updates=20400, lr=0.000442807, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=20266 epoch 013: 165 / 1689 loss=4.082, nll_loss=2.452, ppl=5.47, wps=462028, ups=1.06, wpb=434286, bsz=16400.6, num_updates=20400, lr=0.000442807, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=20266 epoch 013: 165 / 1689 loss=4.082, nll_loss=2.452, ppl=5.47, wps=462028, ups=1.06, wpb=434286, bsz=16400.6, num_updates=20400, lr=0.000442807, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=20266 epoch 013: 165 / 1689 loss=4.082, nll_loss=2.452, ppl=5.47, wps=462028, ups=1.06, wpb=434286, bsz=16400.6, num_updates=20400, lr=0.000442807, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=20266 epoch 013: 165 / 1689 loss=4.082, nll_loss=2.452, ppl=5.47, wps=462028, ups=1.06, wpb=434286, bsz=16400.6, num_updates=20400, lr=0.000442807, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=20266 epoch 013: 165 / 1689 loss=4.082, nll_loss=2.452, ppl=5.47, wps=462028, ups=1.06, wpb=434286, bsz=16400.6, num_updates=20400, lr=0.000442807, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=20266 epoch 013: 165 / 1689 loss=4.082, nll_loss=2.452, ppl=5.47, wps=462028, ups=1.06, wpb=434286, bsz=16400.6, num_updates=20400, lr=0.000442807, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=20266 epoch 013: 265 / 1689 loss=4.093, nll_loss=2.466, ppl=5.52, wps=461995, ups=1.06, wpb=435020, bsz=16379.7, num_updates=20500, lr=0.000441726, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=20361 epoch 013: 265 / 1689 loss=4.093, nll_loss=2.466, ppl=5.52, wps=461995, ups=1.06, wpb=435020, bsz=16379.7, num_updates=20500, lr=0.000441726, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=20361 epoch 013: 265 / 1689 loss=4.093, nll_loss=2.466, ppl=5.52, wps=461995, ups=1.06, wpb=435020, bsz=16379.7, num_updates=20500, lr=0.000441726, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=20361 epoch 013: 265 / 1689 loss=4.093, nll_loss=2.466, ppl=5.52, wps=461995, ups=1.06, wpb=435020, bsz=16379.7, num_updates=20500, lr=0.000441726, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=20361 epoch 013: 265 / 1689 loss=4.093, nll_loss=2.466, ppl=5.52, wps=461995, ups=1.06, wpb=435020, bsz=16379.7, num_updates=20500, lr=0.000441726, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=20361 epoch 013: 265 / 1689 loss=4.093, nll_loss=2.466, ppl=5.52, wps=461995, ups=1.06, wpb=435020, bsz=16379.7, num_updates=20500, lr=0.000441726, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=20361 epoch 013: 265 / 1689 loss=4.093, nll_loss=2.466, ppl=5.52, wps=461995, ups=1.06, wpb=435020, bsz=16379.7, num_updates=20500, lr=0.000441726, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=20361 epoch 013: 265 / 1689 loss=4.093, nll_loss=2.466, ppl=5.52, wps=461995, ups=1.06, wpb=435020, bsz=16379.7, num_updates=20500, lr=0.000441726, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=20361 epoch 013: 265 / 1689 loss=4.093, nll_loss=2.466, ppl=5.52, wps=461995, ups=1.06, wpb=435020, bsz=16379.7, num_updates=20500, lr=0.000441726, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=20361 epoch 013: 265 / 1689 loss=4.093, nll_loss=2.466, ppl=5.52, wps=461995, ups=1.06, wpb=435020, bsz=16379.7, num_updates=20500, lr=0.000441726, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=20361 epoch 013: 265 / 1689 loss=4.093, nll_loss=2.466, ppl=5.52, wps=461995, ups=1.06, wpb=435020, bsz=16379.7, num_updates=20500, lr=0.000441726, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=20361 epoch 013: 265 / 1689 loss=4.093, nll_loss=2.466, ppl=5.52, wps=461995, ups=1.06, wpb=435020, bsz=16379.7, num_updates=20500, lr=0.000441726, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=20361 epoch 013: 265 / 1689 loss=4.093, nll_loss=2.466, ppl=5.52, wps=461995, ups=1.06, wpb=435020, bsz=16379.7, num_updates=20500, lr=0.000441726, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=20361 epoch 013: 365 / 1689 loss=4.084, nll_loss=2.456, ppl=5.49, wps=455806, ups=1.05, wpb=433469, bsz=16736.5, num_updates=20600, lr=0.000440653, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=20456 epoch 013: 365 / 1689 loss=4.084, nll_loss=2.456, ppl=5.49, wps=455806, ups=1.05, wpb=433469, bsz=16736.5, num_updates=20600, lr=0.000440653, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=20456 epoch 013: 365 / 1689 loss=4.084, nll_loss=2.456, ppl=5.49, wps=455806, ups=1.05, wpb=433469, bsz=16736.5, num_updates=20600, lr=0.000440653, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=20456 epoch 013: 365 / 1689 loss=4.084, nll_loss=2.456, ppl=5.49, wps=455806, ups=1.05, wpb=433469, bsz=16736.5, num_updates=20600, lr=0.000440653, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=20456 epoch 013: 365 / 1689 loss=4.084, nll_loss=2.456, ppl=5.49, wps=455806, ups=1.05, wpb=433469, bsz=16736.5, num_updates=20600, lr=0.000440653, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=20456 epoch 013: 365 / 1689 loss=4.084, nll_loss=2.456, ppl=5.49, wps=455806, ups=1.05, wpb=433469, bsz=16736.5, num_updates=20600, lr=0.000440653, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=20456 epoch 013: 365 / 1689 loss=4.084, nll_loss=2.456, ppl=5.49, wps=455806, ups=1.05, wpb=433469, bsz=16736.5, num_updates=20600, lr=0.000440653, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=20456 epoch 013: 365 / 1689 loss=4.084, nll_loss=2.456, ppl=5.49, wps=455806, ups=1.05, wpb=433469, bsz=16736.5, num_updates=20600, lr=0.000440653, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=20456 epoch 013: 365 / 1689 loss=4.084, nll_loss=2.456, ppl=5.49, wps=455806, ups=1.05, wpb=433469, bsz=16736.5, num_updates=20600, lr=0.000440653, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=20456 epoch 013: 365 / 1689 loss=4.084, nll_loss=2.456, ppl=5.49, wps=455806, ups=1.05, wpb=433469, bsz=16736.5, num_updates=20600, lr=0.000440653, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=20456 epoch 013: 365 / 1689 loss=4.084, nll_loss=2.456, ppl=5.49, wps=455806, ups=1.05, wpb=433469, bsz=16736.5, num_updates=20600, lr=0.000440653, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=20456 epoch 013: 365 / 1689 loss=4.084, nll_loss=2.456, ppl=5.49, wps=455806, ups=1.05, wpb=433469, bsz=16736.5, num_updates=20600, lr=0.000440653, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=20456 epoch 013: 365 / 1689 loss=4.084, nll_loss=2.456, ppl=5.49, wps=455806, ups=1.05, wpb=433469, bsz=16736.5, num_updates=20600, lr=0.000440653, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.7, wall=20456 epoch 013: 465 / 1689 loss=4.088, nll_loss=2.46, ppl=5.5, wps=455883, ups=1.05, wpb=432214, bsz=16649, num_updates=20700, lr=0.000439587, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=20550 epoch 013: 465 / 1689 loss=4.088, nll_loss=2.46, ppl=5.5, wps=455883, ups=1.05, wpb=432214, bsz=16649, num_updates=20700, lr=0.000439587, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=20550 epoch 013: 465 / 1689 loss=4.088, nll_loss=2.46, ppl=5.5, wps=455883, ups=1.05, wpb=432214, bsz=16649, num_updates=20700, lr=0.000439587, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=20550 epoch 013: 465 / 1689 loss=4.088, nll_loss=2.46, ppl=5.5, wps=455883, ups=1.05, wpb=432214, bsz=16649, num_updates=20700, lr=0.000439587, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=20550 epoch 013: 465 / 1689 loss=4.088, nll_loss=2.46, ppl=5.5, wps=455883, ups=1.05, wpb=432214, bsz=16649, num_updates=20700, lr=0.000439587, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=20550 epoch 013: 465 / 1689 loss=4.088, nll_loss=2.46, ppl=5.5, wps=455883, ups=1.05, wpb=432214, bsz=16649, num_updates=20700, lr=0.000439587, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=20550 epoch 013: 465 / 1689 loss=4.088, nll_loss=2.46, ppl=5.5, wps=455883, ups=1.05, wpb=432214, bsz=16649, num_updates=20700, lr=0.000439587, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=20550 epoch 013: 465 / 1689 loss=4.088, nll_loss=2.46, ppl=5.5, wps=455883, ups=1.05, wpb=432214, bsz=16649, num_updates=20700, lr=0.000439587, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=20550 epoch 013: 465 / 1689 loss=4.088, nll_loss=2.46, ppl=5.5, wps=455883, ups=1.05, wpb=432214, bsz=16649, num_updates=20700, lr=0.000439587, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=20550 epoch 013: 465 / 1689 loss=4.088, nll_loss=2.46, ppl=5.5, wps=455883, ups=1.05, wpb=432214, bsz=16649, num_updates=20700, lr=0.000439587, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=20550 epoch 013: 465 / 1689 loss=4.088, nll_loss=2.46, ppl=5.5, wps=455883, ups=1.05, wpb=432214, bsz=16649, num_updates=20700, lr=0.000439587, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=20550 epoch 013: 465 / 1689 loss=4.088, nll_loss=2.46, ppl=5.5, wps=455883, ups=1.05, wpb=432214, bsz=16649, num_updates=20700, lr=0.000439587, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=20550 epoch 013: 465 / 1689 loss=4.088, nll_loss=2.46, ppl=5.5, wps=455883, ups=1.05, wpb=432214, bsz=16649, num_updates=20700, lr=0.000439587, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=20.2, wall=20550 epoch 013: 565 / 1689 loss=4.104, nll_loss=2.478, ppl=5.57, wps=457354, ups=1.05, wpb=433587, bsz=16124.8, num_updates=20800, lr=0.000438529, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=20645 epoch 013: 565 / 1689 loss=4.104, nll_loss=2.478, ppl=5.57, wps=457354, ups=1.05, wpb=433587, bsz=16124.8, num_updates=20800, lr=0.000438529, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=20645 epoch 013: 565 / 1689 loss=4.104, nll_loss=2.478, ppl=5.57, wps=457354, ups=1.05, wpb=433587, bsz=16124.8, num_updates=20800, lr=0.000438529, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=20645 epoch 013: 565 / 1689 loss=4.104, nll_loss=2.478, ppl=5.57, wps=457354, ups=1.05, wpb=433587, bsz=16124.8, num_updates=20800, lr=0.000438529, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=20645 epoch 013: 565 / 1689 loss=4.104, nll_loss=2.478, ppl=5.57, wps=457354, ups=1.05, wpb=433587, bsz=16124.8, num_updates=20800, lr=0.000438529, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=20645 epoch 013: 565 / 1689 loss=4.104, nll_loss=2.478, ppl=5.57, wps=457354, ups=1.05, wpb=433587, bsz=16124.8, num_updates=20800, lr=0.000438529, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=20645 epoch 013: 565 / 1689 loss=4.104, nll_loss=2.478, ppl=5.57, wps=457354, ups=1.05, wpb=433587, bsz=16124.8, num_updates=20800, lr=0.000438529, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=20645 epoch 013: 565 / 1689 loss=4.104, nll_loss=2.478, ppl=5.57, wps=457354, ups=1.05, wpb=433587, bsz=16124.8, num_updates=20800, lr=0.000438529, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=20645 epoch 013: 565 / 1689 loss=4.104, nll_loss=2.478, ppl=5.57, wps=457354, ups=1.05, wpb=433587, bsz=16124.8, num_updates=20800, lr=0.000438529, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=20645 epoch 013: 565 / 1689 loss=4.104, nll_loss=2.478, ppl=5.57, wps=457354, ups=1.05, wpb=433587, bsz=16124.8, num_updates=20800, lr=0.000438529, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=20645 epoch 013: 565 / 1689 loss=4.104, nll_loss=2.478, ppl=5.57, wps=457354, ups=1.05, wpb=433587, bsz=16124.8, num_updates=20800, lr=0.000438529, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=20645 epoch 013: 565 / 1689 loss=4.104, nll_loss=2.478, ppl=5.57, wps=457354, ups=1.05, wpb=433587, bsz=16124.8, num_updates=20800, lr=0.000438529, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=20645 epoch 013: 565 / 1689 loss=4.104, nll_loss=2.478, ppl=5.57, wps=457354, ups=1.05, wpb=433587, bsz=16124.8, num_updates=20800, lr=0.000438529, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=20645 epoch 013: 665 / 1689 loss=4.108, nll_loss=2.484, ppl=5.59, wps=462598, ups=1.06, wpb=435234, bsz=17050.8, num_updates=20900, lr=0.000437479, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=20739 epoch 013: 665 / 1689 loss=4.108, nll_loss=2.484, ppl=5.59, wps=462598, ups=1.06, wpb=435234, bsz=17050.8, num_updates=20900, lr=0.000437479, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=20739 epoch 013: 665 / 1689 loss=4.108, nll_loss=2.484, ppl=5.59, wps=462598, ups=1.06, wpb=435234, bsz=17050.8, num_updates=20900, lr=0.000437479, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=20739 epoch 013: 665 / 1689 loss=4.108, nll_loss=2.484, ppl=5.59, wps=462598, ups=1.06, wpb=435234, bsz=17050.8, num_updates=20900, lr=0.000437479, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=20739 epoch 013: 665 / 1689 loss=4.108, nll_loss=2.484, ppl=5.59, wps=462598, ups=1.06, wpb=435234, bsz=17050.8, num_updates=20900, lr=0.000437479, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=20739 epoch 013: 665 / 1689 loss=4.108, nll_loss=2.484, ppl=5.59, wps=462598, ups=1.06, wpb=435234, bsz=17050.8, num_updates=20900, lr=0.000437479, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=20739 epoch 013: 665 / 1689 loss=4.108, nll_loss=2.484, ppl=5.59, wps=462598, ups=1.06, wpb=435234, bsz=17050.8, num_updates=20900, lr=0.000437479, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=20739 epoch 013: 665 / 1689 loss=4.108, nll_loss=2.484, ppl=5.59, wps=462598, ups=1.06, wpb=435234, bsz=17050.8, num_updates=20900, lr=0.000437479, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=20739 epoch 013: 665 / 1689 loss=4.108, nll_loss=2.484, ppl=5.59, wps=462598, ups=1.06, wpb=435234, bsz=17050.8, num_updates=20900, lr=0.000437479, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=20739 epoch 013: 665 / 1689 loss=4.108, nll_loss=2.484, ppl=5.59, wps=462598, ups=1.06, wpb=435234, bsz=17050.8, num_updates=20900, lr=0.000437479, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=20739 epoch 013: 665 / 1689 loss=4.108, nll_loss=2.484, ppl=5.59, wps=462598, ups=1.06, wpb=435234, bsz=17050.8, num_updates=20900, lr=0.000437479, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=20739 epoch 013: 665 / 1689 loss=4.108, nll_loss=2.484, ppl=5.59, wps=462598, ups=1.06, wpb=435234, bsz=17050.8, num_updates=20900, lr=0.000437479, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=20739 epoch 013: 665 / 1689 loss=4.108, nll_loss=2.484, ppl=5.59, wps=462598, ups=1.06, wpb=435234, bsz=17050.8, num_updates=20900, lr=0.000437479, gnorm=0.222, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=20739 epoch 013: 766 / 1689 loss=4.108, nll_loss=2.484, ppl=5.59, wps=458417, ups=1.06, wpb=432970, bsz=16500.6, num_updates=21000, lr=0.000436436, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=20834 epoch 013: 766 / 1689 loss=4.108, nll_loss=2.484, ppl=5.59, wps=458417, ups=1.06, wpb=432970, bsz=16500.6, num_updates=21000, lr=0.000436436, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=20834 epoch 013: 766 / 1689 loss=4.108, nll_loss=2.484, ppl=5.59, wps=458417, ups=1.06, wpb=432970, bsz=16500.6, num_updates=21000, lr=0.000436436, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=20834 epoch 013: 766 / 1689 loss=4.108, nll_loss=2.484, ppl=5.59, wps=458417, ups=1.06, wpb=432970, bsz=16500.6, num_updates=21000, lr=0.000436436, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=20834 epoch 013: 766 / 1689 loss=4.108, nll_loss=2.484, ppl=5.59, wps=458417, ups=1.06, wpb=432970, bsz=16500.6, num_updates=21000, lr=0.000436436, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=20834 epoch 013: 766 / 1689 loss=4.108, nll_loss=2.484, ppl=5.59, wps=458417, ups=1.06, wpb=432970, bsz=16500.6, num_updates=21000, lr=0.000436436, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=20834 epoch 013: 766 / 1689 loss=4.108, nll_loss=2.484, ppl=5.59, wps=458417, ups=1.06, wpb=432970, bsz=16500.6, num_updates=21000, lr=0.000436436, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=20834 epoch 013: 766 / 1689 loss=4.108, nll_loss=2.484, ppl=5.59, wps=458417, ups=1.06, wpb=432970, bsz=16500.6, num_updates=21000, lr=0.000436436, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=20834 epoch 013: 766 / 1689 loss=4.108, nll_loss=2.484, ppl=5.59, wps=458417, ups=1.06, wpb=432970, bsz=16500.6, num_updates=21000, lr=0.000436436, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=20834 epoch 013: 766 / 1689 loss=4.108, nll_loss=2.484, ppl=5.59, wps=458417, ups=1.06, wpb=432970, bsz=16500.6, num_updates=21000, lr=0.000436436, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=20834 epoch 013: 766 / 1689 loss=4.108, nll_loss=2.484, ppl=5.59, wps=458417, ups=1.06, wpb=432970, bsz=16500.6, num_updates=21000, lr=0.000436436, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=20834 epoch 013: 766 / 1689 loss=4.108, nll_loss=2.484, ppl=5.59, wps=458417, ups=1.06, wpb=432970, bsz=16500.6, num_updates=21000, lr=0.000436436, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=20834 epoch 013: 766 / 1689 loss=4.108, nll_loss=2.484, ppl=5.59, wps=458417, ups=1.06, wpb=432970, bsz=16500.6, num_updates=21000, lr=0.000436436, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=20834 begin validation on "valid" subset epoch 013 | valid on 'valid' subset | loss 4.273 | nll_loss 2.651 | ppl 6.28 | wps 0 | wpb 42662 | bsz 2032 | num_updates 21000 | best_loss 4.273 epoch 013 | valid on 'valid' subset | loss 4.273 | nll_loss 2.651 | ppl 6.28 | wps 0 | wpb 42662 | bsz 2032 | num_updates 21000 | best_loss 4.273 epoch 013 | valid on 'valid' subset | loss 4.273 | nll_loss 2.651 | ppl 6.28 | wps 0 | wpb 42662 | bsz 2032 | num_updates 21000 | best_loss 4.273 epoch 013 | valid on 'valid' subset | loss 4.273 | nll_loss 2.651 | ppl 6.28 | wps 0 | wpb 42662 | bsz 2032 | num_updates 21000 | best_loss 4.273 epoch 013 | valid on 'valid' subset | loss 4.273 | nll_loss 2.651 | ppl 6.28 | wps 0 | wpb 42662 | bsz 2032 | num_updates 21000 | best_loss 4.273 epoch 013 | valid on 'valid' subset | loss 4.273 | nll_loss 2.651 | ppl 6.28 | wps 0 | wpb 42662 | bsz 2032 | num_updates 21000 | best_loss 4.273 epoch 013 | valid on 'valid' subset | loss 4.273 | nll_loss 2.651 | ppl 6.28 | wps 0 | wpb 42662 | bsz 2032 | num_updates 21000 | best_loss 4.273 epoch 013 | valid on 'valid' subset | loss 4.273 | nll_loss 2.651 | ppl 6.28 | wps 0 | wpb 42662 | bsz 2032 | num_updates 21000 | best_loss 4.273 epoch 013 | valid on 'valid' subset | loss 4.273 | nll_loss 2.651 | ppl 6.28 | wps 0 | wpb 42662 | bsz 2032 | num_updates 21000 | best_loss 4.273 epoch 013 | valid on 'valid' subset | loss 4.273 | nll_loss 2.651 | ppl 6.28 | wps 0 | wpb 42662 | bsz 2032 | num_updates 21000 | best_loss 4.273 epoch 013 | valid on 'valid' subset | loss 4.273 | nll_loss 2.651 | ppl 6.28 | wps 0 | wpb 42662 | bsz 2032 | num_updates 21000 | best_loss 4.273 epoch 013 | valid on 'valid' subset | loss 4.273 | nll_loss 2.651 | ppl 6.28 | wps 0 | wpb 42662 | bsz 2032 | num_updates 21000 | best_loss 4.273 epoch 013 | valid on 'valid' subset | loss 4.273 | nll_loss 2.651 | ppl 6.28 | wps 0 | wpb 42662 | bsz 2032 | num_updates 21000 | best_loss 4.273 epoch 013: 866 / 1689 loss=4.112, nll_loss=2.488, ppl=5.61, wps=387982, ups=0.89, wpb=434963, bsz=16673.8, num_updates=21100, lr=0.0004354, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=20946 epoch 013: 866 / 1689 loss=4.112, nll_loss=2.488, ppl=5.61, wps=387982, ups=0.89, wpb=434963, bsz=16673.8, num_updates=21100, lr=0.0004354, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=20946 epoch 013: 866 / 1689 loss=4.112, nll_loss=2.488, ppl=5.61, wps=387982, ups=0.89, wpb=434963, bsz=16673.8, num_updates=21100, lr=0.0004354, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=20946 epoch 013: 866 / 1689 loss=4.112, nll_loss=2.488, ppl=5.61, wps=387982, ups=0.89, wpb=434963, bsz=16673.8, num_updates=21100, lr=0.0004354, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=20946 epoch 013: 866 / 1689 loss=4.112, nll_loss=2.488, ppl=5.61, wps=387982, ups=0.89, wpb=434963, bsz=16673.8, num_updates=21100, lr=0.0004354, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=20946 epoch 013: 866 / 1689 loss=4.112, nll_loss=2.488, ppl=5.61, wps=387982, ups=0.89, wpb=434963, bsz=16673.8, num_updates=21100, lr=0.0004354, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=20946 epoch 013: 866 / 1689 loss=4.112, nll_loss=2.488, ppl=5.61, wps=387982, ups=0.89, wpb=434963, bsz=16673.8, num_updates=21100, lr=0.0004354, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=20946 epoch 013: 866 / 1689 loss=4.112, nll_loss=2.488, ppl=5.61, wps=387982, ups=0.89, wpb=434963, bsz=16673.8, num_updates=21100, lr=0.0004354, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=20946 epoch 013: 866 / 1689 loss=4.112, nll_loss=2.488, ppl=5.61, wps=387982, ups=0.89, wpb=434963, bsz=16673.8, num_updates=21100, lr=0.0004354, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=20946 epoch 013: 866 / 1689 loss=4.112, nll_loss=2.488, ppl=5.61, wps=387982, ups=0.89, wpb=434963, bsz=16673.8, num_updates=21100, lr=0.0004354, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=20946 epoch 013: 866 / 1689 loss=4.112, nll_loss=2.488, ppl=5.61, wps=387982, ups=0.89, wpb=434963, bsz=16673.8, num_updates=21100, lr=0.0004354, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=20946 epoch 013: 866 / 1689 loss=4.112, nll_loss=2.488, ppl=5.61, wps=387982, ups=0.89, wpb=434963, bsz=16673.8, num_updates=21100, lr=0.0004354, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=20946 epoch 013: 866 / 1689 loss=4.112, nll_loss=2.488, ppl=5.61, wps=387982, ups=0.89, wpb=434963, bsz=16673.8, num_updates=21100, lr=0.0004354, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=20946 epoch 013: 966 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=459322, ups=1.06, wpb=432010, bsz=16420.3, num_updates=21200, lr=0.000434372, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.4, wall=21040 epoch 013: 966 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=459322, ups=1.06, wpb=432010, bsz=16420.3, num_updates=21200, lr=0.000434372, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.4, wall=21040 epoch 013: 966 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=459322, ups=1.06, wpb=432010, bsz=16420.3, num_updates=21200, lr=0.000434372, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.4, wall=21040 epoch 013: 966 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=459322, ups=1.06, wpb=432010, bsz=16420.3, num_updates=21200, lr=0.000434372, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.4, wall=21040 epoch 013: 966 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=459322, ups=1.06, wpb=432010, bsz=16420.3, num_updates=21200, lr=0.000434372, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.4, wall=21040 epoch 013: 966 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=459322, ups=1.06, wpb=432010, bsz=16420.3, num_updates=21200, lr=0.000434372, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.4, wall=21040 epoch 013: 966 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=459322, ups=1.06, wpb=432010, bsz=16420.3, num_updates=21200, lr=0.000434372, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.4, wall=21040 epoch 013: 966 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=459322, ups=1.06, wpb=432010, bsz=16420.3, num_updates=21200, lr=0.000434372, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.4, wall=21040 epoch 013: 966 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=459322, ups=1.06, wpb=432010, bsz=16420.3, num_updates=21200, lr=0.000434372, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.4, wall=21040 epoch 013: 966 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=459322, ups=1.06, wpb=432010, bsz=16420.3, num_updates=21200, lr=0.000434372, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.4, wall=21040 epoch 013: 966 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=459322, ups=1.06, wpb=432010, bsz=16420.3, num_updates=21200, lr=0.000434372, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.4, wall=21040 epoch 013: 966 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=459322, ups=1.06, wpb=432010, bsz=16420.3, num_updates=21200, lr=0.000434372, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.4, wall=21040 epoch 013: 966 / 1689 loss=4.108, nll_loss=2.483, ppl=5.59, wps=459322, ups=1.06, wpb=432010, bsz=16420.3, num_updates=21200, lr=0.000434372, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.4, wall=21040 epoch 013: 1066 / 1689 loss=4.114, nll_loss=2.49, ppl=5.62, wps=464714, ups=1.07, wpb=435755, bsz=16417.8, num_updates=21300, lr=0.000433351, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=21134 epoch 013: 1066 / 1689 loss=4.114, nll_loss=2.49, ppl=5.62, wps=464714, ups=1.07, wpb=435755, bsz=16417.8, num_updates=21300, lr=0.000433351, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=21134 epoch 013: 1066 / 1689 loss=4.114, nll_loss=2.49, ppl=5.62, wps=464714, ups=1.07, wpb=435755, bsz=16417.8, num_updates=21300, lr=0.000433351, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=21134 epoch 013: 1066 / 1689 loss=4.114, nll_loss=2.49, ppl=5.62, wps=464714, ups=1.07, wpb=435755, bsz=16417.8, num_updates=21300, lr=0.000433351, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=21134 epoch 013: 1066 / 1689 loss=4.114, nll_loss=2.49, ppl=5.62, wps=464714, ups=1.07, wpb=435755, bsz=16417.8, num_updates=21300, lr=0.000433351, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=21134 epoch 013: 1066 / 1689 loss=4.114, nll_loss=2.49, ppl=5.62, wps=464714, ups=1.07, wpb=435755, bsz=16417.8, num_updates=21300, lr=0.000433351, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=21134 epoch 013: 1066 / 1689 loss=4.114, nll_loss=2.49, ppl=5.62, wps=464714, ups=1.07, wpb=435755, bsz=16417.8, num_updates=21300, lr=0.000433351, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=21134 epoch 013: 1066 / 1689 loss=4.114, nll_loss=2.49, ppl=5.62, wps=464714, ups=1.07, wpb=435755, bsz=16417.8, num_updates=21300, lr=0.000433351, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=21134 epoch 013: 1066 / 1689 loss=4.114, nll_loss=2.49, ppl=5.62, wps=464714, ups=1.07, wpb=435755, bsz=16417.8, num_updates=21300, lr=0.000433351, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=21134 epoch 013: 1066 / 1689 loss=4.114, nll_loss=2.49, ppl=5.62, wps=464714, ups=1.07, wpb=435755, bsz=16417.8, num_updates=21300, lr=0.000433351, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=21134 epoch 013: 1066 / 1689 loss=4.114, nll_loss=2.49, ppl=5.62, wps=464714, ups=1.07, wpb=435755, bsz=16417.8, num_updates=21300, lr=0.000433351, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=21134 epoch 013: 1066 / 1689 loss=4.114, nll_loss=2.49, ppl=5.62, wps=464714, ups=1.07, wpb=435755, bsz=16417.8, num_updates=21300, lr=0.000433351, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=21134 epoch 013: 1066 / 1689 loss=4.114, nll_loss=2.49, ppl=5.62, wps=464714, ups=1.07, wpb=435755, bsz=16417.8, num_updates=21300, lr=0.000433351, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=21134 epoch 013: 1166 / 1689 loss=4.104, nll_loss=2.48, ppl=5.58, wps=463321, ups=1.07, wpb=433880, bsz=16464.4, num_updates=21400, lr=0.000432338, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=21227 epoch 013: 1166 / 1689 loss=4.104, nll_loss=2.48, ppl=5.58, wps=463321, ups=1.07, wpb=433880, bsz=16464.4, num_updates=21400, lr=0.000432338, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=21227 epoch 013: 1166 / 1689 loss=4.104, nll_loss=2.48, ppl=5.58, wps=463321, ups=1.07, wpb=433880, bsz=16464.4, num_updates=21400, lr=0.000432338, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=21227 epoch 013: 1166 / 1689 loss=4.104, nll_loss=2.48, ppl=5.58, wps=463321, ups=1.07, wpb=433880, bsz=16464.4, num_updates=21400, lr=0.000432338, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=21227 epoch 013: 1166 / 1689 loss=4.104, nll_loss=2.48, ppl=5.58, wps=463321, ups=1.07, wpb=433880, bsz=16464.4, num_updates=21400, lr=0.000432338, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=21227 epoch 013: 1166 / 1689 loss=4.104, nll_loss=2.48, ppl=5.58, wps=463321, ups=1.07, wpb=433880, bsz=16464.4, num_updates=21400, lr=0.000432338, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=21227 epoch 013: 1166 / 1689 loss=4.104, nll_loss=2.48, ppl=5.58, wps=463321, ups=1.07, wpb=433880, bsz=16464.4, num_updates=21400, lr=0.000432338, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=21227 epoch 013: 1166 / 1689 loss=4.104, nll_loss=2.48, ppl=5.58, wps=463321, ups=1.07, wpb=433880, bsz=16464.4, num_updates=21400, lr=0.000432338, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=21227 epoch 013: 1166 / 1689 loss=4.104, nll_loss=2.48, ppl=5.58, wps=463321, ups=1.07, wpb=433880, bsz=16464.4, num_updates=21400, lr=0.000432338, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=21227 epoch 013: 1166 / 1689 loss=4.104, nll_loss=2.48, ppl=5.58, wps=463321, ups=1.07, wpb=433880, bsz=16464.4, num_updates=21400, lr=0.000432338, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=21227 epoch 013: 1166 / 1689 loss=4.104, nll_loss=2.48, ppl=5.58, wps=463321, ups=1.07, wpb=433880, bsz=16464.4, num_updates=21400, lr=0.000432338, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=21227 epoch 013: 1166 / 1689 loss=4.104, nll_loss=2.48, ppl=5.58, wps=463321, ups=1.07, wpb=433880, bsz=16464.4, num_updates=21400, lr=0.000432338, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=21227 epoch 013: 1166 / 1689 loss=4.104, nll_loss=2.48, ppl=5.58, wps=463321, ups=1.07, wpb=433880, bsz=16464.4, num_updates=21400, lr=0.000432338, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=21227 epoch 013: 1266 / 1689 loss=4.113, nll_loss=2.489, ppl=5.62, wps=465023, ups=1.07, wpb=433569, bsz=16309.8, num_updates=21500, lr=0.000431331, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=21321 epoch 013: 1266 / 1689 loss=4.113, nll_loss=2.489, ppl=5.62, wps=465023, ups=1.07, wpb=433569, bsz=16309.8, num_updates=21500, lr=0.000431331, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=21321 epoch 013: 1266 / 1689 loss=4.113, nll_loss=2.489, ppl=5.62, wps=465023, ups=1.07, wpb=433569, bsz=16309.8, num_updates=21500, lr=0.000431331, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=21321 epoch 013: 1266 / 1689 loss=4.113, nll_loss=2.489, ppl=5.62, wps=465023, ups=1.07, wpb=433569, bsz=16309.8, num_updates=21500, lr=0.000431331, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=21321 epoch 013: 1266 / 1689 loss=4.113, nll_loss=2.489, ppl=5.62, wps=465023, ups=1.07, wpb=433569, bsz=16309.8, num_updates=21500, lr=0.000431331, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=21321 epoch 013: 1266 / 1689 loss=4.113, nll_loss=2.489, ppl=5.62, wps=465023, ups=1.07, wpb=433569, bsz=16309.8, num_updates=21500, lr=0.000431331, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=21321 epoch 013: 1266 / 1689 loss=4.113, nll_loss=2.489, ppl=5.62, wps=465023, ups=1.07, wpb=433569, bsz=16309.8, num_updates=21500, lr=0.000431331, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=21321 epoch 013: 1266 / 1689 loss=4.113, nll_loss=2.489, ppl=5.62, wps=465023, ups=1.07, wpb=433569, bsz=16309.8, num_updates=21500, lr=0.000431331, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=21321 epoch 013: 1266 / 1689 loss=4.113, nll_loss=2.489, ppl=5.62, wps=465023, ups=1.07, wpb=433569, bsz=16309.8, num_updates=21500, lr=0.000431331, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=21321 epoch 013: 1266 / 1689 loss=4.113, nll_loss=2.489, ppl=5.62, wps=465023, ups=1.07, wpb=433569, bsz=16309.8, num_updates=21500, lr=0.000431331, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=21321 epoch 013: 1266 / 1689 loss=4.113, nll_loss=2.489, ppl=5.62, wps=465023, ups=1.07, wpb=433569, bsz=16309.8, num_updates=21500, lr=0.000431331, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=21321 epoch 013: 1266 / 1689 loss=4.113, nll_loss=2.489, ppl=5.62, wps=465023, ups=1.07, wpb=433569, bsz=16309.8, num_updates=21500, lr=0.000431331, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=21321 epoch 013: 1266 / 1689 loss=4.113, nll_loss=2.489, ppl=5.62, wps=465023, ups=1.07, wpb=433569, bsz=16309.8, num_updates=21500, lr=0.000431331, gnorm=0.231, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=21321 epoch 013: 1366 / 1689 loss=4.122, nll_loss=2.499, ppl=5.65, wps=459328, ups=1.06, wpb=433269, bsz=16786.8, num_updates=21600, lr=0.000430331, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=21415 epoch 013: 1366 / 1689 loss=4.122, nll_loss=2.499, ppl=5.65, wps=459328, ups=1.06, wpb=433269, bsz=16786.8, num_updates=21600, lr=0.000430331, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=21415 epoch 013: 1366 / 1689 loss=4.122, nll_loss=2.499, ppl=5.65, wps=459328, ups=1.06, wpb=433269, bsz=16786.8, num_updates=21600, lr=0.000430331, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=21415 epoch 013: 1366 / 1689 loss=4.122, nll_loss=2.499, ppl=5.65, wps=459328, ups=1.06, wpb=433269, bsz=16786.8, num_updates=21600, lr=0.000430331, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=21415 epoch 013: 1366 / 1689 loss=4.122, nll_loss=2.499, ppl=5.65, wps=459328, ups=1.06, wpb=433269, bsz=16786.8, num_updates=21600, lr=0.000430331, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=21415 epoch 013: 1366 / 1689 loss=4.122, nll_loss=2.499, ppl=5.65, wps=459328, ups=1.06, wpb=433269, bsz=16786.8, num_updates=21600, lr=0.000430331, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=21415 epoch 013: 1366 / 1689 loss=4.122, nll_loss=2.499, ppl=5.65, wps=459328, ups=1.06, wpb=433269, bsz=16786.8, num_updates=21600, lr=0.000430331, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=21415 epoch 013: 1366 / 1689 loss=4.122, nll_loss=2.499, ppl=5.65, wps=459328, ups=1.06, wpb=433269, bsz=16786.8, num_updates=21600, lr=0.000430331, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=21415 epoch 013: 1366 / 1689 loss=4.122, nll_loss=2.499, ppl=5.65, wps=459328, ups=1.06, wpb=433269, bsz=16786.8, num_updates=21600, lr=0.000430331, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=21415 epoch 013: 1366 / 1689 loss=4.122, nll_loss=2.499, ppl=5.65, wps=459328, ups=1.06, wpb=433269, bsz=16786.8, num_updates=21600, lr=0.000430331, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=21415 epoch 013: 1366 / 1689 loss=4.122, nll_loss=2.499, ppl=5.65, wps=459328, ups=1.06, wpb=433269, bsz=16786.8, num_updates=21600, lr=0.000430331, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=21415 epoch 013: 1366 / 1689 loss=4.122, nll_loss=2.499, ppl=5.65, wps=459328, ups=1.06, wpb=433269, bsz=16786.8, num_updates=21600, lr=0.000430331, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=21415 epoch 013: 1366 / 1689 loss=4.122, nll_loss=2.499, ppl=5.65, wps=459328, ups=1.06, wpb=433269, bsz=16786.8, num_updates=21600, lr=0.000430331, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=21415 epoch 013: 1466 / 1689 loss=4.112, nll_loss=2.489, ppl=5.61, wps=458587, ups=1.06, wpb=432924, bsz=16585.8, num_updates=21700, lr=0.000429339, gnorm=0.206, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=21509 epoch 013: 1466 / 1689 loss=4.112, nll_loss=2.489, ppl=5.61, wps=458587, ups=1.06, wpb=432924, bsz=16585.8, num_updates=21700, lr=0.000429339, gnorm=0.206, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=21509 epoch 013: 1466 / 1689 loss=4.112, nll_loss=2.489, ppl=5.61, wps=458587, ups=1.06, wpb=432924, bsz=16585.8, num_updates=21700, lr=0.000429339, gnorm=0.206, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=21509 epoch 013: 1466 / 1689 loss=4.112, nll_loss=2.489, ppl=5.61, wps=458587, ups=1.06, wpb=432924, bsz=16585.8, num_updates=21700, lr=0.000429339, gnorm=0.206, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=21509 epoch 013: 1466 / 1689 loss=4.112, nll_loss=2.489, ppl=5.61, wps=458587, ups=1.06, wpb=432924, bsz=16585.8, num_updates=21700, lr=0.000429339, gnorm=0.206, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=21509 epoch 013: 1466 / 1689 loss=4.112, nll_loss=2.489, ppl=5.61, wps=458587, ups=1.06, wpb=432924, bsz=16585.8, num_updates=21700, lr=0.000429339, gnorm=0.206, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=21509 epoch 013: 1466 / 1689 loss=4.112, nll_loss=2.489, ppl=5.61, wps=458587, ups=1.06, wpb=432924, bsz=16585.8, num_updates=21700, lr=0.000429339, gnorm=0.206, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=21509 epoch 013: 1466 / 1689 loss=4.112, nll_loss=2.489, ppl=5.61, wps=458587, ups=1.06, wpb=432924, bsz=16585.8, num_updates=21700, lr=0.000429339, gnorm=0.206, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=21509 epoch 013: 1466 / 1689 loss=4.112, nll_loss=2.489, ppl=5.61, wps=458587, ups=1.06, wpb=432924, bsz=16585.8, num_updates=21700, lr=0.000429339, gnorm=0.206, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=21509 epoch 013: 1466 / 1689 loss=4.112, nll_loss=2.489, ppl=5.61, wps=458587, ups=1.06, wpb=432924, bsz=16585.8, num_updates=21700, lr=0.000429339, gnorm=0.206, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=21509 epoch 013: 1466 / 1689 loss=4.112, nll_loss=2.489, ppl=5.61, wps=458587, ups=1.06, wpb=432924, bsz=16585.8, num_updates=21700, lr=0.000429339, gnorm=0.206, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=21509 epoch 013: 1466 / 1689 loss=4.112, nll_loss=2.489, ppl=5.61, wps=458587, ups=1.06, wpb=432924, bsz=16585.8, num_updates=21700, lr=0.000429339, gnorm=0.206, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=21509 epoch 013: 1466 / 1689 loss=4.112, nll_loss=2.489, ppl=5.61, wps=458587, ups=1.06, wpb=432924, bsz=16585.8, num_updates=21700, lr=0.000429339, gnorm=0.206, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=21509 epoch 013: 1566 / 1689 loss=4.117, nll_loss=2.494, ppl=5.63, wps=459695, ups=1.06, wpb=432888, bsz=16634.3, num_updates=21800, lr=0.000428353, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=21.6, wall=21604 epoch 013: 1566 / 1689 loss=4.117, nll_loss=2.494, ppl=5.63, wps=459695, ups=1.06, wpb=432888, bsz=16634.3, num_updates=21800, lr=0.000428353, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=21.6, wall=21604 epoch 013: 1566 / 1689 loss=4.117, nll_loss=2.494, ppl=5.63, wps=459695, ups=1.06, wpb=432888, bsz=16634.3, num_updates=21800, lr=0.000428353, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=21.6, wall=21604 epoch 013: 1566 / 1689 loss=4.117, nll_loss=2.494, ppl=5.63, wps=459695, ups=1.06, wpb=432888, bsz=16634.3, num_updates=21800, lr=0.000428353, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=21.6, wall=21604 epoch 013: 1566 / 1689 loss=4.117, nll_loss=2.494, ppl=5.63, wps=459695, ups=1.06, wpb=432888, bsz=16634.3, num_updates=21800, lr=0.000428353, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=21.6, wall=21604 epoch 013: 1566 / 1689 loss=4.117, nll_loss=2.494, ppl=5.63, wps=459695, ups=1.06, wpb=432888, bsz=16634.3, num_updates=21800, lr=0.000428353, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=21.6, wall=21604 epoch 013: 1566 / 1689 loss=4.117, nll_loss=2.494, ppl=5.63, wps=459695, ups=1.06, wpb=432888, bsz=16634.3, num_updates=21800, lr=0.000428353, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=21.6, wall=21604 epoch 013: 1566 / 1689 loss=4.117, nll_loss=2.494, ppl=5.63, wps=459695, ups=1.06, wpb=432888, bsz=16634.3, num_updates=21800, lr=0.000428353, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=21.6, wall=21604 epoch 013: 1566 / 1689 loss=4.117, nll_loss=2.494, ppl=5.63, wps=459695, ups=1.06, wpb=432888, bsz=16634.3, num_updates=21800, lr=0.000428353, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=21.6, wall=21604 epoch 013: 1566 / 1689 loss=4.117, nll_loss=2.494, ppl=5.63, wps=459695, ups=1.06, wpb=432888, bsz=16634.3, num_updates=21800, lr=0.000428353, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=21.6, wall=21604 epoch 013: 1566 / 1689 loss=4.117, nll_loss=2.494, ppl=5.63, wps=459695, ups=1.06, wpb=432888, bsz=16634.3, num_updates=21800, lr=0.000428353, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=21.6, wall=21604 epoch 013: 1566 / 1689 loss=4.117, nll_loss=2.494, ppl=5.63, wps=459695, ups=1.06, wpb=432888, bsz=16634.3, num_updates=21800, lr=0.000428353, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=21.6, wall=21604 epoch 013: 1566 / 1689 loss=4.117, nll_loss=2.494, ppl=5.63, wps=459695, ups=1.06, wpb=432888, bsz=16634.3, num_updates=21800, lr=0.000428353, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=21.6, wall=21604 epoch 013: 1666 / 1689 loss=4.11, nll_loss=2.487, ppl=5.6, wps=459876, ups=1.06, wpb=432969, bsz=16319, num_updates=21900, lr=0.000427374, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=21698 epoch 013: 1666 / 1689 loss=4.11, nll_loss=2.487, ppl=5.6, wps=459876, ups=1.06, wpb=432969, bsz=16319, num_updates=21900, lr=0.000427374, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=21698 epoch 013: 1666 / 1689 loss=4.11, nll_loss=2.487, ppl=5.6, wps=459876, ups=1.06, wpb=432969, bsz=16319, num_updates=21900, lr=0.000427374, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=21698 epoch 013: 1666 / 1689 loss=4.11, nll_loss=2.487, ppl=5.6, wps=459876, ups=1.06, wpb=432969, bsz=16319, num_updates=21900, lr=0.000427374, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=21698 epoch 013: 1666 / 1689 loss=4.11, nll_loss=2.487, ppl=5.6, wps=459876, ups=1.06, wpb=432969, bsz=16319, num_updates=21900, lr=0.000427374, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=21698 epoch 013: 1666 / 1689 loss=4.11, nll_loss=2.487, ppl=5.6, wps=459876, ups=1.06, wpb=432969, bsz=16319, num_updates=21900, lr=0.000427374, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=21698 epoch 013: 1666 / 1689 loss=4.11, nll_loss=2.487, ppl=5.6, wps=459876, ups=1.06, wpb=432969, bsz=16319, num_updates=21900, lr=0.000427374, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=21698 epoch 013: 1666 / 1689 loss=4.11, nll_loss=2.487, ppl=5.6, wps=459876, ups=1.06, wpb=432969, bsz=16319, num_updates=21900, lr=0.000427374, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=21698 epoch 013: 1666 / 1689 loss=4.11, nll_loss=2.487, ppl=5.6, wps=459876, ups=1.06, wpb=432969, bsz=16319, num_updates=21900, lr=0.000427374, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=21698 epoch 013: 1666 / 1689 loss=4.11, nll_loss=2.487, ppl=5.6, wps=459876, ups=1.06, wpb=432969, bsz=16319, num_updates=21900, lr=0.000427374, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=21698 epoch 013: 1666 / 1689 loss=4.11, nll_loss=2.487, ppl=5.6, wps=459876, ups=1.06, wpb=432969, bsz=16319, num_updates=21900, lr=0.000427374, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=21698 epoch 013: 1666 / 1689 loss=4.11, nll_loss=2.487, ppl=5.6, wps=459876, ups=1.06, wpb=432969, bsz=16319, num_updates=21900, lr=0.000427374, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=21698 epoch 013: 1666 / 1689 loss=4.11, nll_loss=2.487, ppl=5.6, wps=459876, ups=1.06, wpb=432969, bsz=16319, num_updates=21900, lr=0.000427374, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=21698 end of epoch 13 (average epoch stats below) epoch 013 | loss 4.104 | nll_loss 2.479 | ppl 5.57 | wps 455011 | ups 1.05 | wpb 433522 | bsz 16507.9 | num_updates 21923 | lr 0.00042715 | gnorm 0.22 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 19.9 | wall 21719 epoch 013 | loss 4.104 | nll_loss 2.479 | ppl 5.57 | wps 455011 | ups 1.05 | wpb 433522 | bsz 16507.9 | num_updates 21923 | lr 0.00042715 | gnorm 0.22 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 19.9 | wall 21719 epoch 013 | loss 4.104 | nll_loss 2.479 | ppl 5.57 | wps 455011 | ups 1.05 | wpb 433522 | bsz 16507.9 | num_updates 21923 | lr 0.00042715 | gnorm 0.22 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 19.9 | wall 21719 epoch 013 | loss 4.104 | nll_loss 2.479 | ppl 5.57 | wps 455011 | ups 1.05 | wpb 433522 | bsz 16507.9 | num_updates 21923 | lr 0.00042715 | gnorm 0.22 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 19.9 | wall 21719 epoch 013 | loss 4.104 | nll_loss 2.479 | ppl 5.57 | wps 455011 | ups 1.05 | wpb 433522 | bsz 16507.9 | num_updates 21923 | lr 0.00042715 | gnorm 0.22 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 19.9 | wall 21719 epoch 013 | loss 4.104 | nll_loss 2.479 | ppl 5.57 | wps 455011 | ups 1.05 | wpb 433522 | bsz 16507.9 | num_updates 21923 | lr 0.00042715 | gnorm 0.22 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 19.9 | wall 21719 epoch 013 | loss 4.104 | nll_loss 2.479 | ppl 5.57 | wps 455011 | ups 1.05 | wpb 433522 | bsz 16507.9 | num_updates 21923 | lr 0.00042715 | gnorm 0.22 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 19.9 | wall 21719 epoch 013 | loss 4.104 | nll_loss 2.479 | ppl 5.57 | wps 455011 | ups 1.05 | wpb 433522 | bsz 16507.9 | num_updates 21923 | lr 0.00042715 | gnorm 0.22 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 19.9 | wall 21719 epoch 013 | loss 4.104 | nll_loss 2.479 | ppl 5.57 | wps 455011 | ups 1.05 | wpb 433522 | bsz 16507.9 | num_updates 21923 | lr 0.00042715 | gnorm 0.22 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 19.9 | wall 21719 epoch 013 | loss 4.104 | nll_loss 2.479 | ppl 5.57 | wps 455011 | ups 1.05 | wpb 433522 | bsz 16507.9 | num_updates 21923 | lr 0.00042715 | gnorm 0.22 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 19.9 | wall 21719 epoch 013 | loss 4.104 | nll_loss 2.479 | ppl 5.57 | wps 455011 | ups 1.05 | wpb 433522 | bsz 16507.9 | num_updates 21923 | lr 0.00042715 | gnorm 0.22 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 19.9 | wall 21719 epoch 013 | loss 4.104 | nll_loss 2.479 | ppl 5.57 | wps 455011 | ups 1.05 | wpb 433522 | bsz 16507.9 | num_updates 21923 | lr 0.00042715 | gnorm 0.22 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 19.9 | wall 21719 epoch 013 | loss 4.104 | nll_loss 2.479 | ppl 5.57 | wps 455011 | ups 1.05 | wpb 433522 | bsz 16507.9 | num_updates 21923 | lr 0.00042715 | gnorm 0.22 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 19.9 | wall 21719 Start iterating over samples epoch 014: 77 / 1689 loss=4.073, nll_loss=2.443, ppl=5.44, wps=450967, ups=1.05, wpb=428831, bsz=16329, num_updates=22000, lr=0.000426401, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=21793 epoch 014: 77 / 1689 loss=4.073, nll_loss=2.443, ppl=5.44, wps=450967, ups=1.05, wpb=428831, bsz=16329, num_updates=22000, lr=0.000426401, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=21793 epoch 014: 77 / 1689 loss=4.073, nll_loss=2.443, ppl=5.44, wps=450967, ups=1.05, wpb=428831, bsz=16329, num_updates=22000, lr=0.000426401, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=21793 epoch 014: 77 / 1689 loss=4.073, nll_loss=2.443, ppl=5.44, wps=450967, ups=1.05, wpb=428831, bsz=16329, num_updates=22000, lr=0.000426401, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=21793 epoch 014: 77 / 1689 loss=4.073, nll_loss=2.443, ppl=5.44, wps=450967, ups=1.05, wpb=428831, bsz=16329, num_updates=22000, lr=0.000426401, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=21793 epoch 014: 77 / 1689 loss=4.073, nll_loss=2.443, ppl=5.44, wps=450967, ups=1.05, wpb=428831, bsz=16329, num_updates=22000, lr=0.000426401, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=21793 epoch 014: 77 / 1689 loss=4.073, nll_loss=2.443, ppl=5.44, wps=450967, ups=1.05, wpb=428831, bsz=16329, num_updates=22000, lr=0.000426401, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=21793 epoch 014: 77 / 1689 loss=4.073, nll_loss=2.443, ppl=5.44, wps=450967, ups=1.05, wpb=428831, bsz=16329, num_updates=22000, lr=0.000426401, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=21793 epoch 014: 77 / 1689 loss=4.073, nll_loss=2.443, ppl=5.44, wps=450967, ups=1.05, wpb=428831, bsz=16329, num_updates=22000, lr=0.000426401, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=21793 epoch 014: 77 / 1689 loss=4.073, nll_loss=2.443, ppl=5.44, wps=450967, ups=1.05, wpb=428831, bsz=16329, num_updates=22000, lr=0.000426401, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=21793 epoch 014: 77 / 1689 loss=4.073, nll_loss=2.443, ppl=5.44, wps=450967, ups=1.05, wpb=428831, bsz=16329, num_updates=22000, lr=0.000426401, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=21793 epoch 014: 77 / 1689 loss=4.073, nll_loss=2.443, ppl=5.44, wps=450967, ups=1.05, wpb=428831, bsz=16329, num_updates=22000, lr=0.000426401, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=21793 epoch 014: 77 / 1689 loss=4.073, nll_loss=2.443, ppl=5.44, wps=450967, ups=1.05, wpb=428831, bsz=16329, num_updates=22000, lr=0.000426401, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=21793 epoch 014: 77 / 1689 loss=4.073, nll_loss=2.443, ppl=5.44, wps=450967, ups=1.05, wpb=428831, bsz=16329, num_updates=22000, lr=0.000426401, gnorm=0.219, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=21793 begin validation on "valid" subset epoch 014 | valid on 'valid' subset | loss 4.279 | nll_loss 2.66 | ppl 6.32 | wps 0 | wpb 42662 | bsz 2032 | num_updates 22000 | best_loss 4.273 epoch 014 | valid on 'valid' subset | loss 4.279 | nll_loss 2.66 | ppl 6.32 | wps 0 | wpb 42662 | bsz 2032 | num_updates 22000 | best_loss 4.273 epoch 014 | valid on 'valid' subset | loss 4.279 | nll_loss 2.66 | ppl 6.32 | wps 0 | wpb 42662 | bsz 2032 | num_updates 22000 | best_loss 4.273 epoch 014 | valid on 'valid' subset | loss 4.279 | nll_loss 2.66 | ppl 6.32 | wps 0 | wpb 42662 | bsz 2032 | num_updates 22000 | best_loss 4.273 epoch 014 | valid on 'valid' subset | loss 4.279 | nll_loss 2.66 | ppl 6.32 | wps 0 | wpb 42662 | bsz 2032 | num_updates 22000 | best_loss 4.273 epoch 014 | valid on 'valid' subset | loss 4.279 | nll_loss 2.66 | ppl 6.32 | wps 0 | wpb 42662 | bsz 2032 | num_updates 22000 | best_loss 4.273 epoch 014 | valid on 'valid' subset | loss 4.279 | nll_loss 2.66 | ppl 6.32 | wps 0 | wpb 42662 | bsz 2032 | num_updates 22000 | best_loss 4.273 epoch 014 | valid on 'valid' subset | loss 4.279 | nll_loss 2.66 | ppl 6.32 | wps 0 | wpb 42662 | bsz 2032 | num_updates 22000 | best_loss 4.273 epoch 014 | valid on 'valid' subset | loss 4.279 | nll_loss 2.66 | ppl 6.32 | wps 0 | wpb 42662 | bsz 2032 | num_updates 22000 | best_loss 4.273 epoch 014 | valid on 'valid' subset | loss 4.279 | nll_loss 2.66 | ppl 6.32 | wps 0 | wpb 42662 | bsz 2032 | num_updates 22000 | best_loss 4.273 epoch 014 | valid on 'valid' subset | loss 4.279 | nll_loss 2.66 | ppl 6.32 | wps 0 | wpb 42662 | bsz 2032 | num_updates 22000 | best_loss 4.273 epoch 014 | valid on 'valid' subset | loss 4.279 | nll_loss 2.66 | ppl 6.32 | wps 0 | wpb 42662 | bsz 2032 | num_updates 22000 | best_loss 4.273 epoch 014 | valid on 'valid' subset | loss 4.279 | nll_loss 2.66 | ppl 6.32 | wps 0 | wpb 42662 | bsz 2032 | num_updates 22000 | best_loss 4.273 epoch 014 | valid on 'valid' subset | loss 4.279 | nll_loss 2.66 | ppl 6.32 | wps 0 | wpb 42662 | bsz 2032 | num_updates 22000 | best_loss 4.273 epoch 014: 177 / 1689 loss=4.081, nll_loss=2.452, ppl=5.47, wps=407896, ups=0.94, wpb=435037, bsz=16524.1, num_updates=22100, lr=0.000425436, gnorm=0.219, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=21899 epoch 014: 177 / 1689 loss=4.081, nll_loss=2.452, ppl=5.47, wps=407896, ups=0.94, wpb=435037, bsz=16524.1, num_updates=22100, lr=0.000425436, gnorm=0.219, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=21899 epoch 014: 177 / 1689 loss=4.081, nll_loss=2.452, ppl=5.47, wps=407896, ups=0.94, wpb=435037, bsz=16524.1, num_updates=22100, lr=0.000425436, gnorm=0.219, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=21899 epoch 014: 177 / 1689 loss=4.081, nll_loss=2.452, ppl=5.47, wps=407896, ups=0.94, wpb=435037, bsz=16524.1, num_updates=22100, lr=0.000425436, gnorm=0.219, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=21899 epoch 014: 177 / 1689 loss=4.081, nll_loss=2.452, ppl=5.47, wps=407896, ups=0.94, wpb=435037, bsz=16524.1, num_updates=22100, lr=0.000425436, gnorm=0.219, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=21899 epoch 014: 177 / 1689 loss=4.081, nll_loss=2.452, ppl=5.47, wps=407896, ups=0.94, wpb=435037, bsz=16524.1, num_updates=22100, lr=0.000425436, gnorm=0.219, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=21899 epoch 014: 177 / 1689 loss=4.081, nll_loss=2.452, ppl=5.47, wps=407896, ups=0.94, wpb=435037, bsz=16524.1, num_updates=22100, lr=0.000425436, gnorm=0.219, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=21899 epoch 014: 177 / 1689 loss=4.081, nll_loss=2.452, ppl=5.47, wps=407896, ups=0.94, wpb=435037, bsz=16524.1, num_updates=22100, lr=0.000425436, gnorm=0.219, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=21899 epoch 014: 177 / 1689 loss=4.081, nll_loss=2.452, ppl=5.47, wps=407896, ups=0.94, wpb=435037, bsz=16524.1, num_updates=22100, lr=0.000425436, gnorm=0.219, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=21899 epoch 014: 177 / 1689 loss=4.081, nll_loss=2.452, ppl=5.47, wps=407896, ups=0.94, wpb=435037, bsz=16524.1, num_updates=22100, lr=0.000425436, gnorm=0.219, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=21899 epoch 014: 177 / 1689 loss=4.081, nll_loss=2.452, ppl=5.47, wps=407896, ups=0.94, wpb=435037, bsz=16524.1, num_updates=22100, lr=0.000425436, gnorm=0.219, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=21899 epoch 014: 177 / 1689 loss=4.081, nll_loss=2.452, ppl=5.47, wps=407896, ups=0.94, wpb=435037, bsz=16524.1, num_updates=22100, lr=0.000425436, gnorm=0.219, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=21899 epoch 014: 177 / 1689 loss=4.081, nll_loss=2.452, ppl=5.47, wps=407896, ups=0.94, wpb=435037, bsz=16524.1, num_updates=22100, lr=0.000425436, gnorm=0.219, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=21899 epoch 014: 177 / 1689 loss=4.081, nll_loss=2.452, ppl=5.47, wps=407896, ups=0.94, wpb=435037, bsz=16524.1, num_updates=22100, lr=0.000425436, gnorm=0.219, clip=0, loss_scale=2, train_wall=92, gb_free=18.8, wall=21899 epoch 014: 278 / 1689 loss=4.071, nll_loss=2.441, ppl=5.43, wps=456569, ups=1.05, wpb=432813, bsz=16439.8, num_updates=22200, lr=0.000424476, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=21994 epoch 014: 278 / 1689 loss=4.071, nll_loss=2.441, ppl=5.43, wps=456569, ups=1.05, wpb=432813, bsz=16439.8, num_updates=22200, lr=0.000424476, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=21994 epoch 014: 278 / 1689 loss=4.071, nll_loss=2.441, ppl=5.43, wps=456569, ups=1.05, wpb=432813, bsz=16439.8, num_updates=22200, lr=0.000424476, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=21994 epoch 014: 278 / 1689 loss=4.071, nll_loss=2.441, ppl=5.43, wps=456569, ups=1.05, wpb=432813, bsz=16439.8, num_updates=22200, lr=0.000424476, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=21994 epoch 014: 278 / 1689 loss=4.071, nll_loss=2.441, ppl=5.43, wps=456569, ups=1.05, wpb=432813, bsz=16439.8, num_updates=22200, lr=0.000424476, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=21994 epoch 014: 278 / 1689 loss=4.071, nll_loss=2.441, ppl=5.43, wps=456569, ups=1.05, wpb=432813, bsz=16439.8, num_updates=22200, lr=0.000424476, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=21994 epoch 014: 278 / 1689 loss=4.071, nll_loss=2.441, ppl=5.43, wps=456569, ups=1.05, wpb=432813, bsz=16439.8, num_updates=22200, lr=0.000424476, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=21994 epoch 014: 278 / 1689 loss=4.071, nll_loss=2.441, ppl=5.43, wps=456569, ups=1.05, wpb=432813, bsz=16439.8, num_updates=22200, lr=0.000424476, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=21994 epoch 014: 278 / 1689 loss=4.071, nll_loss=2.441, ppl=5.43, wps=456569, ups=1.05, wpb=432813, bsz=16439.8, num_updates=22200, lr=0.000424476, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=21994 epoch 014: 278 / 1689 loss=4.071, nll_loss=2.441, ppl=5.43, wps=456569, ups=1.05, wpb=432813, bsz=16439.8, num_updates=22200, lr=0.000424476, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=21994 epoch 014: 278 / 1689 loss=4.071, nll_loss=2.441, ppl=5.43, wps=456569, ups=1.05, wpb=432813, bsz=16439.8, num_updates=22200, lr=0.000424476, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=21994 epoch 014: 278 / 1689 loss=4.071, nll_loss=2.441, ppl=5.43, wps=456569, ups=1.05, wpb=432813, bsz=16439.8, num_updates=22200, lr=0.000424476, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=21994 epoch 014: 278 / 1689 loss=4.071, nll_loss=2.441, ppl=5.43, wps=456569, ups=1.05, wpb=432813, bsz=16439.8, num_updates=22200, lr=0.000424476, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=21994 epoch 014: 278 / 1689 loss=4.071, nll_loss=2.441, ppl=5.43, wps=456569, ups=1.05, wpb=432813, bsz=16439.8, num_updates=22200, lr=0.000424476, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=21994 epoch 014: 378 / 1689 loss=4.079, nll_loss=2.45, ppl=5.46, wps=460354, ups=1.07, wpb=432238, bsz=16705.5, num_updates=22300, lr=0.000423524, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=22088 epoch 014: 378 / 1689 loss=4.079, nll_loss=2.45, ppl=5.46, wps=460354, ups=1.07, wpb=432238, bsz=16705.5, num_updates=22300, lr=0.000423524, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=22088 epoch 014: 378 / 1689 loss=4.079, nll_loss=2.45, ppl=5.46, wps=460354, ups=1.07, wpb=432238, bsz=16705.5, num_updates=22300, lr=0.000423524, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=22088 epoch 014: 378 / 1689 loss=4.079, nll_loss=2.45, ppl=5.46, wps=460354, ups=1.07, wpb=432238, bsz=16705.5, num_updates=22300, lr=0.000423524, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=22088 epoch 014: 378 / 1689 loss=4.079, nll_loss=2.45, ppl=5.46, wps=460354, ups=1.07, wpb=432238, bsz=16705.5, num_updates=22300, lr=0.000423524, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=22088 epoch 014: 378 / 1689 loss=4.079, nll_loss=2.45, ppl=5.46, wps=460354, ups=1.07, wpb=432238, bsz=16705.5, num_updates=22300, lr=0.000423524, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=22088 epoch 014: 378 / 1689 loss=4.079, nll_loss=2.45, ppl=5.46, wps=460354, ups=1.07, wpb=432238, bsz=16705.5, num_updates=22300, lr=0.000423524, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=22088 epoch 014: 378 / 1689 loss=4.079, nll_loss=2.45, ppl=5.46, wps=460354, ups=1.07, wpb=432238, bsz=16705.5, num_updates=22300, lr=0.000423524, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=22088 epoch 014: 378 / 1689 loss=4.079, nll_loss=2.45, ppl=5.46, wps=460354, ups=1.07, wpb=432238, bsz=16705.5, num_updates=22300, lr=0.000423524, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=22088 epoch 014: 378 / 1689 loss=4.079, nll_loss=2.45, ppl=5.46, wps=460354, ups=1.07, wpb=432238, bsz=16705.5, num_updates=22300, lr=0.000423524, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=22088 epoch 014: 378 / 1689 loss=4.079, nll_loss=2.45, ppl=5.46, wps=460354, ups=1.07, wpb=432238, bsz=16705.5, num_updates=22300, lr=0.000423524, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=22088 epoch 014: 378 / 1689 loss=4.079, nll_loss=2.45, ppl=5.46, wps=460354, ups=1.07, wpb=432238, bsz=16705.5, num_updates=22300, lr=0.000423524, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=22088 epoch 014: 378 / 1689 loss=4.079, nll_loss=2.45, ppl=5.46, wps=460354, ups=1.07, wpb=432238, bsz=16705.5, num_updates=22300, lr=0.000423524, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=22088 epoch 014: 378 / 1689 loss=4.079, nll_loss=2.45, ppl=5.46, wps=460354, ups=1.07, wpb=432238, bsz=16705.5, num_updates=22300, lr=0.000423524, gnorm=0.224, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=22088 epoch 014: 478 / 1689 loss=4.08, nll_loss=2.451, ppl=5.47, wps=464929, ups=1.06, wpb=437775, bsz=16459.4, num_updates=22400, lr=0.000422577, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=20.5, wall=22182 epoch 014: 478 / 1689 loss=4.08, nll_loss=2.451, ppl=5.47, wps=464929, ups=1.06, wpb=437775, bsz=16459.4, num_updates=22400, lr=0.000422577, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=20.5, wall=22182 epoch 014: 478 / 1689 loss=4.08, nll_loss=2.451, ppl=5.47, wps=464929, ups=1.06, wpb=437775, bsz=16459.4, num_updates=22400, lr=0.000422577, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=20.5, wall=22182 epoch 014: 478 / 1689 loss=4.08, nll_loss=2.451, ppl=5.47, wps=464929, ups=1.06, wpb=437775, bsz=16459.4, num_updates=22400, lr=0.000422577, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=20.5, wall=22182 epoch 014: 478 / 1689 loss=4.08, nll_loss=2.451, ppl=5.47, wps=464929, ups=1.06, wpb=437775, bsz=16459.4, num_updates=22400, lr=0.000422577, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=20.5, wall=22182 epoch 014: 478 / 1689 loss=4.08, nll_loss=2.451, ppl=5.47, wps=464929, ups=1.06, wpb=437775, bsz=16459.4, num_updates=22400, lr=0.000422577, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=20.5, wall=22182 epoch 014: 478 / 1689 loss=4.08, nll_loss=2.451, ppl=5.47, wps=464929, ups=1.06, wpb=437775, bsz=16459.4, num_updates=22400, lr=0.000422577, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=20.5, wall=22182 epoch 014: 478 / 1689 loss=4.08, nll_loss=2.451, ppl=5.47, wps=464929, ups=1.06, wpb=437775, bsz=16459.4, num_updates=22400, lr=0.000422577, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=20.5, wall=22182 epoch 014: 478 / 1689 loss=4.08, nll_loss=2.451, ppl=5.47, wps=464929, ups=1.06, wpb=437775, bsz=16459.4, num_updates=22400, lr=0.000422577, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=20.5, wall=22182 epoch 014: 478 / 1689 loss=4.08, nll_loss=2.451, ppl=5.47, wps=464929, ups=1.06, wpb=437775, bsz=16459.4, num_updates=22400, lr=0.000422577, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=20.5, wall=22182 epoch 014: 478 / 1689 loss=4.08, nll_loss=2.451, ppl=5.47, wps=464929, ups=1.06, wpb=437775, bsz=16459.4, num_updates=22400, lr=0.000422577, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=20.5, wall=22182 epoch 014: 478 / 1689 loss=4.08, nll_loss=2.451, ppl=5.47, wps=464929, ups=1.06, wpb=437775, bsz=16459.4, num_updates=22400, lr=0.000422577, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=20.5, wall=22182 epoch 014: 478 / 1689 loss=4.08, nll_loss=2.451, ppl=5.47, wps=464929, ups=1.06, wpb=437775, bsz=16459.4, num_updates=22400, lr=0.000422577, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=20.5, wall=22182 epoch 014: 478 / 1689 loss=4.08, nll_loss=2.451, ppl=5.47, wps=464929, ups=1.06, wpb=437775, bsz=16459.4, num_updates=22400, lr=0.000422577, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=20.5, wall=22182 epoch 014: 578 / 1689 loss=4.083, nll_loss=2.455, ppl=5.48, wps=458096, ups=1.06, wpb=433987, bsz=16521.4, num_updates=22500, lr=0.000421637, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=22277 epoch 014: 578 / 1689 loss=4.083, nll_loss=2.455, ppl=5.48, wps=458096, ups=1.06, wpb=433987, bsz=16521.4, num_updates=22500, lr=0.000421637, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=22277 epoch 014: 578 / 1689 loss=4.083, nll_loss=2.455, ppl=5.48, wps=458096, ups=1.06, wpb=433987, bsz=16521.4, num_updates=22500, lr=0.000421637, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=22277 epoch 014: 578 / 1689 loss=4.083, nll_loss=2.455, ppl=5.48, wps=458096, ups=1.06, wpb=433987, bsz=16521.4, num_updates=22500, lr=0.000421637, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=22277 epoch 014: 578 / 1689 loss=4.083, nll_loss=2.455, ppl=5.48, wps=458096, ups=1.06, wpb=433987, bsz=16521.4, num_updates=22500, lr=0.000421637, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=22277 epoch 014: 578 / 1689 loss=4.083, nll_loss=2.455, ppl=5.48, wps=458096, ups=1.06, wpb=433987, bsz=16521.4, num_updates=22500, lr=0.000421637, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=22277 epoch 014: 578 / 1689 loss=4.083, nll_loss=2.455, ppl=5.48, wps=458096, ups=1.06, wpb=433987, bsz=16521.4, num_updates=22500, lr=0.000421637, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=22277 epoch 014: 578 / 1689 loss=4.083, nll_loss=2.455, ppl=5.48, wps=458096, ups=1.06, wpb=433987, bsz=16521.4, num_updates=22500, lr=0.000421637, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=22277 epoch 014: 578 / 1689 loss=4.083, nll_loss=2.455, ppl=5.48, wps=458096, ups=1.06, wpb=433987, bsz=16521.4, num_updates=22500, lr=0.000421637, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=22277 epoch 014: 578 / 1689 loss=4.083, nll_loss=2.455, ppl=5.48, wps=458096, ups=1.06, wpb=433987, bsz=16521.4, num_updates=22500, lr=0.000421637, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=22277 epoch 014: 578 / 1689 loss=4.083, nll_loss=2.455, ppl=5.48, wps=458096, ups=1.06, wpb=433987, bsz=16521.4, num_updates=22500, lr=0.000421637, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=22277 epoch 014: 578 / 1689 loss=4.083, nll_loss=2.455, ppl=5.48, wps=458096, ups=1.06, wpb=433987, bsz=16521.4, num_updates=22500, lr=0.000421637, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=22277 epoch 014: 578 / 1689 loss=4.083, nll_loss=2.455, ppl=5.48, wps=458096, ups=1.06, wpb=433987, bsz=16521.4, num_updates=22500, lr=0.000421637, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=22277 epoch 014: 578 / 1689 loss=4.083, nll_loss=2.455, ppl=5.48, wps=458096, ups=1.06, wpb=433987, bsz=16521.4, num_updates=22500, lr=0.000421637, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=22277 epoch 014: 678 / 1689 loss=4.086, nll_loss=2.458, ppl=5.5, wps=460136, ups=1.06, wpb=432246, bsz=16344.2, num_updates=22600, lr=0.000420703, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=22371 epoch 014: 678 / 1689 loss=4.086, nll_loss=2.458, ppl=5.5, wps=460136, ups=1.06, wpb=432246, bsz=16344.2, num_updates=22600, lr=0.000420703, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=22371 epoch 014: 678 / 1689 loss=4.086, nll_loss=2.458, ppl=5.5, wps=460136, ups=1.06, wpb=432246, bsz=16344.2, num_updates=22600, lr=0.000420703, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=22371 epoch 014: 678 / 1689 loss=4.086, nll_loss=2.458, ppl=5.5, wps=460136, ups=1.06, wpb=432246, bsz=16344.2, num_updates=22600, lr=0.000420703, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=22371 epoch 014: 678 / 1689 loss=4.086, nll_loss=2.458, ppl=5.5, wps=460136, ups=1.06, wpb=432246, bsz=16344.2, num_updates=22600, lr=0.000420703, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=22371 epoch 014: 678 / 1689 loss=4.086, nll_loss=2.458, ppl=5.5, wps=460136, ups=1.06, wpb=432246, bsz=16344.2, num_updates=22600, lr=0.000420703, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=22371 epoch 014: 678 / 1689 loss=4.086, nll_loss=2.458, ppl=5.5, wps=460136, ups=1.06, wpb=432246, bsz=16344.2, num_updates=22600, lr=0.000420703, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=22371 epoch 014: 678 / 1689 loss=4.086, nll_loss=2.458, ppl=5.5, wps=460136, ups=1.06, wpb=432246, bsz=16344.2, num_updates=22600, lr=0.000420703, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=22371 epoch 014: 678 / 1689 loss=4.086, nll_loss=2.458, ppl=5.5, wps=460136, ups=1.06, wpb=432246, bsz=16344.2, num_updates=22600, lr=0.000420703, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=22371 epoch 014: 678 / 1689 loss=4.086, nll_loss=2.458, ppl=5.5, wps=460136, ups=1.06, wpb=432246, bsz=16344.2, num_updates=22600, lr=0.000420703, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=22371 epoch 014: 678 / 1689 loss=4.086, nll_loss=2.458, ppl=5.5, wps=460136, ups=1.06, wpb=432246, bsz=16344.2, num_updates=22600, lr=0.000420703, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=22371 epoch 014: 678 / 1689 loss=4.086, nll_loss=2.458, ppl=5.5, wps=460136, ups=1.06, wpb=432246, bsz=16344.2, num_updates=22600, lr=0.000420703, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=22371 epoch 014: 678 / 1689 loss=4.086, nll_loss=2.458, ppl=5.5, wps=460136, ups=1.06, wpb=432246, bsz=16344.2, num_updates=22600, lr=0.000420703, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=22371 epoch 014: 678 / 1689 loss=4.086, nll_loss=2.458, ppl=5.5, wps=460136, ups=1.06, wpb=432246, bsz=16344.2, num_updates=22600, lr=0.000420703, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=20, wall=22371 epoch 014: 778 / 1689 loss=4.087, nll_loss=2.46, ppl=5.5, wps=461625, ups=1.06, wpb=436096, bsz=16427.9, num_updates=22700, lr=0.000419775, gnorm=0.232, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=22465 epoch 014: 778 / 1689 loss=4.087, nll_loss=2.46, ppl=5.5, wps=461625, ups=1.06, wpb=436096, bsz=16427.9, num_updates=22700, lr=0.000419775, gnorm=0.232, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=22465 epoch 014: 778 / 1689 loss=4.087, nll_loss=2.46, ppl=5.5, wps=461625, ups=1.06, wpb=436096, bsz=16427.9, num_updates=22700, lr=0.000419775, gnorm=0.232, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=22465 epoch 014: 778 / 1689 loss=4.087, nll_loss=2.46, ppl=5.5, wps=461625, ups=1.06, wpb=436096, bsz=16427.9, num_updates=22700, lr=0.000419775, gnorm=0.232, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=22465 epoch 014: 778 / 1689 loss=4.087, nll_loss=2.46, ppl=5.5, wps=461625, ups=1.06, wpb=436096, bsz=16427.9, num_updates=22700, lr=0.000419775, gnorm=0.232, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=22465 epoch 014: 778 / 1689 loss=4.087, nll_loss=2.46, ppl=5.5, wps=461625, ups=1.06, wpb=436096, bsz=16427.9, num_updates=22700, lr=0.000419775, gnorm=0.232, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=22465 epoch 014: 778 / 1689 loss=4.087, nll_loss=2.46, ppl=5.5, wps=461625, ups=1.06, wpb=436096, bsz=16427.9, num_updates=22700, lr=0.000419775, gnorm=0.232, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=22465 epoch 014: 778 / 1689 loss=4.087, nll_loss=2.46, ppl=5.5, wps=461625, ups=1.06, wpb=436096, bsz=16427.9, num_updates=22700, lr=0.000419775, gnorm=0.232, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=22465 epoch 014: 778 / 1689 loss=4.087, nll_loss=2.46, ppl=5.5, wps=461625, ups=1.06, wpb=436096, bsz=16427.9, num_updates=22700, lr=0.000419775, gnorm=0.232, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=22465 epoch 014: 778 / 1689 loss=4.087, nll_loss=2.46, ppl=5.5, wps=461625, ups=1.06, wpb=436096, bsz=16427.9, num_updates=22700, lr=0.000419775, gnorm=0.232, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=22465 epoch 014: 778 / 1689 loss=4.087, nll_loss=2.46, ppl=5.5, wps=461625, ups=1.06, wpb=436096, bsz=16427.9, num_updates=22700, lr=0.000419775, gnorm=0.232, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=22465 epoch 014: 778 / 1689 loss=4.087, nll_loss=2.46, ppl=5.5, wps=461625, ups=1.06, wpb=436096, bsz=16427.9, num_updates=22700, lr=0.000419775, gnorm=0.232, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=22465 epoch 014: 778 / 1689 loss=4.087, nll_loss=2.46, ppl=5.5, wps=461625, ups=1.06, wpb=436096, bsz=16427.9, num_updates=22700, lr=0.000419775, gnorm=0.232, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=22465 epoch 014: 778 / 1689 loss=4.087, nll_loss=2.46, ppl=5.5, wps=461625, ups=1.06, wpb=436096, bsz=16427.9, num_updates=22700, lr=0.000419775, gnorm=0.232, clip=0, loss_scale=2, train_wall=92, gb_free=19.6, wall=22465 epoch 014: 878 / 1689 loss=4.095, nll_loss=2.469, ppl=5.54, wps=459776, ups=1.06, wpb=434759, bsz=16651.8, num_updates=22800, lr=0.000418854, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=22560 epoch 014: 878 / 1689 loss=4.095, nll_loss=2.469, ppl=5.54, wps=459776, ups=1.06, wpb=434759, bsz=16651.8, num_updates=22800, lr=0.000418854, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=22560 epoch 014: 878 / 1689 loss=4.095, nll_loss=2.469, ppl=5.54, wps=459776, ups=1.06, wpb=434759, bsz=16651.8, num_updates=22800, lr=0.000418854, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=22560 epoch 014: 878 / 1689 loss=4.095, nll_loss=2.469, ppl=5.54, wps=459776, ups=1.06, wpb=434759, bsz=16651.8, num_updates=22800, lr=0.000418854, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=22560 epoch 014: 878 / 1689 loss=4.095, nll_loss=2.469, ppl=5.54, wps=459776, ups=1.06, wpb=434759, bsz=16651.8, num_updates=22800, lr=0.000418854, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=22560 epoch 014: 878 / 1689 loss=4.095, nll_loss=2.469, ppl=5.54, wps=459776, ups=1.06, wpb=434759, bsz=16651.8, num_updates=22800, lr=0.000418854, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=22560 epoch 014: 878 / 1689 loss=4.095, nll_loss=2.469, ppl=5.54, wps=459776, ups=1.06, wpb=434759, bsz=16651.8, num_updates=22800, lr=0.000418854, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=22560 epoch 014: 878 / 1689 loss=4.095, nll_loss=2.469, ppl=5.54, wps=459776, ups=1.06, wpb=434759, bsz=16651.8, num_updates=22800, lr=0.000418854, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=22560 epoch 014: 878 / 1689 loss=4.095, nll_loss=2.469, ppl=5.54, wps=459776, ups=1.06, wpb=434759, bsz=16651.8, num_updates=22800, lr=0.000418854, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=22560 epoch 014: 878 / 1689 loss=4.095, nll_loss=2.469, ppl=5.54, wps=459776, ups=1.06, wpb=434759, bsz=16651.8, num_updates=22800, lr=0.000418854, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=22560 epoch 014: 878 / 1689 loss=4.095, nll_loss=2.469, ppl=5.54, wps=459776, ups=1.06, wpb=434759, bsz=16651.8, num_updates=22800, lr=0.000418854, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=22560 epoch 014: 878 / 1689 loss=4.095, nll_loss=2.469, ppl=5.54, wps=459776, ups=1.06, wpb=434759, bsz=16651.8, num_updates=22800, lr=0.000418854, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=22560 epoch 014: 878 / 1689 loss=4.095, nll_loss=2.469, ppl=5.54, wps=459776, ups=1.06, wpb=434759, bsz=16651.8, num_updates=22800, lr=0.000418854, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=22560 epoch 014: 878 / 1689 loss=4.095, nll_loss=2.469, ppl=5.54, wps=459776, ups=1.06, wpb=434759, bsz=16651.8, num_updates=22800, lr=0.000418854, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=22560 epoch 014: 978 / 1689 loss=4.084, nll_loss=2.457, ppl=5.49, wps=451188, ups=1.05, wpb=429452, bsz=16831, num_updates=22900, lr=0.000417938, gnorm=0.21, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=22655 epoch 014: 978 / 1689 loss=4.084, nll_loss=2.457, ppl=5.49, wps=451188, ups=1.05, wpb=429452, bsz=16831, num_updates=22900, lr=0.000417938, gnorm=0.21, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=22655 epoch 014: 978 / 1689 loss=4.084, nll_loss=2.457, ppl=5.49, wps=451188, ups=1.05, wpb=429452, bsz=16831, num_updates=22900, lr=0.000417938, gnorm=0.21, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=22655 epoch 014: 978 / 1689 loss=4.084, nll_loss=2.457, ppl=5.49, wps=451188, ups=1.05, wpb=429452, bsz=16831, num_updates=22900, lr=0.000417938, gnorm=0.21, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=22655 epoch 014: 978 / 1689 loss=4.084, nll_loss=2.457, ppl=5.49, wps=451188, ups=1.05, wpb=429452, bsz=16831, num_updates=22900, lr=0.000417938, gnorm=0.21, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=22655 epoch 014: 978 / 1689 loss=4.084, nll_loss=2.457, ppl=5.49, wps=451188, ups=1.05, wpb=429452, bsz=16831, num_updates=22900, lr=0.000417938, gnorm=0.21, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=22655 epoch 014: 978 / 1689 loss=4.084, nll_loss=2.457, ppl=5.49, wps=451188, ups=1.05, wpb=429452, bsz=16831, num_updates=22900, lr=0.000417938, gnorm=0.21, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=22655 epoch 014: 978 / 1689 loss=4.084, nll_loss=2.457, ppl=5.49, wps=451188, ups=1.05, wpb=429452, bsz=16831, num_updates=22900, lr=0.000417938, gnorm=0.21, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=22655 epoch 014: 978 / 1689 loss=4.084, nll_loss=2.457, ppl=5.49, wps=451188, ups=1.05, wpb=429452, bsz=16831, num_updates=22900, lr=0.000417938, gnorm=0.21, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=22655 epoch 014: 978 / 1689 loss=4.084, nll_loss=2.457, ppl=5.49, wps=451188, ups=1.05, wpb=429452, bsz=16831, num_updates=22900, lr=0.000417938, gnorm=0.21, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=22655 epoch 014: 978 / 1689 loss=4.084, nll_loss=2.457, ppl=5.49, wps=451188, ups=1.05, wpb=429452, bsz=16831, num_updates=22900, lr=0.000417938, gnorm=0.21, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=22655 epoch 014: 978 / 1689 loss=4.084, nll_loss=2.457, ppl=5.49, wps=451188, ups=1.05, wpb=429452, bsz=16831, num_updates=22900, lr=0.000417938, gnorm=0.21, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=22655 epoch 014: 978 / 1689 loss=4.084, nll_loss=2.457, ppl=5.49, wps=451188, ups=1.05, wpb=429452, bsz=16831, num_updates=22900, lr=0.000417938, gnorm=0.21, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=22655 epoch 014: 978 / 1689 loss=4.084, nll_loss=2.457, ppl=5.49, wps=451188, ups=1.05, wpb=429452, bsz=16831, num_updates=22900, lr=0.000417938, gnorm=0.21, clip=0, loss_scale=2, train_wall=93, gb_free=19.9, wall=22655 epoch 014: 1079 / 1689 loss=4.09, nll_loss=2.464, ppl=5.52, wps=454764, ups=1.04, wpb=435804, bsz=16574.6, num_updates=23000, lr=0.000417029, gnorm=0.206, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=22751 epoch 014: 1079 / 1689 loss=4.09, nll_loss=2.464, ppl=5.52, wps=454764, ups=1.04, wpb=435804, bsz=16574.6, num_updates=23000, lr=0.000417029, gnorm=0.206, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=22751 epoch 014: 1079 / 1689 loss=4.09, nll_loss=2.464, ppl=5.52, wps=454764, ups=1.04, wpb=435804, bsz=16574.6, num_updates=23000, lr=0.000417029, gnorm=0.206, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=22751 epoch 014: 1079 / 1689 loss=4.09, nll_loss=2.464, ppl=5.52, wps=454764, ups=1.04, wpb=435804, bsz=16574.6, num_updates=23000, lr=0.000417029, gnorm=0.206, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=22751 epoch 014: 1079 / 1689 loss=4.09, nll_loss=2.464, ppl=5.52, wps=454764, ups=1.04, wpb=435804, bsz=16574.6, num_updates=23000, lr=0.000417029, gnorm=0.206, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=22751 epoch 014: 1079 / 1689 loss=4.09, nll_loss=2.464, ppl=5.52, wps=454764, ups=1.04, wpb=435804, bsz=16574.6, num_updates=23000, lr=0.000417029, gnorm=0.206, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=22751 epoch 014: 1079 / 1689 loss=4.09, nll_loss=2.464, ppl=5.52, wps=454764, ups=1.04, wpb=435804, bsz=16574.6, num_updates=23000, lr=0.000417029, gnorm=0.206, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=22751 epoch 014: 1079 / 1689 loss=4.09, nll_loss=2.464, ppl=5.52, wps=454764, ups=1.04, wpb=435804, bsz=16574.6, num_updates=23000, lr=0.000417029, gnorm=0.206, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=22751 epoch 014: 1079 / 1689 loss=4.09, nll_loss=2.464, ppl=5.52, wps=454764, ups=1.04, wpb=435804, bsz=16574.6, num_updates=23000, lr=0.000417029, gnorm=0.206, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=22751 epoch 014: 1079 / 1689 loss=4.09, nll_loss=2.464, ppl=5.52, wps=454764, ups=1.04, wpb=435804, bsz=16574.6, num_updates=23000, lr=0.000417029, gnorm=0.206, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=22751 epoch 014: 1079 / 1689 loss=4.09, nll_loss=2.464, ppl=5.52, wps=454764, ups=1.04, wpb=435804, bsz=16574.6, num_updates=23000, lr=0.000417029, gnorm=0.206, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=22751 epoch 014: 1079 / 1689 loss=4.09, nll_loss=2.464, ppl=5.52, wps=454764, ups=1.04, wpb=435804, bsz=16574.6, num_updates=23000, lr=0.000417029, gnorm=0.206, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=22751 epoch 014: 1079 / 1689 loss=4.09, nll_loss=2.464, ppl=5.52, wps=454764, ups=1.04, wpb=435804, bsz=16574.6, num_updates=23000, lr=0.000417029, gnorm=0.206, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=22751 epoch 014: 1079 / 1689 loss=4.09, nll_loss=2.464, ppl=5.52, wps=454764, ups=1.04, wpb=435804, bsz=16574.6, num_updates=23000, lr=0.000417029, gnorm=0.206, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=22751 begin validation on "valid" subset epoch 014 | valid on 'valid' subset | loss 4.271 | nll_loss 2.65 | ppl 6.28 | wps 0 | wpb 42662 | bsz 2032 | num_updates 23000 | best_loss 4.271 epoch 014 | valid on 'valid' subset | loss 4.271 | nll_loss 2.65 | ppl 6.28 | wps 0 | wpb 42662 | bsz 2032 | num_updates 23000 | best_loss 4.271 epoch 014 | valid on 'valid' subset | loss 4.271 | nll_loss 2.65 | ppl 6.28 | wps 0 | wpb 42662 | bsz 2032 | num_updates 23000 | best_loss 4.271 epoch 014 | valid on 'valid' subset | loss 4.271 | nll_loss 2.65 | ppl 6.28 | wps 0 | wpb 42662 | bsz 2032 | num_updates 23000 | best_loss 4.271 epoch 014 | valid on 'valid' subset | loss 4.271 | nll_loss 2.65 | ppl 6.28 | wps 0 | wpb 42662 | bsz 2032 | num_updates 23000 | best_loss 4.271 epoch 014 | valid on 'valid' subset | loss 4.271 | nll_loss 2.65 | ppl 6.28 | wps 0 | wpb 42662 | bsz 2032 | num_updates 23000 | best_loss 4.271 epoch 014 | valid on 'valid' subset | loss 4.271 | nll_loss 2.65 | ppl 6.28 | wps 0 | wpb 42662 | bsz 2032 | num_updates 23000 | best_loss 4.271 epoch 014 | valid on 'valid' subset | loss 4.271 | nll_loss 2.65 | ppl 6.28 | wps 0 | wpb 42662 | bsz 2032 | num_updates 23000 | best_loss 4.271 epoch 014 | valid on 'valid' subset | loss 4.271 | nll_loss 2.65 | ppl 6.28 | wps 0 | wpb 42662 | bsz 2032 | num_updates 23000 | best_loss 4.271 epoch 014 | valid on 'valid' subset | loss 4.271 | nll_loss 2.65 | ppl 6.28 | wps 0 | wpb 42662 | bsz 2032 | num_updates 23000 | best_loss 4.271 epoch 014 | valid on 'valid' subset | loss 4.271 | nll_loss 2.65 | ppl 6.28 | wps 0 | wpb 42662 | bsz 2032 | num_updates 23000 | best_loss 4.271 epoch 014 | valid on 'valid' subset | loss 4.271 | nll_loss 2.65 | ppl 6.28 | wps 0 | wpb 42662 | bsz 2032 | num_updates 23000 | best_loss 4.271 epoch 014 | valid on 'valid' subset | loss 4.271 | nll_loss 2.65 | ppl 6.28 | wps 0 | wpb 42662 | bsz 2032 | num_updates 23000 | best_loss 4.271 epoch 014 | valid on 'valid' subset | loss 4.271 | nll_loss 2.65 | ppl 6.28 | wps 0 | wpb 42662 | bsz 2032 | num_updates 23000 | best_loss 4.271 epoch 014: 1179 / 1689 loss=4.089, nll_loss=2.463, ppl=5.51, wps=373799, ups=0.87, wpb=431214, bsz=16439.9, num_updates=23100, lr=0.000416125, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=22866 epoch 014: 1179 / 1689 loss=4.089, nll_loss=2.463, ppl=5.51, wps=373799, ups=0.87, wpb=431214, bsz=16439.9, num_updates=23100, lr=0.000416125, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=22866 epoch 014: 1179 / 1689 loss=4.089, nll_loss=2.463, ppl=5.51, wps=373799, ups=0.87, wpb=431214, bsz=16439.9, num_updates=23100, lr=0.000416125, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=22866 epoch 014: 1179 / 1689 loss=4.089, nll_loss=2.463, ppl=5.51, wps=373799, ups=0.87, wpb=431214, bsz=16439.9, num_updates=23100, lr=0.000416125, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=22866 epoch 014: 1179 / 1689 loss=4.089, nll_loss=2.463, ppl=5.51, wps=373799, ups=0.87, wpb=431214, bsz=16439.9, num_updates=23100, lr=0.000416125, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=22866 epoch 014: 1179 / 1689 loss=4.089, nll_loss=2.463, ppl=5.51, wps=373799, ups=0.87, wpb=431214, bsz=16439.9, num_updates=23100, lr=0.000416125, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=22866 epoch 014: 1179 / 1689 loss=4.089, nll_loss=2.463, ppl=5.51, wps=373799, ups=0.87, wpb=431214, bsz=16439.9, num_updates=23100, lr=0.000416125, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=22866 epoch 014: 1179 / 1689 loss=4.089, nll_loss=2.463, ppl=5.51, wps=373799, ups=0.87, wpb=431214, bsz=16439.9, num_updates=23100, lr=0.000416125, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=22866 epoch 014: 1179 / 1689 loss=4.089, nll_loss=2.463, ppl=5.51, wps=373799, ups=0.87, wpb=431214, bsz=16439.9, num_updates=23100, lr=0.000416125, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=22866 epoch 014: 1179 / 1689 loss=4.089, nll_loss=2.463, ppl=5.51, wps=373799, ups=0.87, wpb=431214, bsz=16439.9, num_updates=23100, lr=0.000416125, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=22866 epoch 014: 1179 / 1689 loss=4.089, nll_loss=2.463, ppl=5.51, wps=373799, ups=0.87, wpb=431214, bsz=16439.9, num_updates=23100, lr=0.000416125, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=22866 epoch 014: 1179 / 1689 loss=4.089, nll_loss=2.463, ppl=5.51, wps=373799, ups=0.87, wpb=431214, bsz=16439.9, num_updates=23100, lr=0.000416125, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=22866 epoch 014: 1179 / 1689 loss=4.089, nll_loss=2.463, ppl=5.51, wps=373799, ups=0.87, wpb=431214, bsz=16439.9, num_updates=23100, lr=0.000416125, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=22866 epoch 014: 1179 / 1689 loss=4.089, nll_loss=2.463, ppl=5.51, wps=373799, ups=0.87, wpb=431214, bsz=16439.9, num_updates=23100, lr=0.000416125, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=22866 epoch 014: 1280 / 1689 loss=4.097, nll_loss=2.472, ppl=5.55, wps=460055, ups=1.06, wpb=434171, bsz=16446.7, num_updates=23200, lr=0.000415227, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=22961 epoch 014: 1280 / 1689 loss=4.097, nll_loss=2.472, ppl=5.55, wps=460055, ups=1.06, wpb=434171, bsz=16446.7, num_updates=23200, lr=0.000415227, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=22961 epoch 014: 1280 / 1689 loss=4.097, nll_loss=2.472, ppl=5.55, wps=460055, ups=1.06, wpb=434171, bsz=16446.7, num_updates=23200, lr=0.000415227, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=22961 epoch 014: 1280 / 1689 loss=4.097, nll_loss=2.472, ppl=5.55, wps=460055, ups=1.06, wpb=434171, bsz=16446.7, num_updates=23200, lr=0.000415227, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=22961 epoch 014: 1280 / 1689 loss=4.097, nll_loss=2.472, ppl=5.55, wps=460055, ups=1.06, wpb=434171, bsz=16446.7, num_updates=23200, lr=0.000415227, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=22961 epoch 014: 1280 / 1689 loss=4.097, nll_loss=2.472, ppl=5.55, wps=460055, ups=1.06, wpb=434171, bsz=16446.7, num_updates=23200, lr=0.000415227, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=22961 epoch 014: 1280 / 1689 loss=4.097, nll_loss=2.472, ppl=5.55, wps=460055, ups=1.06, wpb=434171, bsz=16446.7, num_updates=23200, lr=0.000415227, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=22961 epoch 014: 1280 / 1689 loss=4.097, nll_loss=2.472, ppl=5.55, wps=460055, ups=1.06, wpb=434171, bsz=16446.7, num_updates=23200, lr=0.000415227, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=22961 epoch 014: 1280 / 1689 loss=4.097, nll_loss=2.472, ppl=5.55, wps=460055, ups=1.06, wpb=434171, bsz=16446.7, num_updates=23200, lr=0.000415227, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=22961 epoch 014: 1280 / 1689 loss=4.097, nll_loss=2.472, ppl=5.55, wps=460055, ups=1.06, wpb=434171, bsz=16446.7, num_updates=23200, lr=0.000415227, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=22961 epoch 014: 1280 / 1689 loss=4.097, nll_loss=2.472, ppl=5.55, wps=460055, ups=1.06, wpb=434171, bsz=16446.7, num_updates=23200, lr=0.000415227, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=22961 epoch 014: 1280 / 1689 loss=4.097, nll_loss=2.472, ppl=5.55, wps=460055, ups=1.06, wpb=434171, bsz=16446.7, num_updates=23200, lr=0.000415227, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=22961 epoch 014: 1280 / 1689 loss=4.097, nll_loss=2.472, ppl=5.55, wps=460055, ups=1.06, wpb=434171, bsz=16446.7, num_updates=23200, lr=0.000415227, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=22961 epoch 014: 1280 / 1689 loss=4.097, nll_loss=2.472, ppl=5.55, wps=460055, ups=1.06, wpb=434171, bsz=16446.7, num_updates=23200, lr=0.000415227, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=22961 epoch 014: 1380 / 1689 loss=4.112, nll_loss=2.489, ppl=5.61, wps=463778, ups=1.07, wpb=435275, bsz=16298.9, num_updates=23300, lr=0.000414335, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=23055 epoch 014: 1380 / 1689 loss=4.112, nll_loss=2.489, ppl=5.61, wps=463778, ups=1.07, wpb=435275, bsz=16298.9, num_updates=23300, lr=0.000414335, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=23055 epoch 014: 1380 / 1689 loss=4.112, nll_loss=2.489, ppl=5.61, wps=463778, ups=1.07, wpb=435275, bsz=16298.9, num_updates=23300, lr=0.000414335, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=23055 epoch 014: 1380 / 1689 loss=4.112, nll_loss=2.489, ppl=5.61, wps=463778, ups=1.07, wpb=435275, bsz=16298.9, num_updates=23300, lr=0.000414335, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=23055 epoch 014: 1380 / 1689 loss=4.112, nll_loss=2.489, ppl=5.61, wps=463778, ups=1.07, wpb=435275, bsz=16298.9, num_updates=23300, lr=0.000414335, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=23055 epoch 014: 1380 / 1689 loss=4.112, nll_loss=2.489, ppl=5.61, wps=463778, ups=1.07, wpb=435275, bsz=16298.9, num_updates=23300, lr=0.000414335, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=23055 epoch 014: 1380 / 1689 loss=4.112, nll_loss=2.489, ppl=5.61, wps=463778, ups=1.07, wpb=435275, bsz=16298.9, num_updates=23300, lr=0.000414335, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=23055 epoch 014: 1380 / 1689 loss=4.112, nll_loss=2.489, ppl=5.61, wps=463778, ups=1.07, wpb=435275, bsz=16298.9, num_updates=23300, lr=0.000414335, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=23055 epoch 014: 1380 / 1689 loss=4.112, nll_loss=2.489, ppl=5.61, wps=463778, ups=1.07, wpb=435275, bsz=16298.9, num_updates=23300, lr=0.000414335, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=23055 epoch 014: 1380 / 1689 loss=4.112, nll_loss=2.489, ppl=5.61, wps=463778, ups=1.07, wpb=435275, bsz=16298.9, num_updates=23300, lr=0.000414335, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=23055 epoch 014: 1380 / 1689 loss=4.112, nll_loss=2.489, ppl=5.61, wps=463778, ups=1.07, wpb=435275, bsz=16298.9, num_updates=23300, lr=0.000414335, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=23055 epoch 014: 1380 / 1689 loss=4.112, nll_loss=2.489, ppl=5.61, wps=463778, ups=1.07, wpb=435275, bsz=16298.9, num_updates=23300, lr=0.000414335, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=23055 epoch 014: 1380 / 1689 loss=4.112, nll_loss=2.489, ppl=5.61, wps=463778, ups=1.07, wpb=435275, bsz=16298.9, num_updates=23300, lr=0.000414335, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=23055 epoch 014: 1380 / 1689 loss=4.112, nll_loss=2.489, ppl=5.61, wps=463778, ups=1.07, wpb=435275, bsz=16298.9, num_updates=23300, lr=0.000414335, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=23055 epoch 014: 1480 / 1689 loss=4.101, nll_loss=2.476, ppl=5.56, wps=464690, ups=1.07, wpb=434770, bsz=16554.6, num_updates=23400, lr=0.000413449, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=17.7, wall=23148 epoch 014: 1480 / 1689 loss=4.101, nll_loss=2.476, ppl=5.56, wps=464690, ups=1.07, wpb=434770, bsz=16554.6, num_updates=23400, lr=0.000413449, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=17.7, wall=23148 epoch 014: 1480 / 1689 loss=4.101, nll_loss=2.476, ppl=5.56, wps=464690, ups=1.07, wpb=434770, bsz=16554.6, num_updates=23400, lr=0.000413449, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=17.7, wall=23148 epoch 014: 1480 / 1689 loss=4.101, nll_loss=2.476, ppl=5.56, wps=464690, ups=1.07, wpb=434770, bsz=16554.6, num_updates=23400, lr=0.000413449, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=17.7, wall=23148 epoch 014: 1480 / 1689 loss=4.101, nll_loss=2.476, ppl=5.56, wps=464690, ups=1.07, wpb=434770, bsz=16554.6, num_updates=23400, lr=0.000413449, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=17.7, wall=23148 epoch 014: 1480 / 1689 loss=4.101, nll_loss=2.476, ppl=5.56, wps=464690, ups=1.07, wpb=434770, bsz=16554.6, num_updates=23400, lr=0.000413449, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=17.7, wall=23148 epoch 014: 1480 / 1689 loss=4.101, nll_loss=2.476, ppl=5.56, wps=464690, ups=1.07, wpb=434770, bsz=16554.6, num_updates=23400, lr=0.000413449, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=17.7, wall=23148 epoch 014: 1480 / 1689 loss=4.101, nll_loss=2.476, ppl=5.56, wps=464690, ups=1.07, wpb=434770, bsz=16554.6, num_updates=23400, lr=0.000413449, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=17.7, wall=23148 epoch 014: 1480 / 1689 loss=4.101, nll_loss=2.476, ppl=5.56, wps=464690, ups=1.07, wpb=434770, bsz=16554.6, num_updates=23400, lr=0.000413449, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=17.7, wall=23148 epoch 014: 1480 / 1689 loss=4.101, nll_loss=2.476, ppl=5.56, wps=464690, ups=1.07, wpb=434770, bsz=16554.6, num_updates=23400, lr=0.000413449, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=17.7, wall=23148 epoch 014: 1480 / 1689 loss=4.101, nll_loss=2.476, ppl=5.56, wps=464690, ups=1.07, wpb=434770, bsz=16554.6, num_updates=23400, lr=0.000413449, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=17.7, wall=23148 epoch 014: 1480 / 1689 loss=4.101, nll_loss=2.476, ppl=5.56, wps=464690, ups=1.07, wpb=434770, bsz=16554.6, num_updates=23400, lr=0.000413449, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=17.7, wall=23148 epoch 014: 1480 / 1689 loss=4.101, nll_loss=2.476, ppl=5.56, wps=464690, ups=1.07, wpb=434770, bsz=16554.6, num_updates=23400, lr=0.000413449, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=17.7, wall=23148 epoch 014: 1480 / 1689 loss=4.101, nll_loss=2.476, ppl=5.56, wps=464690, ups=1.07, wpb=434770, bsz=16554.6, num_updates=23400, lr=0.000413449, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=17.7, wall=23148 epoch 014: 1580 / 1689 loss=4.111, nll_loss=2.488, ppl=5.61, wps=463724, ups=1.07, wpb=433552, bsz=16655.3, num_updates=23500, lr=0.000412568, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.3, wall=23242 epoch 014: 1580 / 1689 loss=4.111, nll_loss=2.488, ppl=5.61, wps=463724, ups=1.07, wpb=433552, bsz=16655.3, num_updates=23500, lr=0.000412568, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.3, wall=23242 epoch 014: 1580 / 1689 loss=4.111, nll_loss=2.488, ppl=5.61, wps=463724, ups=1.07, wpb=433552, bsz=16655.3, num_updates=23500, lr=0.000412568, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.3, wall=23242 epoch 014: 1580 / 1689 loss=4.111, nll_loss=2.488, ppl=5.61, wps=463724, ups=1.07, wpb=433552, bsz=16655.3, num_updates=23500, lr=0.000412568, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.3, wall=23242 epoch 014: 1580 / 1689 loss=4.111, nll_loss=2.488, ppl=5.61, wps=463724, ups=1.07, wpb=433552, bsz=16655.3, num_updates=23500, lr=0.000412568, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.3, wall=23242 epoch 014: 1580 / 1689 loss=4.111, nll_loss=2.488, ppl=5.61, wps=463724, ups=1.07, wpb=433552, bsz=16655.3, num_updates=23500, lr=0.000412568, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.3, wall=23242 epoch 014: 1580 / 1689 loss=4.111, nll_loss=2.488, ppl=5.61, wps=463724, ups=1.07, wpb=433552, bsz=16655.3, num_updates=23500, lr=0.000412568, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.3, wall=23242 epoch 014: 1580 / 1689 loss=4.111, nll_loss=2.488, ppl=5.61, wps=463724, ups=1.07, wpb=433552, bsz=16655.3, num_updates=23500, lr=0.000412568, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.3, wall=23242 epoch 014: 1580 / 1689 loss=4.111, nll_loss=2.488, ppl=5.61, wps=463724, ups=1.07, wpb=433552, bsz=16655.3, num_updates=23500, lr=0.000412568, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.3, wall=23242 epoch 014: 1580 / 1689 loss=4.111, nll_loss=2.488, ppl=5.61, wps=463724, ups=1.07, wpb=433552, bsz=16655.3, num_updates=23500, lr=0.000412568, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.3, wall=23242 epoch 014: 1580 / 1689 loss=4.111, nll_loss=2.488, ppl=5.61, wps=463724, ups=1.07, wpb=433552, bsz=16655.3, num_updates=23500, lr=0.000412568, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.3, wall=23242 epoch 014: 1580 / 1689 loss=4.111, nll_loss=2.488, ppl=5.61, wps=463724, ups=1.07, wpb=433552, bsz=16655.3, num_updates=23500, lr=0.000412568, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.3, wall=23242 epoch 014: 1580 / 1689 loss=4.111, nll_loss=2.488, ppl=5.61, wps=463724, ups=1.07, wpb=433552, bsz=16655.3, num_updates=23500, lr=0.000412568, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.3, wall=23242 epoch 014: 1580 / 1689 loss=4.111, nll_loss=2.488, ppl=5.61, wps=463724, ups=1.07, wpb=433552, bsz=16655.3, num_updates=23500, lr=0.000412568, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.3, wall=23242 epoch 014: 1680 / 1689 loss=4.096, nll_loss=2.47, ppl=5.54, wps=461399, ups=1.07, wpb=432308, bsz=16450.5, num_updates=23600, lr=0.000411693, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=23335 epoch 014: 1680 / 1689 loss=4.096, nll_loss=2.47, ppl=5.54, wps=461399, ups=1.07, wpb=432308, bsz=16450.5, num_updates=23600, lr=0.000411693, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=23335 epoch 014: 1680 / 1689 loss=4.096, nll_loss=2.47, ppl=5.54, wps=461399, ups=1.07, wpb=432308, bsz=16450.5, num_updates=23600, lr=0.000411693, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=23335 epoch 014: 1680 / 1689 loss=4.096, nll_loss=2.47, ppl=5.54, wps=461399, ups=1.07, wpb=432308, bsz=16450.5, num_updates=23600, lr=0.000411693, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=23335 epoch 014: 1680 / 1689 loss=4.096, nll_loss=2.47, ppl=5.54, wps=461399, ups=1.07, wpb=432308, bsz=16450.5, num_updates=23600, lr=0.000411693, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=23335 epoch 014: 1680 / 1689 loss=4.096, nll_loss=2.47, ppl=5.54, wps=461399, ups=1.07, wpb=432308, bsz=16450.5, num_updates=23600, lr=0.000411693, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=23335 epoch 014: 1680 / 1689 loss=4.096, nll_loss=2.47, ppl=5.54, wps=461399, ups=1.07, wpb=432308, bsz=16450.5, num_updates=23600, lr=0.000411693, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=23335 epoch 014: 1680 / 1689 loss=4.096, nll_loss=2.47, ppl=5.54, wps=461399, ups=1.07, wpb=432308, bsz=16450.5, num_updates=23600, lr=0.000411693, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=23335 epoch 014: 1680 / 1689 loss=4.096, nll_loss=2.47, ppl=5.54, wps=461399, ups=1.07, wpb=432308, bsz=16450.5, num_updates=23600, lr=0.000411693, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=23335 epoch 014: 1680 / 1689 loss=4.096, nll_loss=2.47, ppl=5.54, wps=461399, ups=1.07, wpb=432308, bsz=16450.5, num_updates=23600, lr=0.000411693, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=23335 epoch 014: 1680 / 1689 loss=4.096, nll_loss=2.47, ppl=5.54, wps=461399, ups=1.07, wpb=432308, bsz=16450.5, num_updates=23600, lr=0.000411693, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=23335 epoch 014: 1680 / 1689 loss=4.096, nll_loss=2.47, ppl=5.54, wps=461399, ups=1.07, wpb=432308, bsz=16450.5, num_updates=23600, lr=0.000411693, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=23335 epoch 014: 1680 / 1689 loss=4.096, nll_loss=2.47, ppl=5.54, wps=461399, ups=1.07, wpb=432308, bsz=16450.5, num_updates=23600, lr=0.000411693, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=23335 epoch 014: 1680 / 1689 loss=4.096, nll_loss=2.47, ppl=5.54, wps=461399, ups=1.07, wpb=432308, bsz=16450.5, num_updates=23600, lr=0.000411693, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=23335 end of epoch 14 (average epoch stats below) epoch 014 | loss 4.089 | nll_loss 2.462 | ppl 5.51 | wps 450006 | ups 1.04 | wpb 433524 | bsz 16506.9 | num_updates 23609 | lr 0.000411615 | gnorm 0.214 | clip 0 | loss_scale 0.5 | train_wall 1564 | gb_free 21.7 | wall 23343 epoch 014 | loss 4.089 | nll_loss 2.462 | ppl 5.51 | wps 450006 | ups 1.04 | wpb 433524 | bsz 16506.9 | num_updates 23609 | lr 0.000411615 | gnorm 0.214 | clip 0 | loss_scale 0.5 | train_wall 1564 | gb_free 21.7 | wall 23343 epoch 014 | loss 4.089 | nll_loss 2.462 | ppl 5.51 | wps 450006 | ups 1.04 | wpb 433524 | bsz 16506.9 | num_updates 23609 | lr 0.000411615 | gnorm 0.214 | clip 0 | loss_scale 0.5 | train_wall 1564 | gb_free 21.7 | wall 23343 epoch 014 | loss 4.089 | nll_loss 2.462 | ppl 5.51 | wps 450006 | ups 1.04 | wpb 433524 | bsz 16506.9 | num_updates 23609 | lr 0.000411615 | gnorm 0.214 | clip 0 | loss_scale 0.5 | train_wall 1564 | gb_free 21.7 | wall 23343 epoch 014 | loss 4.089 | nll_loss 2.462 | ppl 5.51 | wps 450006 | ups 1.04 | wpb 433524 | bsz 16506.9 | num_updates 23609 | lr 0.000411615 | gnorm 0.214 | clip 0 | loss_scale 0.5 | train_wall 1564 | gb_free 21.7 | wall 23343 epoch 014 | loss 4.089 | nll_loss 2.462 | ppl 5.51 | wps 450006 | ups 1.04 | wpb 433524 | bsz 16506.9 | num_updates 23609 | lr 0.000411615 | gnorm 0.214 | clip 0 | loss_scale 0.5 | train_wall 1564 | gb_free 21.7 | wall 23343 epoch 014 | loss 4.089 | nll_loss 2.462 | ppl 5.51 | wps 450006 | ups 1.04 | wpb 433524 | bsz 16506.9 | num_updates 23609 | lr 0.000411615 | gnorm 0.214 | clip 0 | loss_scale 0.5 | train_wall 1564 | gb_free 21.7 | wall 23343 epoch 014 | loss 4.089 | nll_loss 2.462 | ppl 5.51 | wps 450006 | ups 1.04 | wpb 433524 | bsz 16506.9 | num_updates 23609 | lr 0.000411615 | gnorm 0.214 | clip 0 | loss_scale 0.5 | train_wall 1564 | gb_free 21.7 | wall 23343 epoch 014 | loss 4.089 | nll_loss 2.462 | ppl 5.51 | wps 450006 | ups 1.04 | wpb 433524 | bsz 16506.9 | num_updates 23609 | lr 0.000411615 | gnorm 0.214 | clip 0 | loss_scale 0.5 | train_wall 1564 | gb_free 21.7 | wall 23343 epoch 014 | loss 4.089 | nll_loss 2.462 | ppl 5.51 | wps 450006 | ups 1.04 | wpb 433524 | bsz 16506.9 | num_updates 23609 | lr 0.000411615 | gnorm 0.214 | clip 0 | loss_scale 0.5 | train_wall 1564 | gb_free 21.7 | wall 23343 epoch 014 | loss 4.089 | nll_loss 2.462 | ppl 5.51 | wps 450006 | ups 1.04 | wpb 433524 | bsz 16506.9 | num_updates 23609 | lr 0.000411615 | gnorm 0.214 | clip 0 | loss_scale 0.5 | train_wall 1564 | gb_free 21.7 | wall 23343 epoch 014 | loss 4.089 | nll_loss 2.462 | ppl 5.51 | wps 450006 | ups 1.04 | wpb 433524 | bsz 16506.9 | num_updates 23609 | lr 0.000411615 | gnorm 0.214 | clip 0 | loss_scale 0.5 | train_wall 1564 | gb_free 21.7 | wall 23343 epoch 014 | loss 4.089 | nll_loss 2.462 | ppl 5.51 | wps 450006 | ups 1.04 | wpb 433524 | bsz 16506.9 | num_updates 23609 | lr 0.000411615 | gnorm 0.214 | clip 0 | loss_scale 0.5 | train_wall 1564 | gb_free 21.7 | wall 23343 epoch 014 | loss 4.089 | nll_loss 2.462 | ppl 5.51 | wps 450006 | ups 1.04 | wpb 433524 | bsz 16506.9 | num_updates 23609 | lr 0.000411615 | gnorm 0.214 | clip 0 | loss_scale 0.5 | train_wall 1564 | gb_free 21.7 | wall 23343 Start iterating over samples epoch 015: 91 / 1689 loss=4.044, nll_loss=2.41, ppl=5.31, wps=458841, ups=1.07, wpb=430541, bsz=16180.3, num_updates=23700, lr=0.000410824, gnorm=0.204, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=23429 epoch 015: 91 / 1689 loss=4.044, nll_loss=2.41, ppl=5.31, wps=458841, ups=1.07, wpb=430541, bsz=16180.3, num_updates=23700, lr=0.000410824, gnorm=0.204, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=23429 epoch 015: 91 / 1689 loss=4.044, nll_loss=2.41, ppl=5.31, wps=458841, ups=1.07, wpb=430541, bsz=16180.3, num_updates=23700, lr=0.000410824, gnorm=0.204, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=23429 epoch 015: 91 / 1689 loss=4.044, nll_loss=2.41, ppl=5.31, wps=458841, ups=1.07, wpb=430541, bsz=16180.3, num_updates=23700, lr=0.000410824, gnorm=0.204, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=23429 epoch 015: 91 / 1689 loss=4.044, nll_loss=2.41, ppl=5.31, wps=458841, ups=1.07, wpb=430541, bsz=16180.3, num_updates=23700, lr=0.000410824, gnorm=0.204, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=23429 epoch 015: 91 / 1689 loss=4.044, nll_loss=2.41, ppl=5.31, wps=458841, ups=1.07, wpb=430541, bsz=16180.3, num_updates=23700, lr=0.000410824, gnorm=0.204, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=23429 epoch 015: 91 / 1689 loss=4.044, nll_loss=2.41, ppl=5.31, wps=458841, ups=1.07, wpb=430541, bsz=16180.3, num_updates=23700, lr=0.000410824, gnorm=0.204, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=23429 epoch 015: 91 / 1689 loss=4.044, nll_loss=2.41, ppl=5.31, wps=458841, ups=1.07, wpb=430541, bsz=16180.3, num_updates=23700, lr=0.000410824, gnorm=0.204, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=23429 epoch 015: 91 / 1689 loss=4.044, nll_loss=2.41, ppl=5.31, wps=458841, ups=1.07, wpb=430541, bsz=16180.3, num_updates=23700, lr=0.000410824, gnorm=0.204, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=23429 epoch 015: 91 / 1689 loss=4.044, nll_loss=2.41, ppl=5.31, wps=458841, ups=1.07, wpb=430541, bsz=16180.3, num_updates=23700, lr=0.000410824, gnorm=0.204, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=23429 epoch 015: 91 / 1689 loss=4.044, nll_loss=2.41, ppl=5.31, wps=458841, ups=1.07, wpb=430541, bsz=16180.3, num_updates=23700, lr=0.000410824, gnorm=0.204, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=23429 epoch 015: 91 / 1689 loss=4.044, nll_loss=2.41, ppl=5.31, wps=458841, ups=1.07, wpb=430541, bsz=16180.3, num_updates=23700, lr=0.000410824, gnorm=0.204, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=23429 epoch 015: 91 / 1689 loss=4.044, nll_loss=2.41, ppl=5.31, wps=458841, ups=1.07, wpb=430541, bsz=16180.3, num_updates=23700, lr=0.000410824, gnorm=0.204, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=23429 epoch 015: 91 / 1689 loss=4.044, nll_loss=2.41, ppl=5.31, wps=458841, ups=1.07, wpb=430541, bsz=16180.3, num_updates=23700, lr=0.000410824, gnorm=0.204, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=23429 epoch 015: 91 / 1689 loss=4.044, nll_loss=2.41, ppl=5.31, wps=458841, ups=1.07, wpb=430541, bsz=16180.3, num_updates=23700, lr=0.000410824, gnorm=0.204, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=23429 epoch 015: 191 / 1689 loss=4.053, nll_loss=2.42, ppl=5.35, wps=456890, ups=1.05, wpb=434453, bsz=16475.4, num_updates=23800, lr=0.00040996, gnorm=0.218, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=23524 epoch 015: 191 / 1689 loss=4.053, nll_loss=2.42, ppl=5.35, wps=456890, ups=1.05, wpb=434453, bsz=16475.4, num_updates=23800, lr=0.00040996, gnorm=0.218, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=23524 epoch 015: 191 / 1689 loss=4.053, nll_loss=2.42, ppl=5.35, wps=456890, ups=1.05, wpb=434453, bsz=16475.4, num_updates=23800, lr=0.00040996, gnorm=0.218, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=23524 epoch 015: 191 / 1689 loss=4.053, nll_loss=2.42, ppl=5.35, wps=456890, ups=1.05, wpb=434453, bsz=16475.4, num_updates=23800, lr=0.00040996, gnorm=0.218, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=23524 epoch 015: 191 / 1689 loss=4.053, nll_loss=2.42, ppl=5.35, wps=456890, ups=1.05, wpb=434453, bsz=16475.4, num_updates=23800, lr=0.00040996, gnorm=0.218, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=23524 epoch 015: 191 / 1689 loss=4.053, nll_loss=2.42, ppl=5.35, wps=456890, ups=1.05, wpb=434453, bsz=16475.4, num_updates=23800, lr=0.00040996, gnorm=0.218, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=23524 epoch 015: 191 / 1689 loss=4.053, nll_loss=2.42, ppl=5.35, wps=456890, ups=1.05, wpb=434453, bsz=16475.4, num_updates=23800, lr=0.00040996, gnorm=0.218, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=23524 epoch 015: 191 / 1689 loss=4.053, nll_loss=2.42, ppl=5.35, wps=456890, ups=1.05, wpb=434453, bsz=16475.4, num_updates=23800, lr=0.00040996, gnorm=0.218, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=23524 epoch 015: 191 / 1689 loss=4.053, nll_loss=2.42, ppl=5.35, wps=456890, ups=1.05, wpb=434453, bsz=16475.4, num_updates=23800, lr=0.00040996, gnorm=0.218, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=23524 epoch 015: 191 / 1689 loss=4.053, nll_loss=2.42, ppl=5.35, wps=456890, ups=1.05, wpb=434453, bsz=16475.4, num_updates=23800, lr=0.00040996, gnorm=0.218, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=23524 epoch 015: 191 / 1689 loss=4.053, nll_loss=2.42, ppl=5.35, wps=456890, ups=1.05, wpb=434453, bsz=16475.4, num_updates=23800, lr=0.00040996, gnorm=0.218, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=23524 epoch 015: 191 / 1689 loss=4.053, nll_loss=2.42, ppl=5.35, wps=456890, ups=1.05, wpb=434453, bsz=16475.4, num_updates=23800, lr=0.00040996, gnorm=0.218, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=23524 epoch 015: 191 / 1689 loss=4.053, nll_loss=2.42, ppl=5.35, wps=456890, ups=1.05, wpb=434453, bsz=16475.4, num_updates=23800, lr=0.00040996, gnorm=0.218, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=23524 epoch 015: 191 / 1689 loss=4.053, nll_loss=2.42, ppl=5.35, wps=456890, ups=1.05, wpb=434453, bsz=16475.4, num_updates=23800, lr=0.00040996, gnorm=0.218, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=23524 epoch 015: 191 / 1689 loss=4.053, nll_loss=2.42, ppl=5.35, wps=456890, ups=1.05, wpb=434453, bsz=16475.4, num_updates=23800, lr=0.00040996, gnorm=0.218, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=23524 epoch 015: 291 / 1689 loss=4.062, nll_loss=2.431, ppl=5.39, wps=458867, ups=1.06, wpb=434148, bsz=16533.4, num_updates=23900, lr=0.000409101, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=17.5, wall=23619 epoch 015: 291 / 1689 loss=4.062, nll_loss=2.431, ppl=5.39, wps=458867, ups=1.06, wpb=434148, bsz=16533.4, num_updates=23900, lr=0.000409101, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=17.5, wall=23619 epoch 015: 291 / 1689 loss=4.062, nll_loss=2.431, ppl=5.39, wps=458867, ups=1.06, wpb=434148, bsz=16533.4, num_updates=23900, lr=0.000409101, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=17.5, wall=23619 epoch 015: 291 / 1689 loss=4.062, nll_loss=2.431, ppl=5.39, wps=458867, ups=1.06, wpb=434148, bsz=16533.4, num_updates=23900, lr=0.000409101, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=17.5, wall=23619 epoch 015: 291 / 1689 loss=4.062, nll_loss=2.431, ppl=5.39, wps=458867, ups=1.06, wpb=434148, bsz=16533.4, num_updates=23900, lr=0.000409101, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=17.5, wall=23619 epoch 015: 291 / 1689 loss=4.062, nll_loss=2.431, ppl=5.39, wps=458867, ups=1.06, wpb=434148, bsz=16533.4, num_updates=23900, lr=0.000409101, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=17.5, wall=23619 epoch 015: 291 / 1689 loss=4.062, nll_loss=2.431, ppl=5.39, wps=458867, ups=1.06, wpb=434148, bsz=16533.4, num_updates=23900, lr=0.000409101, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=17.5, wall=23619 epoch 015: 291 / 1689 loss=4.062, nll_loss=2.431, ppl=5.39, wps=458867, ups=1.06, wpb=434148, bsz=16533.4, num_updates=23900, lr=0.000409101, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=17.5, wall=23619 epoch 015: 291 / 1689 loss=4.062, nll_loss=2.431, ppl=5.39, wps=458867, ups=1.06, wpb=434148, bsz=16533.4, num_updates=23900, lr=0.000409101, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=17.5, wall=23619 epoch 015: 291 / 1689 loss=4.062, nll_loss=2.431, ppl=5.39, wps=458867, ups=1.06, wpb=434148, bsz=16533.4, num_updates=23900, lr=0.000409101, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=17.5, wall=23619 epoch 015: 291 / 1689 loss=4.062, nll_loss=2.431, ppl=5.39, wps=458867, ups=1.06, wpb=434148, bsz=16533.4, num_updates=23900, lr=0.000409101, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=17.5, wall=23619 epoch 015: 291 / 1689 loss=4.062, nll_loss=2.431, ppl=5.39, wps=458867, ups=1.06, wpb=434148, bsz=16533.4, num_updates=23900, lr=0.000409101, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=17.5, wall=23619 epoch 015: 291 / 1689 loss=4.062, nll_loss=2.431, ppl=5.39, wps=458867, ups=1.06, wpb=434148, bsz=16533.4, num_updates=23900, lr=0.000409101, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=17.5, wall=23619 epoch 015: 291 / 1689 loss=4.062, nll_loss=2.431, ppl=5.39, wps=458867, ups=1.06, wpb=434148, bsz=16533.4, num_updates=23900, lr=0.000409101, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=17.5, wall=23619 epoch 015: 291 / 1689 loss=4.062, nll_loss=2.431, ppl=5.39, wps=458867, ups=1.06, wpb=434148, bsz=16533.4, num_updates=23900, lr=0.000409101, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=17.5, wall=23619 epoch 015: 391 / 1689 loss=4.078, nll_loss=2.449, ppl=5.46, wps=458638, ups=1.06, wpb=434465, bsz=16846.2, num_updates=24000, lr=0.000408248, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=23714 epoch 015: 391 / 1689 loss=4.078, nll_loss=2.449, ppl=5.46, wps=458638, ups=1.06, wpb=434465, bsz=16846.2, num_updates=24000, lr=0.000408248, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=23714 epoch 015: 391 / 1689 loss=4.078, nll_loss=2.449, ppl=5.46, wps=458638, ups=1.06, wpb=434465, bsz=16846.2, num_updates=24000, lr=0.000408248, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=23714 epoch 015: 391 / 1689 loss=4.078, nll_loss=2.449, ppl=5.46, wps=458638, ups=1.06, wpb=434465, bsz=16846.2, num_updates=24000, lr=0.000408248, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=23714 epoch 015: 391 / 1689 loss=4.078, nll_loss=2.449, ppl=5.46, wps=458638, ups=1.06, wpb=434465, bsz=16846.2, num_updates=24000, lr=0.000408248, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=23714 epoch 015: 391 / 1689 loss=4.078, nll_loss=2.449, ppl=5.46, wps=458638, ups=1.06, wpb=434465, bsz=16846.2, num_updates=24000, lr=0.000408248, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=23714 epoch 015: 391 / 1689 loss=4.078, nll_loss=2.449, ppl=5.46, wps=458638, ups=1.06, wpb=434465, bsz=16846.2, num_updates=24000, lr=0.000408248, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=23714 epoch 015: 391 / 1689 loss=4.078, nll_loss=2.449, ppl=5.46, wps=458638, ups=1.06, wpb=434465, bsz=16846.2, num_updates=24000, lr=0.000408248, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=23714 epoch 015: 391 / 1689 loss=4.078, nll_loss=2.449, ppl=5.46, wps=458638, ups=1.06, wpb=434465, bsz=16846.2, num_updates=24000, lr=0.000408248, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=23714 epoch 015: 391 / 1689 loss=4.078, nll_loss=2.449, ppl=5.46, wps=458638, ups=1.06, wpb=434465, bsz=16846.2, num_updates=24000, lr=0.000408248, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=23714 epoch 015: 391 / 1689 loss=4.078, nll_loss=2.449, ppl=5.46, wps=458638, ups=1.06, wpb=434465, bsz=16846.2, num_updates=24000, lr=0.000408248, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=23714 epoch 015: 391 / 1689 loss=4.078, nll_loss=2.449, ppl=5.46, wps=458638, ups=1.06, wpb=434465, bsz=16846.2, num_updates=24000, lr=0.000408248, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=23714 epoch 015: 391 / 1689 loss=4.078, nll_loss=2.449, ppl=5.46, wps=458638, ups=1.06, wpb=434465, bsz=16846.2, num_updates=24000, lr=0.000408248, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=23714 epoch 015: 391 / 1689 loss=4.078, nll_loss=2.449, ppl=5.46, wps=458638, ups=1.06, wpb=434465, bsz=16846.2, num_updates=24000, lr=0.000408248, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=23714 epoch 015: 391 / 1689 loss=4.078, nll_loss=2.449, ppl=5.46, wps=458638, ups=1.06, wpb=434465, bsz=16846.2, num_updates=24000, lr=0.000408248, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.2, wall=23714 begin validation on "valid" subset epoch 015 | valid on 'valid' subset | loss 4.289 | nll_loss 2.672 | ppl 6.37 | wps 0 | wpb 42662 | bsz 2032 | num_updates 24000 | best_loss 4.271 epoch 015 | valid on 'valid' subset | loss 4.289 | nll_loss 2.672 | ppl 6.37 | wps 0 | wpb 42662 | bsz 2032 | num_updates 24000 | best_loss 4.271 epoch 015 | valid on 'valid' subset | loss 4.289 | nll_loss 2.672 | ppl 6.37 | wps 0 | wpb 42662 | bsz 2032 | num_updates 24000 | best_loss 4.271 epoch 015 | valid on 'valid' subset | loss 4.289 | nll_loss 2.672 | ppl 6.37 | wps 0 | wpb 42662 | bsz 2032 | num_updates 24000 | best_loss 4.271 epoch 015 | valid on 'valid' subset | loss 4.289 | nll_loss 2.672 | ppl 6.37 | wps 0 | wpb 42662 | bsz 2032 | num_updates 24000 | best_loss 4.271 epoch 015 | valid on 'valid' subset | loss 4.289 | nll_loss 2.672 | ppl 6.37 | wps 0 | wpb 42662 | bsz 2032 | num_updates 24000 | best_loss 4.271 epoch 015 | valid on 'valid' subset | loss 4.289 | nll_loss 2.672 | ppl 6.37 | wps 0 | wpb 42662 | bsz 2032 | num_updates 24000 | best_loss 4.271 epoch 015 | valid on 'valid' subset | loss 4.289 | nll_loss 2.672 | ppl 6.37 | wps 0 | wpb 42662 | bsz 2032 | num_updates 24000 | best_loss 4.271 epoch 015 | valid on 'valid' subset | loss 4.289 | nll_loss 2.672 | ppl 6.37 | wps 0 | wpb 42662 | bsz 2032 | num_updates 24000 | best_loss 4.271 epoch 015 | valid on 'valid' subset | loss 4.289 | nll_loss 2.672 | ppl 6.37 | wps 0 | wpb 42662 | bsz 2032 | num_updates 24000 | best_loss 4.271 epoch 015 | valid on 'valid' subset | loss 4.289 | nll_loss 2.672 | ppl 6.37 | wps 0 | wpb 42662 | bsz 2032 | num_updates 24000 | best_loss 4.271 epoch 015 | valid on 'valid' subset | loss 4.289 | nll_loss 2.672 | ppl 6.37 | wps 0 | wpb 42662 | bsz 2032 | num_updates 24000 | best_loss 4.271 epoch 015 | valid on 'valid' subset | loss 4.289 | nll_loss 2.672 | ppl 6.37 | wps 0 | wpb 42662 | bsz 2032 | num_updates 24000 | best_loss 4.271 epoch 015 | valid on 'valid' subset | loss 4.289 | nll_loss 2.672 | ppl 6.37 | wps 0 | wpb 42662 | bsz 2032 | num_updates 24000 | best_loss 4.271 epoch 015 | valid on 'valid' subset | loss 4.289 | nll_loss 2.672 | ppl 6.37 | wps 0 | wpb 42662 | bsz 2032 | num_updates 24000 | best_loss 4.271 epoch 015: 491 / 1689 loss=4.073, nll_loss=2.444, ppl=5.44, wps=400506, ups=0.92, wpb=435414, bsz=16726.7, num_updates=24100, lr=0.0004074, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=23822 epoch 015: 491 / 1689 loss=4.073, nll_loss=2.444, ppl=5.44, wps=400506, ups=0.92, wpb=435414, bsz=16726.7, num_updates=24100, lr=0.0004074, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=23822 epoch 015: 491 / 1689 loss=4.073, nll_loss=2.444, ppl=5.44, wps=400506, ups=0.92, wpb=435414, bsz=16726.7, num_updates=24100, lr=0.0004074, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=23822 epoch 015: 491 / 1689 loss=4.073, nll_loss=2.444, ppl=5.44, wps=400506, ups=0.92, wpb=435414, bsz=16726.7, num_updates=24100, lr=0.0004074, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=23822 epoch 015: 491 / 1689 loss=4.073, nll_loss=2.444, ppl=5.44, wps=400506, ups=0.92, wpb=435414, bsz=16726.7, num_updates=24100, lr=0.0004074, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=23822 epoch 015: 491 / 1689 loss=4.073, nll_loss=2.444, ppl=5.44, wps=400506, ups=0.92, wpb=435414, bsz=16726.7, num_updates=24100, lr=0.0004074, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=23822 epoch 015: 491 / 1689 loss=4.073, nll_loss=2.444, ppl=5.44, wps=400506, ups=0.92, wpb=435414, bsz=16726.7, num_updates=24100, lr=0.0004074, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=23822 epoch 015: 491 / 1689 loss=4.073, nll_loss=2.444, ppl=5.44, wps=400506, ups=0.92, wpb=435414, bsz=16726.7, num_updates=24100, lr=0.0004074, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=23822 epoch 015: 491 / 1689 loss=4.073, nll_loss=2.444, ppl=5.44, wps=400506, ups=0.92, wpb=435414, bsz=16726.7, num_updates=24100, lr=0.0004074, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=23822 epoch 015: 491 / 1689 loss=4.073, nll_loss=2.444, ppl=5.44, wps=400506, ups=0.92, wpb=435414, bsz=16726.7, num_updates=24100, lr=0.0004074, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=23822 epoch 015: 491 / 1689 loss=4.073, nll_loss=2.444, ppl=5.44, wps=400506, ups=0.92, wpb=435414, bsz=16726.7, num_updates=24100, lr=0.0004074, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=23822 epoch 015: 491 / 1689 loss=4.073, nll_loss=2.444, ppl=5.44, wps=400506, ups=0.92, wpb=435414, bsz=16726.7, num_updates=24100, lr=0.0004074, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=23822 epoch 015: 491 / 1689 loss=4.073, nll_loss=2.444, ppl=5.44, wps=400506, ups=0.92, wpb=435414, bsz=16726.7, num_updates=24100, lr=0.0004074, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=23822 epoch 015: 491 / 1689 loss=4.073, nll_loss=2.444, ppl=5.44, wps=400506, ups=0.92, wpb=435414, bsz=16726.7, num_updates=24100, lr=0.0004074, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=23822 epoch 015: 491 / 1689 loss=4.073, nll_loss=2.444, ppl=5.44, wps=400506, ups=0.92, wpb=435414, bsz=16726.7, num_updates=24100, lr=0.0004074, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=23822 epoch 015: 592 / 1689 loss=4.063, nll_loss=2.432, ppl=5.4, wps=453868, ups=1.05, wpb=431632, bsz=16435.4, num_updates=24200, lr=0.000406558, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=23918 epoch 015: 592 / 1689 loss=4.063, nll_loss=2.432, ppl=5.4, wps=453868, ups=1.05, wpb=431632, bsz=16435.4, num_updates=24200, lr=0.000406558, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=23918 epoch 015: 592 / 1689 loss=4.063, nll_loss=2.432, ppl=5.4, wps=453868, ups=1.05, wpb=431632, bsz=16435.4, num_updates=24200, lr=0.000406558, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=23918 epoch 015: 592 / 1689 loss=4.063, nll_loss=2.432, ppl=5.4, wps=453868, ups=1.05, wpb=431632, bsz=16435.4, num_updates=24200, lr=0.000406558, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=23918 epoch 015: 592 / 1689 loss=4.063, nll_loss=2.432, ppl=5.4, wps=453868, ups=1.05, wpb=431632, bsz=16435.4, num_updates=24200, lr=0.000406558, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=23918 epoch 015: 592 / 1689 loss=4.063, nll_loss=2.432, ppl=5.4, wps=453868, ups=1.05, wpb=431632, bsz=16435.4, num_updates=24200, lr=0.000406558, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=23918 epoch 015: 592 / 1689 loss=4.063, nll_loss=2.432, ppl=5.4, wps=453868, ups=1.05, wpb=431632, bsz=16435.4, num_updates=24200, lr=0.000406558, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=23918 epoch 015: 592 / 1689 loss=4.063, nll_loss=2.432, ppl=5.4, wps=453868, ups=1.05, wpb=431632, bsz=16435.4, num_updates=24200, lr=0.000406558, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=23918 epoch 015: 592 / 1689 loss=4.063, nll_loss=2.432, ppl=5.4, wps=453868, ups=1.05, wpb=431632, bsz=16435.4, num_updates=24200, lr=0.000406558, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=23918 epoch 015: 592 / 1689 loss=4.063, nll_loss=2.432, ppl=5.4, wps=453868, ups=1.05, wpb=431632, bsz=16435.4, num_updates=24200, lr=0.000406558, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=23918 epoch 015: 592 / 1689 loss=4.063, nll_loss=2.432, ppl=5.4, wps=453868, ups=1.05, wpb=431632, bsz=16435.4, num_updates=24200, lr=0.000406558, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=23918 epoch 015: 592 / 1689 loss=4.063, nll_loss=2.432, ppl=5.4, wps=453868, ups=1.05, wpb=431632, bsz=16435.4, num_updates=24200, lr=0.000406558, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=23918 epoch 015: 592 / 1689 loss=4.063, nll_loss=2.432, ppl=5.4, wps=453868, ups=1.05, wpb=431632, bsz=16435.4, num_updates=24200, lr=0.000406558, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=23918 epoch 015: 592 / 1689 loss=4.063, nll_loss=2.432, ppl=5.4, wps=453868, ups=1.05, wpb=431632, bsz=16435.4, num_updates=24200, lr=0.000406558, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=23918 epoch 015: 592 / 1689 loss=4.063, nll_loss=2.432, ppl=5.4, wps=453868, ups=1.05, wpb=431632, bsz=16435.4, num_updates=24200, lr=0.000406558, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=23918 epoch 015: 692 / 1689 loss=4.071, nll_loss=2.442, ppl=5.43, wps=458426, ups=1.06, wpb=434014, bsz=16667.5, num_updates=24300, lr=0.00040572, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=24012 epoch 015: 692 / 1689 loss=4.071, nll_loss=2.442, ppl=5.43, wps=458426, ups=1.06, wpb=434014, bsz=16667.5, num_updates=24300, lr=0.00040572, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=24012 epoch 015: 692 / 1689 loss=4.071, nll_loss=2.442, ppl=5.43, wps=458426, ups=1.06, wpb=434014, bsz=16667.5, num_updates=24300, lr=0.00040572, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=24012 epoch 015: 692 / 1689 loss=4.071, nll_loss=2.442, ppl=5.43, wps=458426, ups=1.06, wpb=434014, bsz=16667.5, num_updates=24300, lr=0.00040572, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=24012 epoch 015: 692 / 1689 loss=4.071, nll_loss=2.442, ppl=5.43, wps=458426, ups=1.06, wpb=434014, bsz=16667.5, num_updates=24300, lr=0.00040572, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=24012 epoch 015: 692 / 1689 loss=4.071, nll_loss=2.442, ppl=5.43, wps=458426, ups=1.06, wpb=434014, bsz=16667.5, num_updates=24300, lr=0.00040572, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=24012 epoch 015: 692 / 1689 loss=4.071, nll_loss=2.442, ppl=5.43, wps=458426, ups=1.06, wpb=434014, bsz=16667.5, num_updates=24300, lr=0.00040572, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=24012 epoch 015: 692 / 1689 loss=4.071, nll_loss=2.442, ppl=5.43, wps=458426, ups=1.06, wpb=434014, bsz=16667.5, num_updates=24300, lr=0.00040572, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=24012 epoch 015: 692 / 1689 loss=4.071, nll_loss=2.442, ppl=5.43, wps=458426, ups=1.06, wpb=434014, bsz=16667.5, num_updates=24300, lr=0.00040572, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=24012 epoch 015: 692 / 1689 loss=4.071, nll_loss=2.442, ppl=5.43, wps=458426, ups=1.06, wpb=434014, bsz=16667.5, num_updates=24300, lr=0.00040572, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=24012 epoch 015: 692 / 1689 loss=4.071, nll_loss=2.442, ppl=5.43, wps=458426, ups=1.06, wpb=434014, bsz=16667.5, num_updates=24300, lr=0.00040572, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=24012 epoch 015: 692 / 1689 loss=4.071, nll_loss=2.442, ppl=5.43, wps=458426, ups=1.06, wpb=434014, bsz=16667.5, num_updates=24300, lr=0.00040572, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=24012 epoch 015: 692 / 1689 loss=4.071, nll_loss=2.442, ppl=5.43, wps=458426, ups=1.06, wpb=434014, bsz=16667.5, num_updates=24300, lr=0.00040572, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=24012 epoch 015: 692 / 1689 loss=4.071, nll_loss=2.442, ppl=5.43, wps=458426, ups=1.06, wpb=434014, bsz=16667.5, num_updates=24300, lr=0.00040572, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=24012 epoch 015: 692 / 1689 loss=4.071, nll_loss=2.442, ppl=5.43, wps=458426, ups=1.06, wpb=434014, bsz=16667.5, num_updates=24300, lr=0.00040572, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=24012 epoch 015: 792 / 1689 loss=4.079, nll_loss=2.45, ppl=5.47, wps=460426, ups=1.07, wpb=431292, bsz=16241.4, num_updates=24400, lr=0.000404888, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=24106 epoch 015: 792 / 1689 loss=4.079, nll_loss=2.45, ppl=5.47, wps=460426, ups=1.07, wpb=431292, bsz=16241.4, num_updates=24400, lr=0.000404888, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=24106 epoch 015: 792 / 1689 loss=4.079, nll_loss=2.45, ppl=5.47, wps=460426, ups=1.07, wpb=431292, bsz=16241.4, num_updates=24400, lr=0.000404888, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=24106 epoch 015: 792 / 1689 loss=4.079, nll_loss=2.45, ppl=5.47, wps=460426, ups=1.07, wpb=431292, bsz=16241.4, num_updates=24400, lr=0.000404888, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=24106 epoch 015: 792 / 1689 loss=4.079, nll_loss=2.45, ppl=5.47, wps=460426, ups=1.07, wpb=431292, bsz=16241.4, num_updates=24400, lr=0.000404888, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=24106 epoch 015: 792 / 1689 loss=4.079, nll_loss=2.45, ppl=5.47, wps=460426, ups=1.07, wpb=431292, bsz=16241.4, num_updates=24400, lr=0.000404888, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=24106 epoch 015: 792 / 1689 loss=4.079, nll_loss=2.45, ppl=5.47, wps=460426, ups=1.07, wpb=431292, bsz=16241.4, num_updates=24400, lr=0.000404888, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=24106 epoch 015: 792 / 1689 loss=4.079, nll_loss=2.45, ppl=5.47, wps=460426, ups=1.07, wpb=431292, bsz=16241.4, num_updates=24400, lr=0.000404888, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=24106 epoch 015: 792 / 1689 loss=4.079, nll_loss=2.45, ppl=5.47, wps=460426, ups=1.07, wpb=431292, bsz=16241.4, num_updates=24400, lr=0.000404888, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=24106 epoch 015: 792 / 1689 loss=4.079, nll_loss=2.45, ppl=5.47, wps=460426, ups=1.07, wpb=431292, bsz=16241.4, num_updates=24400, lr=0.000404888, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=24106 epoch 015: 792 / 1689 loss=4.079, nll_loss=2.45, ppl=5.47, wps=460426, ups=1.07, wpb=431292, bsz=16241.4, num_updates=24400, lr=0.000404888, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=24106 epoch 015: 792 / 1689 loss=4.079, nll_loss=2.45, ppl=5.47, wps=460426, ups=1.07, wpb=431292, bsz=16241.4, num_updates=24400, lr=0.000404888, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=24106 epoch 015: 792 / 1689 loss=4.079, nll_loss=2.45, ppl=5.47, wps=460426, ups=1.07, wpb=431292, bsz=16241.4, num_updates=24400, lr=0.000404888, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=24106 epoch 015: 792 / 1689 loss=4.079, nll_loss=2.45, ppl=5.47, wps=460426, ups=1.07, wpb=431292, bsz=16241.4, num_updates=24400, lr=0.000404888, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=24106 epoch 015: 792 / 1689 loss=4.079, nll_loss=2.45, ppl=5.47, wps=460426, ups=1.07, wpb=431292, bsz=16241.4, num_updates=24400, lr=0.000404888, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=24106 epoch 015: 892 / 1689 loss=4.085, nll_loss=2.458, ppl=5.49, wps=457136, ups=1.05, wpb=435089, bsz=16273, num_updates=24500, lr=0.000404061, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=24201 epoch 015: 892 / 1689 loss=4.085, nll_loss=2.458, ppl=5.49, wps=457136, ups=1.05, wpb=435089, bsz=16273, num_updates=24500, lr=0.000404061, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=24201 epoch 015: 892 / 1689 loss=4.085, nll_loss=2.458, ppl=5.49, wps=457136, ups=1.05, wpb=435089, bsz=16273, num_updates=24500, lr=0.000404061, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=24201 epoch 015: 892 / 1689 loss=4.085, nll_loss=2.458, ppl=5.49, wps=457136, ups=1.05, wpb=435089, bsz=16273, num_updates=24500, lr=0.000404061, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=24201 epoch 015: 892 / 1689 loss=4.085, nll_loss=2.458, ppl=5.49, wps=457136, ups=1.05, wpb=435089, bsz=16273, num_updates=24500, lr=0.000404061, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=24201 epoch 015: 892 / 1689 loss=4.085, nll_loss=2.458, ppl=5.49, wps=457136, ups=1.05, wpb=435089, bsz=16273, num_updates=24500, lr=0.000404061, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=24201 epoch 015: 892 / 1689 loss=4.085, nll_loss=2.458, ppl=5.49, wps=457136, ups=1.05, wpb=435089, bsz=16273, num_updates=24500, lr=0.000404061, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=24201 epoch 015: 892 / 1689 loss=4.085, nll_loss=2.458, ppl=5.49, wps=457136, ups=1.05, wpb=435089, bsz=16273, num_updates=24500, lr=0.000404061, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=24201 epoch 015: 892 / 1689 loss=4.085, nll_loss=2.458, ppl=5.49, wps=457136, ups=1.05, wpb=435089, bsz=16273, num_updates=24500, lr=0.000404061, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=24201 epoch 015: 892 / 1689 loss=4.085, nll_loss=2.458, ppl=5.49, wps=457136, ups=1.05, wpb=435089, bsz=16273, num_updates=24500, lr=0.000404061, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=24201 epoch 015: 892 / 1689 loss=4.085, nll_loss=2.458, ppl=5.49, wps=457136, ups=1.05, wpb=435089, bsz=16273, num_updates=24500, lr=0.000404061, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=24201 epoch 015: 892 / 1689 loss=4.085, nll_loss=2.458, ppl=5.49, wps=457136, ups=1.05, wpb=435089, bsz=16273, num_updates=24500, lr=0.000404061, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=24201 epoch 015: 892 / 1689 loss=4.085, nll_loss=2.458, ppl=5.49, wps=457136, ups=1.05, wpb=435089, bsz=16273, num_updates=24500, lr=0.000404061, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=24201 epoch 015: 892 / 1689 loss=4.085, nll_loss=2.458, ppl=5.49, wps=457136, ups=1.05, wpb=435089, bsz=16273, num_updates=24500, lr=0.000404061, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=24201 epoch 015: 892 / 1689 loss=4.085, nll_loss=2.458, ppl=5.49, wps=457136, ups=1.05, wpb=435089, bsz=16273, num_updates=24500, lr=0.000404061, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=24201 epoch 015: 992 / 1689 loss=4.087, nll_loss=2.46, ppl=5.5, wps=460021, ups=1.06, wpb=432753, bsz=16211.3, num_updates=24600, lr=0.000403239, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=24295 epoch 015: 992 / 1689 loss=4.087, nll_loss=2.46, ppl=5.5, wps=460021, ups=1.06, wpb=432753, bsz=16211.3, num_updates=24600, lr=0.000403239, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=24295 epoch 015: 992 / 1689 loss=4.087, nll_loss=2.46, ppl=5.5, wps=460021, ups=1.06, wpb=432753, bsz=16211.3, num_updates=24600, lr=0.000403239, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=24295 epoch 015: 992 / 1689 loss=4.087, nll_loss=2.46, ppl=5.5, wps=460021, ups=1.06, wpb=432753, bsz=16211.3, num_updates=24600, lr=0.000403239, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=24295 epoch 015: 992 / 1689 loss=4.087, nll_loss=2.46, ppl=5.5, wps=460021, ups=1.06, wpb=432753, bsz=16211.3, num_updates=24600, lr=0.000403239, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=24295 epoch 015: 992 / 1689 loss=4.087, nll_loss=2.46, ppl=5.5, wps=460021, ups=1.06, wpb=432753, bsz=16211.3, num_updates=24600, lr=0.000403239, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=24295 epoch 015: 992 / 1689 loss=4.087, nll_loss=2.46, ppl=5.5, wps=460021, ups=1.06, wpb=432753, bsz=16211.3, num_updates=24600, lr=0.000403239, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=24295 epoch 015: 992 / 1689 loss=4.087, nll_loss=2.46, ppl=5.5, wps=460021, ups=1.06, wpb=432753, bsz=16211.3, num_updates=24600, lr=0.000403239, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=24295 epoch 015: 992 / 1689 loss=4.087, nll_loss=2.46, ppl=5.5, wps=460021, ups=1.06, wpb=432753, bsz=16211.3, num_updates=24600, lr=0.000403239, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=24295 epoch 015: 992 / 1689 loss=4.087, nll_loss=2.46, ppl=5.5, wps=460021, ups=1.06, wpb=432753, bsz=16211.3, num_updates=24600, lr=0.000403239, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=24295 epoch 015: 992 / 1689 loss=4.087, nll_loss=2.46, ppl=5.5, wps=460021, ups=1.06, wpb=432753, bsz=16211.3, num_updates=24600, lr=0.000403239, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=24295 epoch 015: 992 / 1689 loss=4.087, nll_loss=2.46, ppl=5.5, wps=460021, ups=1.06, wpb=432753, bsz=16211.3, num_updates=24600, lr=0.000403239, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=24295 epoch 015: 992 / 1689 loss=4.087, nll_loss=2.46, ppl=5.5, wps=460021, ups=1.06, wpb=432753, bsz=16211.3, num_updates=24600, lr=0.000403239, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=24295 epoch 015: 992 / 1689 loss=4.087, nll_loss=2.46, ppl=5.5, wps=460021, ups=1.06, wpb=432753, bsz=16211.3, num_updates=24600, lr=0.000403239, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=24295 epoch 015: 992 / 1689 loss=4.087, nll_loss=2.46, ppl=5.5, wps=460021, ups=1.06, wpb=432753, bsz=16211.3, num_updates=24600, lr=0.000403239, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=24295 epoch 015: 1092 / 1689 loss=4.083, nll_loss=2.456, ppl=5.49, wps=459723, ups=1.06, wpb=433278, bsz=16815.3, num_updates=24700, lr=0.000402422, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=17.4, wall=24389 epoch 015: 1092 / 1689 loss=4.083, nll_loss=2.456, ppl=5.49, wps=459723, ups=1.06, wpb=433278, bsz=16815.3, num_updates=24700, lr=0.000402422, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=17.4, wall=24389 epoch 015: 1092 / 1689 loss=4.083, nll_loss=2.456, ppl=5.49, wps=459723, ups=1.06, wpb=433278, bsz=16815.3, num_updates=24700, lr=0.000402422, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=17.4, wall=24389 epoch 015: 1092 / 1689 loss=4.083, nll_loss=2.456, ppl=5.49, wps=459723, ups=1.06, wpb=433278, bsz=16815.3, num_updates=24700, lr=0.000402422, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=17.4, wall=24389 epoch 015: 1092 / 1689 loss=4.083, nll_loss=2.456, ppl=5.49, wps=459723, ups=1.06, wpb=433278, bsz=16815.3, num_updates=24700, lr=0.000402422, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=17.4, wall=24389 epoch 015: 1092 / 1689 loss=4.083, nll_loss=2.456, ppl=5.49, wps=459723, ups=1.06, wpb=433278, bsz=16815.3, num_updates=24700, lr=0.000402422, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=17.4, wall=24389 epoch 015: 1092 / 1689 loss=4.083, nll_loss=2.456, ppl=5.49, wps=459723, ups=1.06, wpb=433278, bsz=16815.3, num_updates=24700, lr=0.000402422, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=17.4, wall=24389 epoch 015: 1092 / 1689 loss=4.083, nll_loss=2.456, ppl=5.49, wps=459723, ups=1.06, wpb=433278, bsz=16815.3, num_updates=24700, lr=0.000402422, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=17.4, wall=24389 epoch 015: 1092 / 1689 loss=4.083, nll_loss=2.456, ppl=5.49, wps=459723, ups=1.06, wpb=433278, bsz=16815.3, num_updates=24700, lr=0.000402422, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=17.4, wall=24389 epoch 015: 1092 / 1689 loss=4.083, nll_loss=2.456, ppl=5.49, wps=459723, ups=1.06, wpb=433278, bsz=16815.3, num_updates=24700, lr=0.000402422, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=17.4, wall=24389 epoch 015: 1092 / 1689 loss=4.083, nll_loss=2.456, ppl=5.49, wps=459723, ups=1.06, wpb=433278, bsz=16815.3, num_updates=24700, lr=0.000402422, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=17.4, wall=24389 epoch 015: 1092 / 1689 loss=4.083, nll_loss=2.456, ppl=5.49, wps=459723, ups=1.06, wpb=433278, bsz=16815.3, num_updates=24700, lr=0.000402422, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=17.4, wall=24389 epoch 015: 1092 / 1689 loss=4.083, nll_loss=2.456, ppl=5.49, wps=459723, ups=1.06, wpb=433278, bsz=16815.3, num_updates=24700, lr=0.000402422, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=17.4, wall=24389 epoch 015: 1092 / 1689 loss=4.083, nll_loss=2.456, ppl=5.49, wps=459723, ups=1.06, wpb=433278, bsz=16815.3, num_updates=24700, lr=0.000402422, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=17.4, wall=24389 epoch 015: 1092 / 1689 loss=4.083, nll_loss=2.456, ppl=5.49, wps=459723, ups=1.06, wpb=433278, bsz=16815.3, num_updates=24700, lr=0.000402422, gnorm=0.211, clip=0, loss_scale=1, train_wall=93, gb_free=17.4, wall=24389 epoch 015: 1192 / 1689 loss=4.089, nll_loss=2.463, ppl=5.51, wps=463468, ups=1.07, wpb=434329, bsz=16263.6, num_updates=24800, lr=0.00040161, gnorm=0.207, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=24483 epoch 015: 1192 / 1689 loss=4.089, nll_loss=2.463, ppl=5.51, wps=463468, ups=1.07, wpb=434329, bsz=16263.6, num_updates=24800, lr=0.00040161, gnorm=0.207, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=24483 epoch 015: 1192 / 1689 loss=4.089, nll_loss=2.463, ppl=5.51, wps=463468, ups=1.07, wpb=434329, bsz=16263.6, num_updates=24800, lr=0.00040161, gnorm=0.207, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=24483 epoch 015: 1192 / 1689 loss=4.089, nll_loss=2.463, ppl=5.51, wps=463468, ups=1.07, wpb=434329, bsz=16263.6, num_updates=24800, lr=0.00040161, gnorm=0.207, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=24483 epoch 015: 1192 / 1689 loss=4.089, nll_loss=2.463, ppl=5.51, wps=463468, ups=1.07, wpb=434329, bsz=16263.6, num_updates=24800, lr=0.00040161, gnorm=0.207, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=24483 epoch 015: 1192 / 1689 loss=4.089, nll_loss=2.463, ppl=5.51, wps=463468, ups=1.07, wpb=434329, bsz=16263.6, num_updates=24800, lr=0.00040161, gnorm=0.207, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=24483 epoch 015: 1192 / 1689 loss=4.089, nll_loss=2.463, ppl=5.51, wps=463468, ups=1.07, wpb=434329, bsz=16263.6, num_updates=24800, lr=0.00040161, gnorm=0.207, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=24483 epoch 015: 1192 / 1689 loss=4.089, nll_loss=2.463, ppl=5.51, wps=463468, ups=1.07, wpb=434329, bsz=16263.6, num_updates=24800, lr=0.00040161, gnorm=0.207, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=24483 epoch 015: 1192 / 1689 loss=4.089, nll_loss=2.463, ppl=5.51, wps=463468, ups=1.07, wpb=434329, bsz=16263.6, num_updates=24800, lr=0.00040161, gnorm=0.207, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=24483 epoch 015: 1192 / 1689 loss=4.089, nll_loss=2.463, ppl=5.51, wps=463468, ups=1.07, wpb=434329, bsz=16263.6, num_updates=24800, lr=0.00040161, gnorm=0.207, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=24483 epoch 015: 1192 / 1689 loss=4.089, nll_loss=2.463, ppl=5.51, wps=463468, ups=1.07, wpb=434329, bsz=16263.6, num_updates=24800, lr=0.00040161, gnorm=0.207, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=24483 epoch 015: 1192 / 1689 loss=4.089, nll_loss=2.463, ppl=5.51, wps=463468, ups=1.07, wpb=434329, bsz=16263.6, num_updates=24800, lr=0.00040161, gnorm=0.207, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=24483 epoch 015: 1192 / 1689 loss=4.089, nll_loss=2.463, ppl=5.51, wps=463468, ups=1.07, wpb=434329, bsz=16263.6, num_updates=24800, lr=0.00040161, gnorm=0.207, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=24483 epoch 015: 1192 / 1689 loss=4.089, nll_loss=2.463, ppl=5.51, wps=463468, ups=1.07, wpb=434329, bsz=16263.6, num_updates=24800, lr=0.00040161, gnorm=0.207, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=24483 epoch 015: 1192 / 1689 loss=4.089, nll_loss=2.463, ppl=5.51, wps=463468, ups=1.07, wpb=434329, bsz=16263.6, num_updates=24800, lr=0.00040161, gnorm=0.207, clip=0, loss_scale=2, train_wall=92, gb_free=19.3, wall=24483 epoch 015: 1292 / 1689 loss=4.083, nll_loss=2.456, ppl=5.49, wps=453256, ups=1.05, wpb=431591, bsz=16883.4, num_updates=24900, lr=0.000400802, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=24578 epoch 015: 1292 / 1689 loss=4.083, nll_loss=2.456, ppl=5.49, wps=453256, ups=1.05, wpb=431591, bsz=16883.4, num_updates=24900, lr=0.000400802, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=24578 epoch 015: 1292 / 1689 loss=4.083, nll_loss=2.456, ppl=5.49, wps=453256, ups=1.05, wpb=431591, bsz=16883.4, num_updates=24900, lr=0.000400802, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=24578 epoch 015: 1292 / 1689 loss=4.083, nll_loss=2.456, ppl=5.49, wps=453256, ups=1.05, wpb=431591, bsz=16883.4, num_updates=24900, lr=0.000400802, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=24578 epoch 015: 1292 / 1689 loss=4.083, nll_loss=2.456, ppl=5.49, wps=453256, ups=1.05, wpb=431591, bsz=16883.4, num_updates=24900, lr=0.000400802, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=24578 epoch 015: 1292 / 1689 loss=4.083, nll_loss=2.456, ppl=5.49, wps=453256, ups=1.05, wpb=431591, bsz=16883.4, num_updates=24900, lr=0.000400802, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=24578 epoch 015: 1292 / 1689 loss=4.083, nll_loss=2.456, ppl=5.49, wps=453256, ups=1.05, wpb=431591, bsz=16883.4, num_updates=24900, lr=0.000400802, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=24578 epoch 015: 1292 / 1689 loss=4.083, nll_loss=2.456, ppl=5.49, wps=453256, ups=1.05, wpb=431591, bsz=16883.4, num_updates=24900, lr=0.000400802, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=24578 epoch 015: 1292 / 1689 loss=4.083, nll_loss=2.456, ppl=5.49, wps=453256, ups=1.05, wpb=431591, bsz=16883.4, num_updates=24900, lr=0.000400802, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=24578 epoch 015: 1292 / 1689 loss=4.083, nll_loss=2.456, ppl=5.49, wps=453256, ups=1.05, wpb=431591, bsz=16883.4, num_updates=24900, lr=0.000400802, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=24578 epoch 015: 1292 / 1689 loss=4.083, nll_loss=2.456, ppl=5.49, wps=453256, ups=1.05, wpb=431591, bsz=16883.4, num_updates=24900, lr=0.000400802, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=24578 epoch 015: 1292 / 1689 loss=4.083, nll_loss=2.456, ppl=5.49, wps=453256, ups=1.05, wpb=431591, bsz=16883.4, num_updates=24900, lr=0.000400802, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=24578 epoch 015: 1292 / 1689 loss=4.083, nll_loss=2.456, ppl=5.49, wps=453256, ups=1.05, wpb=431591, bsz=16883.4, num_updates=24900, lr=0.000400802, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=24578 epoch 015: 1292 / 1689 loss=4.083, nll_loss=2.456, ppl=5.49, wps=453256, ups=1.05, wpb=431591, bsz=16883.4, num_updates=24900, lr=0.000400802, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=24578 epoch 015: 1292 / 1689 loss=4.083, nll_loss=2.456, ppl=5.49, wps=453256, ups=1.05, wpb=431591, bsz=16883.4, num_updates=24900, lr=0.000400802, gnorm=0.204, clip=0, loss_scale=2, train_wall=93, gb_free=18.8, wall=24578 epoch 015: 1392 / 1689 loss=4.081, nll_loss=2.454, ppl=5.48, wps=460459, ups=1.06, wpb=433347, bsz=16589.8, num_updates=25000, lr=0.0004, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=24672 epoch 015: 1392 / 1689 loss=4.081, nll_loss=2.454, ppl=5.48, wps=460459, ups=1.06, wpb=433347, bsz=16589.8, num_updates=25000, lr=0.0004, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=24672 epoch 015: 1392 / 1689 loss=4.081, nll_loss=2.454, ppl=5.48, wps=460459, ups=1.06, wpb=433347, bsz=16589.8, num_updates=25000, lr=0.0004, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=24672 epoch 015: 1392 / 1689 loss=4.081, nll_loss=2.454, ppl=5.48, wps=460459, ups=1.06, wpb=433347, bsz=16589.8, num_updates=25000, lr=0.0004, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=24672 epoch 015: 1392 / 1689 loss=4.081, nll_loss=2.454, ppl=5.48, wps=460459, ups=1.06, wpb=433347, bsz=16589.8, num_updates=25000, lr=0.0004, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=24672 epoch 015: 1392 / 1689 loss=4.081, nll_loss=2.454, ppl=5.48, wps=460459, ups=1.06, wpb=433347, bsz=16589.8, num_updates=25000, lr=0.0004, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=24672 epoch 015: 1392 / 1689 loss=4.081, nll_loss=2.454, ppl=5.48, wps=460459, ups=1.06, wpb=433347, bsz=16589.8, num_updates=25000, lr=0.0004, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=24672 epoch 015: 1392 / 1689 loss=4.081, nll_loss=2.454, ppl=5.48, wps=460459, ups=1.06, wpb=433347, bsz=16589.8, num_updates=25000, lr=0.0004, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=24672 epoch 015: 1392 / 1689 loss=4.081, nll_loss=2.454, ppl=5.48, wps=460459, ups=1.06, wpb=433347, bsz=16589.8, num_updates=25000, lr=0.0004, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=24672 epoch 015: 1392 / 1689 loss=4.081, nll_loss=2.454, ppl=5.48, wps=460459, ups=1.06, wpb=433347, bsz=16589.8, num_updates=25000, lr=0.0004, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=24672 epoch 015: 1392 / 1689 loss=4.081, nll_loss=2.454, ppl=5.48, wps=460459, ups=1.06, wpb=433347, bsz=16589.8, num_updates=25000, lr=0.0004, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=24672 epoch 015: 1392 / 1689 loss=4.081, nll_loss=2.454, ppl=5.48, wps=460459, ups=1.06, wpb=433347, bsz=16589.8, num_updates=25000, lr=0.0004, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=24672 epoch 015: 1392 / 1689 loss=4.081, nll_loss=2.454, ppl=5.48, wps=460459, ups=1.06, wpb=433347, bsz=16589.8, num_updates=25000, lr=0.0004, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=24672 epoch 015: 1392 / 1689 loss=4.081, nll_loss=2.454, ppl=5.48, wps=460459, ups=1.06, wpb=433347, bsz=16589.8, num_updates=25000, lr=0.0004, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=24672 epoch 015: 1392 / 1689 loss=4.081, nll_loss=2.454, ppl=5.48, wps=460459, ups=1.06, wpb=433347, bsz=16589.8, num_updates=25000, lr=0.0004, gnorm=0.216, clip=0, loss_scale=2, train_wall=93, gb_free=20.1, wall=24672 begin validation on "valid" subset epoch 015 | valid on 'valid' subset | loss 4.263 | nll_loss 2.641 | ppl 6.24 | wps 0 | wpb 42662 | bsz 2032 | num_updates 25000 | best_loss 4.263 epoch 015 | valid on 'valid' subset | loss 4.263 | nll_loss 2.641 | ppl 6.24 | wps 0 | wpb 42662 | bsz 2032 | num_updates 25000 | best_loss 4.263 epoch 015 | valid on 'valid' subset | loss 4.263 | nll_loss 2.641 | ppl 6.24 | wps 0 | wpb 42662 | bsz 2032 | num_updates 25000 | best_loss 4.263 epoch 015 | valid on 'valid' subset | loss 4.263 | nll_loss 2.641 | ppl 6.24 | wps 0 | wpb 42662 | bsz 2032 | num_updates 25000 | best_loss 4.263 epoch 015 | valid on 'valid' subset | loss 4.263 | nll_loss 2.641 | ppl 6.24 | wps 0 | wpb 42662 | bsz 2032 | num_updates 25000 | best_loss 4.263 epoch 015 | valid on 'valid' subset | loss 4.263 | nll_loss 2.641 | ppl 6.24 | wps 0 | wpb 42662 | bsz 2032 | num_updates 25000 | best_loss 4.263 epoch 015 | valid on 'valid' subset | loss 4.263 | nll_loss 2.641 | ppl 6.24 | wps 0 | wpb 42662 | bsz 2032 | num_updates 25000 | best_loss 4.263 epoch 015 | valid on 'valid' subset | loss 4.263 | nll_loss 2.641 | ppl 6.24 | wps 0 | wpb 42662 | bsz 2032 | num_updates 25000 | best_loss 4.263 epoch 015 | valid on 'valid' subset | loss 4.263 | nll_loss 2.641 | ppl 6.24 | wps 0 | wpb 42662 | bsz 2032 | num_updates 25000 | best_loss 4.263 epoch 015 | valid on 'valid' subset | loss 4.263 | nll_loss 2.641 | ppl 6.24 | wps 0 | wpb 42662 | bsz 2032 | num_updates 25000 | best_loss 4.263 epoch 015 | valid on 'valid' subset | loss 4.263 | nll_loss 2.641 | ppl 6.24 | wps 0 | wpb 42662 | bsz 2032 | num_updates 25000 | best_loss 4.263 epoch 015 | valid on 'valid' subset | loss 4.263 | nll_loss 2.641 | ppl 6.24 | wps 0 | wpb 42662 | bsz 2032 | num_updates 25000 | best_loss 4.263 epoch 015 | valid on 'valid' subset | loss 4.263 | nll_loss 2.641 | ppl 6.24 | wps 0 | wpb 42662 | bsz 2032 | num_updates 25000 | best_loss 4.263 epoch 015 | valid on 'valid' subset | loss 4.263 | nll_loss 2.641 | ppl 6.24 | wps 0 | wpb 42662 | bsz 2032 | num_updates 25000 | best_loss 4.263 epoch 015 | valid on 'valid' subset | loss 4.263 | nll_loss 2.641 | ppl 6.24 | wps 0 | wpb 42662 | bsz 2032 | num_updates 25000 | best_loss 4.263 epoch 015: 1492 / 1689 loss=4.087, nll_loss=2.46, ppl=5.5, wps=354311, ups=0.82, wpb=434199, bsz=16731.9, num_updates=25100, lr=0.000399202, gnorm=0.215, clip=0, loss_scale=2, train_wall=98, gb_free=19.4, wall=24795 epoch 015: 1492 / 1689 loss=4.087, nll_loss=2.46, ppl=5.5, wps=354311, ups=0.82, wpb=434199, bsz=16731.9, num_updates=25100, lr=0.000399202, gnorm=0.215, clip=0, loss_scale=2, train_wall=98, gb_free=19.4, wall=24795 epoch 015: 1492 / 1689 loss=4.087, nll_loss=2.46, ppl=5.5, wps=354311, ups=0.82, wpb=434199, bsz=16731.9, num_updates=25100, lr=0.000399202, gnorm=0.215, clip=0, loss_scale=2, train_wall=98, gb_free=19.4, wall=24795 epoch 015: 1492 / 1689 loss=4.087, nll_loss=2.46, ppl=5.5, wps=354311, ups=0.82, wpb=434199, bsz=16731.9, num_updates=25100, lr=0.000399202, gnorm=0.215, clip=0, loss_scale=2, train_wall=98, gb_free=19.4, wall=24795 epoch 015: 1492 / 1689 loss=4.087, nll_loss=2.46, ppl=5.5, wps=354311, ups=0.82, wpb=434199, bsz=16731.9, num_updates=25100, lr=0.000399202, gnorm=0.215, clip=0, loss_scale=2, train_wall=98, gb_free=19.4, wall=24795 epoch 015: 1492 / 1689 loss=4.087, nll_loss=2.46, ppl=5.5, wps=354311, ups=0.82, wpb=434199, bsz=16731.9, num_updates=25100, lr=0.000399202, gnorm=0.215, clip=0, loss_scale=2, train_wall=98, gb_free=19.4, wall=24795 epoch 015: 1492 / 1689 loss=4.087, nll_loss=2.46, ppl=5.5, wps=354311, ups=0.82, wpb=434199, bsz=16731.9, num_updates=25100, lr=0.000399202, gnorm=0.215, clip=0, loss_scale=2, train_wall=98, gb_free=19.4, wall=24795 epoch 015: 1492 / 1689 loss=4.087, nll_loss=2.46, ppl=5.5, wps=354311, ups=0.82, wpb=434199, bsz=16731.9, num_updates=25100, lr=0.000399202, gnorm=0.215, clip=0, loss_scale=2, train_wall=98, gb_free=19.4, wall=24795 epoch 015: 1492 / 1689 loss=4.087, nll_loss=2.46, ppl=5.5, wps=354311, ups=0.82, wpb=434199, bsz=16731.9, num_updates=25100, lr=0.000399202, gnorm=0.215, clip=0, loss_scale=2, train_wall=98, gb_free=19.4, wall=24795 epoch 015: 1492 / 1689 loss=4.087, nll_loss=2.46, ppl=5.5, wps=354311, ups=0.82, wpb=434199, bsz=16731.9, num_updates=25100, lr=0.000399202, gnorm=0.215, clip=0, loss_scale=2, train_wall=98, gb_free=19.4, wall=24795 epoch 015: 1492 / 1689 loss=4.087, nll_loss=2.46, ppl=5.5, wps=354311, ups=0.82, wpb=434199, bsz=16731.9, num_updates=25100, lr=0.000399202, gnorm=0.215, clip=0, loss_scale=2, train_wall=98, gb_free=19.4, wall=24795 epoch 015: 1492 / 1689 loss=4.087, nll_loss=2.46, ppl=5.5, wps=354311, ups=0.82, wpb=434199, bsz=16731.9, num_updates=25100, lr=0.000399202, gnorm=0.215, clip=0, loss_scale=2, train_wall=98, gb_free=19.4, wall=24795 epoch 015: 1492 / 1689 loss=4.087, nll_loss=2.46, ppl=5.5, wps=354311, ups=0.82, wpb=434199, bsz=16731.9, num_updates=25100, lr=0.000399202, gnorm=0.215, clip=0, loss_scale=2, train_wall=98, gb_free=19.4, wall=24795 epoch 015: 1492 / 1689 loss=4.087, nll_loss=2.46, ppl=5.5, wps=354311, ups=0.82, wpb=434199, bsz=16731.9, num_updates=25100, lr=0.000399202, gnorm=0.215, clip=0, loss_scale=2, train_wall=98, gb_free=19.4, wall=24795 epoch 015: 1492 / 1689 loss=4.087, nll_loss=2.46, ppl=5.5, wps=354311, ups=0.82, wpb=434199, bsz=16731.9, num_updates=25100, lr=0.000399202, gnorm=0.215, clip=0, loss_scale=2, train_wall=98, gb_free=19.4, wall=24795 epoch 015: 1592 / 1689 loss=4.087, nll_loss=2.461, ppl=5.51, wps=463663, ups=1.07, wpb=435268, bsz=16521.3, num_updates=25200, lr=0.00039841, gnorm=0.212, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=24889 epoch 015: 1592 / 1689 loss=4.087, nll_loss=2.461, ppl=5.51, wps=463663, ups=1.07, wpb=435268, bsz=16521.3, num_updates=25200, lr=0.00039841, gnorm=0.212, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=24889 epoch 015: 1592 / 1689 loss=4.087, nll_loss=2.461, ppl=5.51, wps=463663, ups=1.07, wpb=435268, bsz=16521.3, num_updates=25200, lr=0.00039841, gnorm=0.212, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=24889 epoch 015: 1592 / 1689 loss=4.087, nll_loss=2.461, ppl=5.51, wps=463663, ups=1.07, wpb=435268, bsz=16521.3, num_updates=25200, lr=0.00039841, gnorm=0.212, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=24889 epoch 015: 1592 / 1689 loss=4.087, nll_loss=2.461, ppl=5.51, wps=463663, ups=1.07, wpb=435268, bsz=16521.3, num_updates=25200, lr=0.00039841, gnorm=0.212, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=24889 epoch 015: 1592 / 1689 loss=4.087, nll_loss=2.461, ppl=5.51, wps=463663, ups=1.07, wpb=435268, bsz=16521.3, num_updates=25200, lr=0.00039841, gnorm=0.212, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=24889 epoch 015: 1592 / 1689 loss=4.087, nll_loss=2.461, ppl=5.51, wps=463663, ups=1.07, wpb=435268, bsz=16521.3, num_updates=25200, lr=0.00039841, gnorm=0.212, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=24889 epoch 015: 1592 / 1689 loss=4.087, nll_loss=2.461, ppl=5.51, wps=463663, ups=1.07, wpb=435268, bsz=16521.3, num_updates=25200, lr=0.00039841, gnorm=0.212, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=24889 epoch 015: 1592 / 1689 loss=4.087, nll_loss=2.461, ppl=5.51, wps=463663, ups=1.07, wpb=435268, bsz=16521.3, num_updates=25200, lr=0.00039841, gnorm=0.212, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=24889 epoch 015: 1592 / 1689 loss=4.087, nll_loss=2.461, ppl=5.51, wps=463663, ups=1.07, wpb=435268, bsz=16521.3, num_updates=25200, lr=0.00039841, gnorm=0.212, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=24889 epoch 015: 1592 / 1689 loss=4.087, nll_loss=2.461, ppl=5.51, wps=463663, ups=1.07, wpb=435268, bsz=16521.3, num_updates=25200, lr=0.00039841, gnorm=0.212, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=24889 epoch 015: 1592 / 1689 loss=4.087, nll_loss=2.461, ppl=5.51, wps=463663, ups=1.07, wpb=435268, bsz=16521.3, num_updates=25200, lr=0.00039841, gnorm=0.212, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=24889 epoch 015: 1592 / 1689 loss=4.087, nll_loss=2.461, ppl=5.51, wps=463663, ups=1.07, wpb=435268, bsz=16521.3, num_updates=25200, lr=0.00039841, gnorm=0.212, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=24889 epoch 015: 1592 / 1689 loss=4.087, nll_loss=2.461, ppl=5.51, wps=463663, ups=1.07, wpb=435268, bsz=16521.3, num_updates=25200, lr=0.00039841, gnorm=0.212, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=24889 epoch 015: 1592 / 1689 loss=4.087, nll_loss=2.461, ppl=5.51, wps=463663, ups=1.07, wpb=435268, bsz=16521.3, num_updates=25200, lr=0.00039841, gnorm=0.212, clip=0, loss_scale=2, train_wall=92, gb_free=19.4, wall=24889 end of epoch 15 (average epoch stats below) epoch 015 | loss 4.075 | nll_loss 2.447 | ppl 5.45 | wps 447051 | ups 1.03 | wpb 433522 | bsz 16507.5 | num_updates 25296 | lr 0.000397653 | gnorm 0.213 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 19.9 | wall 24979 epoch 015 | loss 4.075 | nll_loss 2.447 | ppl 5.45 | wps 447051 | ups 1.03 | wpb 433522 | bsz 16507.5 | num_updates 25296 | lr 0.000397653 | gnorm 0.213 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 19.9 | wall 24979 epoch 015 | loss 4.075 | nll_loss 2.447 | ppl 5.45 | wps 447051 | ups 1.03 | wpb 433522 | bsz 16507.5 | num_updates 25296 | lr 0.000397653 | gnorm 0.213 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 19.9 | wall 24979 epoch 015 | loss 4.075 | nll_loss 2.447 | ppl 5.45 | wps 447051 | ups 1.03 | wpb 433522 | bsz 16507.5 | num_updates 25296 | lr 0.000397653 | gnorm 0.213 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 19.9 | wall 24979 epoch 015 | loss 4.075 | nll_loss 2.447 | ppl 5.45 | wps 447051 | ups 1.03 | wpb 433522 | bsz 16507.5 | num_updates 25296 | lr 0.000397653 | gnorm 0.213 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 19.9 | wall 24979 epoch 015 | loss 4.075 | nll_loss 2.447 | ppl 5.45 | wps 447051 | ups 1.03 | wpb 433522 | bsz 16507.5 | num_updates 25296 | lr 0.000397653 | gnorm 0.213 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 19.9 | wall 24979 epoch 015 | loss 4.075 | nll_loss 2.447 | ppl 5.45 | wps 447051 | ups 1.03 | wpb 433522 | bsz 16507.5 | num_updates 25296 | lr 0.000397653 | gnorm 0.213 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 19.9 | wall 24979 epoch 015 | loss 4.075 | nll_loss 2.447 | ppl 5.45 | wps 447051 | ups 1.03 | wpb 433522 | bsz 16507.5 | num_updates 25296 | lr 0.000397653 | gnorm 0.213 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 19.9 | wall 24979 epoch 015 | loss 4.075 | nll_loss 2.447 | ppl 5.45 | wps 447051 | ups 1.03 | wpb 433522 | bsz 16507.5 | num_updates 25296 | lr 0.000397653 | gnorm 0.213 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 19.9 | wall 24979 epoch 015 | loss 4.075 | nll_loss 2.447 | ppl 5.45 | wps 447051 | ups 1.03 | wpb 433522 | bsz 16507.5 | num_updates 25296 | lr 0.000397653 | gnorm 0.213 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 19.9 | wall 24979 epoch 015 | loss 4.075 | nll_loss 2.447 | ppl 5.45 | wps 447051 | ups 1.03 | wpb 433522 | bsz 16507.5 | num_updates 25296 | lr 0.000397653 | gnorm 0.213 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 19.9 | wall 24979 epoch 015 | loss 4.075 | nll_loss 2.447 | ppl 5.45 | wps 447051 | ups 1.03 | wpb 433522 | bsz 16507.5 | num_updates 25296 | lr 0.000397653 | gnorm 0.213 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 19.9 | wall 24979 epoch 015 | loss 4.075 | nll_loss 2.447 | ppl 5.45 | wps 447051 | ups 1.03 | wpb 433522 | bsz 16507.5 | num_updates 25296 | lr 0.000397653 | gnorm 0.213 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 19.9 | wall 24979 epoch 015 | loss 4.075 | nll_loss 2.447 | ppl 5.45 | wps 447051 | ups 1.03 | wpb 433522 | bsz 16507.5 | num_updates 25296 | lr 0.000397653 | gnorm 0.213 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 19.9 | wall 24979 epoch 015 | loss 4.075 | nll_loss 2.447 | ppl 5.45 | wps 447051 | ups 1.03 | wpb 433522 | bsz 16507.5 | num_updates 25296 | lr 0.000397653 | gnorm 0.213 | clip 0 | loss_scale 2 | train_wall 1570 | gb_free 19.9 | wall 24979 Start iterating over samples epoch 016: 4 / 1689 loss=4.079, nll_loss=2.452, ppl=5.47, wps=454115, ups=1.05, wpb=430595, bsz=16033.7, num_updates=25300, lr=0.000397621, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=20, wall=24984 epoch 016: 4 / 1689 loss=4.079, nll_loss=2.452, ppl=5.47, wps=454115, ups=1.05, wpb=430595, bsz=16033.7, num_updates=25300, lr=0.000397621, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=20, wall=24984 epoch 016: 4 / 1689 loss=4.079, nll_loss=2.452, ppl=5.47, wps=454115, ups=1.05, wpb=430595, bsz=16033.7, num_updates=25300, lr=0.000397621, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=20, wall=24984 epoch 016: 4 / 1689 loss=4.079, nll_loss=2.452, ppl=5.47, wps=454115, ups=1.05, wpb=430595, bsz=16033.7, num_updates=25300, lr=0.000397621, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=20, wall=24984 epoch 016: 4 / 1689 loss=4.079, nll_loss=2.452, ppl=5.47, wps=454115, ups=1.05, wpb=430595, bsz=16033.7, num_updates=25300, lr=0.000397621, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=20, wall=24984 epoch 016: 4 / 1689 loss=4.079, nll_loss=2.452, ppl=5.47, wps=454115, ups=1.05, wpb=430595, bsz=16033.7, num_updates=25300, lr=0.000397621, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=20, wall=24984 epoch 016: 4 / 1689 loss=4.079, nll_loss=2.452, ppl=5.47, wps=454115, ups=1.05, wpb=430595, bsz=16033.7, num_updates=25300, lr=0.000397621, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=20, wall=24984 epoch 016: 4 / 1689 loss=4.079, nll_loss=2.452, ppl=5.47, wps=454115, ups=1.05, wpb=430595, bsz=16033.7, num_updates=25300, lr=0.000397621, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=20, wall=24984 epoch 016: 4 / 1689 loss=4.079, nll_loss=2.452, ppl=5.47, wps=454115, ups=1.05, wpb=430595, bsz=16033.7, num_updates=25300, lr=0.000397621, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=20, wall=24984 epoch 016: 4 / 1689 loss=4.079, nll_loss=2.452, ppl=5.47, wps=454115, ups=1.05, wpb=430595, bsz=16033.7, num_updates=25300, lr=0.000397621, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=20, wall=24984 epoch 016: 4 / 1689 loss=4.079, nll_loss=2.452, ppl=5.47, wps=454115, ups=1.05, wpb=430595, bsz=16033.7, num_updates=25300, lr=0.000397621, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=20, wall=24984 epoch 016: 4 / 1689 loss=4.079, nll_loss=2.452, ppl=5.47, wps=454115, ups=1.05, wpb=430595, bsz=16033.7, num_updates=25300, lr=0.000397621, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=20, wall=24984 epoch 016: 4 / 1689 loss=4.079, nll_loss=2.452, ppl=5.47, wps=454115, ups=1.05, wpb=430595, bsz=16033.7, num_updates=25300, lr=0.000397621, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=20, wall=24984 epoch 016: 4 / 1689 loss=4.079, nll_loss=2.452, ppl=5.47, wps=454115, ups=1.05, wpb=430595, bsz=16033.7, num_updates=25300, lr=0.000397621, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=20, wall=24984 epoch 016: 4 / 1689 loss=4.079, nll_loss=2.452, ppl=5.47, wps=454115, ups=1.05, wpb=430595, bsz=16033.7, num_updates=25300, lr=0.000397621, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=20, wall=24984 epoch 016: 4 / 1689 loss=4.079, nll_loss=2.452, ppl=5.47, wps=454115, ups=1.05, wpb=430595, bsz=16033.7, num_updates=25300, lr=0.000397621, gnorm=0.221, clip=0, loss_scale=2, train_wall=93, gb_free=20, wall=24984 epoch 016: 105 / 1689 loss=4.041, nll_loss=2.407, ppl=5.3, wps=454704, ups=1.05, wpb=431657, bsz=16523.5, num_updates=25400, lr=0.000396838, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=25079 epoch 016: 105 / 1689 loss=4.041, nll_loss=2.407, ppl=5.3, wps=454704, ups=1.05, wpb=431657, bsz=16523.5, num_updates=25400, lr=0.000396838, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=25079 epoch 016: 105 / 1689 loss=4.041, nll_loss=2.407, ppl=5.3, wps=454704, ups=1.05, wpb=431657, bsz=16523.5, num_updates=25400, lr=0.000396838, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=25079 epoch 016: 105 / 1689 loss=4.041, nll_loss=2.407, ppl=5.3, wps=454704, ups=1.05, wpb=431657, bsz=16523.5, num_updates=25400, lr=0.000396838, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=25079 epoch 016: 105 / 1689 loss=4.041, nll_loss=2.407, ppl=5.3, wps=454704, ups=1.05, wpb=431657, bsz=16523.5, num_updates=25400, lr=0.000396838, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=25079 epoch 016: 105 / 1689 loss=4.041, nll_loss=2.407, ppl=5.3, wps=454704, ups=1.05, wpb=431657, bsz=16523.5, num_updates=25400, lr=0.000396838, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=25079 epoch 016: 105 / 1689 loss=4.041, nll_loss=2.407, ppl=5.3, wps=454704, ups=1.05, wpb=431657, bsz=16523.5, num_updates=25400, lr=0.000396838, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=25079 epoch 016: 105 / 1689 loss=4.041, nll_loss=2.407, ppl=5.3, wps=454704, ups=1.05, wpb=431657, bsz=16523.5, num_updates=25400, lr=0.000396838, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=25079 epoch 016: 105 / 1689 loss=4.041, nll_loss=2.407, ppl=5.3, wps=454704, ups=1.05, wpb=431657, bsz=16523.5, num_updates=25400, lr=0.000396838, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=25079 epoch 016: 105 / 1689 loss=4.041, nll_loss=2.407, ppl=5.3, wps=454704, ups=1.05, wpb=431657, bsz=16523.5, num_updates=25400, lr=0.000396838, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=25079 epoch 016: 105 / 1689 loss=4.041, nll_loss=2.407, ppl=5.3, wps=454704, ups=1.05, wpb=431657, bsz=16523.5, num_updates=25400, lr=0.000396838, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=25079 epoch 016: 105 / 1689 loss=4.041, nll_loss=2.407, ppl=5.3, wps=454704, ups=1.05, wpb=431657, bsz=16523.5, num_updates=25400, lr=0.000396838, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=25079 epoch 016: 105 / 1689 loss=4.041, nll_loss=2.407, ppl=5.3, wps=454704, ups=1.05, wpb=431657, bsz=16523.5, num_updates=25400, lr=0.000396838, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=25079 epoch 016: 105 / 1689 loss=4.041, nll_loss=2.407, ppl=5.3, wps=454704, ups=1.05, wpb=431657, bsz=16523.5, num_updates=25400, lr=0.000396838, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=25079 epoch 016: 105 / 1689 loss=4.041, nll_loss=2.407, ppl=5.3, wps=454704, ups=1.05, wpb=431657, bsz=16523.5, num_updates=25400, lr=0.000396838, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=25079 epoch 016: 105 / 1689 loss=4.041, nll_loss=2.407, ppl=5.3, wps=454704, ups=1.05, wpb=431657, bsz=16523.5, num_updates=25400, lr=0.000396838, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=25079 epoch 016: 205 / 1689 loss=4.054, nll_loss=2.422, ppl=5.36, wps=457441, ups=1.05, wpb=434426, bsz=16681.4, num_updates=25500, lr=0.000396059, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=25174 epoch 016: 205 / 1689 loss=4.054, nll_loss=2.422, ppl=5.36, wps=457441, ups=1.05, wpb=434426, bsz=16681.4, num_updates=25500, lr=0.000396059, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=25174 epoch 016: 205 / 1689 loss=4.054, nll_loss=2.422, ppl=5.36, wps=457441, ups=1.05, wpb=434426, bsz=16681.4, num_updates=25500, lr=0.000396059, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=25174 epoch 016: 205 / 1689 loss=4.054, nll_loss=2.422, ppl=5.36, wps=457441, ups=1.05, wpb=434426, bsz=16681.4, num_updates=25500, lr=0.000396059, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=25174 epoch 016: 205 / 1689 loss=4.054, nll_loss=2.422, ppl=5.36, wps=457441, ups=1.05, wpb=434426, bsz=16681.4, num_updates=25500, lr=0.000396059, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=25174 epoch 016: 205 / 1689 loss=4.054, nll_loss=2.422, ppl=5.36, wps=457441, ups=1.05, wpb=434426, bsz=16681.4, num_updates=25500, lr=0.000396059, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=25174 epoch 016: 205 / 1689 loss=4.054, nll_loss=2.422, ppl=5.36, wps=457441, ups=1.05, wpb=434426, bsz=16681.4, num_updates=25500, lr=0.000396059, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=25174 epoch 016: 205 / 1689 loss=4.054, nll_loss=2.422, ppl=5.36, wps=457441, ups=1.05, wpb=434426, bsz=16681.4, num_updates=25500, lr=0.000396059, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=25174 epoch 016: 205 / 1689 loss=4.054, nll_loss=2.422, ppl=5.36, wps=457441, ups=1.05, wpb=434426, bsz=16681.4, num_updates=25500, lr=0.000396059, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=25174 epoch 016: 205 / 1689 loss=4.054, nll_loss=2.422, ppl=5.36, wps=457441, ups=1.05, wpb=434426, bsz=16681.4, num_updates=25500, lr=0.000396059, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=25174 epoch 016: 205 / 1689 loss=4.054, nll_loss=2.422, ppl=5.36, wps=457441, ups=1.05, wpb=434426, bsz=16681.4, num_updates=25500, lr=0.000396059, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=25174 epoch 016: 205 / 1689 loss=4.054, nll_loss=2.422, ppl=5.36, wps=457441, ups=1.05, wpb=434426, bsz=16681.4, num_updates=25500, lr=0.000396059, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=25174 epoch 016: 205 / 1689 loss=4.054, nll_loss=2.422, ppl=5.36, wps=457441, ups=1.05, wpb=434426, bsz=16681.4, num_updates=25500, lr=0.000396059, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=25174 epoch 016: 205 / 1689 loss=4.054, nll_loss=2.422, ppl=5.36, wps=457441, ups=1.05, wpb=434426, bsz=16681.4, num_updates=25500, lr=0.000396059, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=25174 epoch 016: 205 / 1689 loss=4.054, nll_loss=2.422, ppl=5.36, wps=457441, ups=1.05, wpb=434426, bsz=16681.4, num_updates=25500, lr=0.000396059, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=25174 epoch 016: 205 / 1689 loss=4.054, nll_loss=2.422, ppl=5.36, wps=457441, ups=1.05, wpb=434426, bsz=16681.4, num_updates=25500, lr=0.000396059, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=25174 epoch 016: 305 / 1689 loss=4.048, nll_loss=2.416, ppl=5.34, wps=461461, ups=1.06, wpb=435574, bsz=16225.7, num_updates=25600, lr=0.000395285, gnorm=0.208, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=25268 epoch 016: 305 / 1689 loss=4.048, nll_loss=2.416, ppl=5.34, wps=461461, ups=1.06, wpb=435574, bsz=16225.7, num_updates=25600, lr=0.000395285, gnorm=0.208, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=25268 epoch 016: 305 / 1689 loss=4.048, nll_loss=2.416, ppl=5.34, wps=461461, ups=1.06, wpb=435574, bsz=16225.7, num_updates=25600, lr=0.000395285, gnorm=0.208, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=25268 epoch 016: 305 / 1689 loss=4.048, nll_loss=2.416, ppl=5.34, wps=461461, ups=1.06, wpb=435574, bsz=16225.7, num_updates=25600, lr=0.000395285, gnorm=0.208, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=25268 epoch 016: 305 / 1689 loss=4.048, nll_loss=2.416, ppl=5.34, wps=461461, ups=1.06, wpb=435574, bsz=16225.7, num_updates=25600, lr=0.000395285, gnorm=0.208, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=25268 epoch 016: 305 / 1689 loss=4.048, nll_loss=2.416, ppl=5.34, wps=461461, ups=1.06, wpb=435574, bsz=16225.7, num_updates=25600, lr=0.000395285, gnorm=0.208, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=25268 epoch 016: 305 / 1689 loss=4.048, nll_loss=2.416, ppl=5.34, wps=461461, ups=1.06, wpb=435574, bsz=16225.7, num_updates=25600, lr=0.000395285, gnorm=0.208, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=25268 epoch 016: 305 / 1689 loss=4.048, nll_loss=2.416, ppl=5.34, wps=461461, ups=1.06, wpb=435574, bsz=16225.7, num_updates=25600, lr=0.000395285, gnorm=0.208, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=25268 epoch 016: 305 / 1689 loss=4.048, nll_loss=2.416, ppl=5.34, wps=461461, ups=1.06, wpb=435574, bsz=16225.7, num_updates=25600, lr=0.000395285, gnorm=0.208, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=25268 epoch 016: 305 / 1689 loss=4.048, nll_loss=2.416, ppl=5.34, wps=461461, ups=1.06, wpb=435574, bsz=16225.7, num_updates=25600, lr=0.000395285, gnorm=0.208, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=25268 epoch 016: 305 / 1689 loss=4.048, nll_loss=2.416, ppl=5.34, wps=461461, ups=1.06, wpb=435574, bsz=16225.7, num_updates=25600, lr=0.000395285, gnorm=0.208, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=25268 epoch 016: 305 / 1689 loss=4.048, nll_loss=2.416, ppl=5.34, wps=461461, ups=1.06, wpb=435574, bsz=16225.7, num_updates=25600, lr=0.000395285, gnorm=0.208, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=25268 epoch 016: 305 / 1689 loss=4.048, nll_loss=2.416, ppl=5.34, wps=461461, ups=1.06, wpb=435574, bsz=16225.7, num_updates=25600, lr=0.000395285, gnorm=0.208, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=25268 epoch 016: 305 / 1689 loss=4.048, nll_loss=2.416, ppl=5.34, wps=461461, ups=1.06, wpb=435574, bsz=16225.7, num_updates=25600, lr=0.000395285, gnorm=0.208, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=25268 epoch 016: 305 / 1689 loss=4.048, nll_loss=2.416, ppl=5.34, wps=461461, ups=1.06, wpb=435574, bsz=16225.7, num_updates=25600, lr=0.000395285, gnorm=0.208, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=25268 epoch 016: 305 / 1689 loss=4.048, nll_loss=2.416, ppl=5.34, wps=461461, ups=1.06, wpb=435574, bsz=16225.7, num_updates=25600, lr=0.000395285, gnorm=0.208, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=25268 epoch 016: 405 / 1689 loss=4.056, nll_loss=2.424, ppl=5.37, wps=460338, ups=1.06, wpb=435476, bsz=16126.2, num_updates=25700, lr=0.000394515, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=25363 epoch 016: 405 / 1689 loss=4.056, nll_loss=2.424, ppl=5.37, wps=460338, ups=1.06, wpb=435476, bsz=16126.2, num_updates=25700, lr=0.000394515, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=25363 epoch 016: 405 / 1689 loss=4.056, nll_loss=2.424, ppl=5.37, wps=460338, ups=1.06, wpb=435476, bsz=16126.2, num_updates=25700, lr=0.000394515, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=25363 epoch 016: 405 / 1689 loss=4.056, nll_loss=2.424, ppl=5.37, wps=460338, ups=1.06, wpb=435476, bsz=16126.2, num_updates=25700, lr=0.000394515, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=25363 epoch 016: 405 / 1689 loss=4.056, nll_loss=2.424, ppl=5.37, wps=460338, ups=1.06, wpb=435476, bsz=16126.2, num_updates=25700, lr=0.000394515, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=25363 epoch 016: 405 / 1689 loss=4.056, nll_loss=2.424, ppl=5.37, wps=460338, ups=1.06, wpb=435476, bsz=16126.2, num_updates=25700, lr=0.000394515, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=25363 epoch 016: 405 / 1689 loss=4.056, nll_loss=2.424, ppl=5.37, wps=460338, ups=1.06, wpb=435476, bsz=16126.2, num_updates=25700, lr=0.000394515, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=25363 epoch 016: 405 / 1689 loss=4.056, nll_loss=2.424, ppl=5.37, wps=460338, ups=1.06, wpb=435476, bsz=16126.2, num_updates=25700, lr=0.000394515, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=25363 epoch 016: 405 / 1689 loss=4.056, nll_loss=2.424, ppl=5.37, wps=460338, ups=1.06, wpb=435476, bsz=16126.2, num_updates=25700, lr=0.000394515, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=25363 epoch 016: 405 / 1689 loss=4.056, nll_loss=2.424, ppl=5.37, wps=460338, ups=1.06, wpb=435476, bsz=16126.2, num_updates=25700, lr=0.000394515, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=25363 epoch 016: 405 / 1689 loss=4.056, nll_loss=2.424, ppl=5.37, wps=460338, ups=1.06, wpb=435476, bsz=16126.2, num_updates=25700, lr=0.000394515, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=25363 epoch 016: 405 / 1689 loss=4.056, nll_loss=2.424, ppl=5.37, wps=460338, ups=1.06, wpb=435476, bsz=16126.2, num_updates=25700, lr=0.000394515, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=25363 epoch 016: 405 / 1689 loss=4.056, nll_loss=2.424, ppl=5.37, wps=460338, ups=1.06, wpb=435476, bsz=16126.2, num_updates=25700, lr=0.000394515, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=25363 epoch 016: 405 / 1689 loss=4.056, nll_loss=2.424, ppl=5.37, wps=460338, ups=1.06, wpb=435476, bsz=16126.2, num_updates=25700, lr=0.000394515, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=25363 epoch 016: 405 / 1689 loss=4.056, nll_loss=2.424, ppl=5.37, wps=460338, ups=1.06, wpb=435476, bsz=16126.2, num_updates=25700, lr=0.000394515, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=25363 epoch 016: 405 / 1689 loss=4.056, nll_loss=2.424, ppl=5.37, wps=460338, ups=1.06, wpb=435476, bsz=16126.2, num_updates=25700, lr=0.000394515, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=25363 epoch 016: 505 / 1689 loss=4.058, nll_loss=2.427, ppl=5.38, wps=460430, ups=1.06, wpb=433296, bsz=16141.3, num_updates=25800, lr=0.00039375, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=25457 epoch 016: 505 / 1689 loss=4.058, nll_loss=2.427, ppl=5.38, wps=460430, ups=1.06, wpb=433296, bsz=16141.3, num_updates=25800, lr=0.00039375, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=25457 epoch 016: 505 / 1689 loss=4.058, nll_loss=2.427, ppl=5.38, wps=460430, ups=1.06, wpb=433296, bsz=16141.3, num_updates=25800, lr=0.00039375, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=25457 epoch 016: 505 / 1689 loss=4.058, nll_loss=2.427, ppl=5.38, wps=460430, ups=1.06, wpb=433296, bsz=16141.3, num_updates=25800, lr=0.00039375, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=25457 epoch 016: 505 / 1689 loss=4.058, nll_loss=2.427, ppl=5.38, wps=460430, ups=1.06, wpb=433296, bsz=16141.3, num_updates=25800, lr=0.00039375, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=25457 epoch 016: 505 / 1689 loss=4.058, nll_loss=2.427, ppl=5.38, wps=460430, ups=1.06, wpb=433296, bsz=16141.3, num_updates=25800, lr=0.00039375, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=25457 epoch 016: 505 / 1689 loss=4.058, nll_loss=2.427, ppl=5.38, wps=460430, ups=1.06, wpb=433296, bsz=16141.3, num_updates=25800, lr=0.00039375, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=25457 epoch 016: 505 / 1689 loss=4.058, nll_loss=2.427, ppl=5.38, wps=460430, ups=1.06, wpb=433296, bsz=16141.3, num_updates=25800, lr=0.00039375, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=25457 epoch 016: 505 / 1689 loss=4.058, nll_loss=2.427, ppl=5.38, wps=460430, ups=1.06, wpb=433296, bsz=16141.3, num_updates=25800, lr=0.00039375, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=25457 epoch 016: 505 / 1689 loss=4.058, nll_loss=2.427, ppl=5.38, wps=460430, ups=1.06, wpb=433296, bsz=16141.3, num_updates=25800, lr=0.00039375, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=25457 epoch 016: 505 / 1689 loss=4.058, nll_loss=2.427, ppl=5.38, wps=460430, ups=1.06, wpb=433296, bsz=16141.3, num_updates=25800, lr=0.00039375, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=25457 epoch 016: 505 / 1689 loss=4.058, nll_loss=2.427, ppl=5.38, wps=460430, ups=1.06, wpb=433296, bsz=16141.3, num_updates=25800, lr=0.00039375, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=25457 epoch 016: 505 / 1689 loss=4.058, nll_loss=2.427, ppl=5.38, wps=460430, ups=1.06, wpb=433296, bsz=16141.3, num_updates=25800, lr=0.00039375, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=25457 epoch 016: 505 / 1689 loss=4.058, nll_loss=2.427, ppl=5.38, wps=460430, ups=1.06, wpb=433296, bsz=16141.3, num_updates=25800, lr=0.00039375, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=25457 epoch 016: 505 / 1689 loss=4.058, nll_loss=2.427, ppl=5.38, wps=460430, ups=1.06, wpb=433296, bsz=16141.3, num_updates=25800, lr=0.00039375, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=25457 epoch 016: 505 / 1689 loss=4.058, nll_loss=2.427, ppl=5.38, wps=460430, ups=1.06, wpb=433296, bsz=16141.3, num_updates=25800, lr=0.00039375, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=25457 epoch 016: 605 / 1689 loss=4.068, nll_loss=2.438, ppl=5.42, wps=457009, ups=1.05, wpb=433880, bsz=16444, num_updates=25900, lr=0.000392989, gnorm=0.208, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=25552 epoch 016: 605 / 1689 loss=4.068, nll_loss=2.438, ppl=5.42, wps=457009, ups=1.05, wpb=433880, bsz=16444, num_updates=25900, lr=0.000392989, gnorm=0.208, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=25552 epoch 016: 605 / 1689 loss=4.068, nll_loss=2.438, ppl=5.42, wps=457009, ups=1.05, wpb=433880, bsz=16444, num_updates=25900, lr=0.000392989, gnorm=0.208, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=25552 epoch 016: 605 / 1689 loss=4.068, nll_loss=2.438, ppl=5.42, wps=457009, ups=1.05, wpb=433880, bsz=16444, num_updates=25900, lr=0.000392989, gnorm=0.208, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=25552 epoch 016: 605 / 1689 loss=4.068, nll_loss=2.438, ppl=5.42, wps=457009, ups=1.05, wpb=433880, bsz=16444, num_updates=25900, lr=0.000392989, gnorm=0.208, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=25552 epoch 016: 605 / 1689 loss=4.068, nll_loss=2.438, ppl=5.42, wps=457009, ups=1.05, wpb=433880, bsz=16444, num_updates=25900, lr=0.000392989, gnorm=0.208, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=25552 epoch 016: 605 / 1689 loss=4.068, nll_loss=2.438, ppl=5.42, wps=457009, ups=1.05, wpb=433880, bsz=16444, num_updates=25900, lr=0.000392989, gnorm=0.208, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=25552 epoch 016: 605 / 1689 loss=4.068, nll_loss=2.438, ppl=5.42, wps=457009, ups=1.05, wpb=433880, bsz=16444, num_updates=25900, lr=0.000392989, gnorm=0.208, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=25552 epoch 016: 605 / 1689 loss=4.068, nll_loss=2.438, ppl=5.42, wps=457009, ups=1.05, wpb=433880, bsz=16444, num_updates=25900, lr=0.000392989, gnorm=0.208, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=25552 epoch 016: 605 / 1689 loss=4.068, nll_loss=2.438, ppl=5.42, wps=457009, ups=1.05, wpb=433880, bsz=16444, num_updates=25900, lr=0.000392989, gnorm=0.208, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=25552 epoch 016: 605 / 1689 loss=4.068, nll_loss=2.438, ppl=5.42, wps=457009, ups=1.05, wpb=433880, bsz=16444, num_updates=25900, lr=0.000392989, gnorm=0.208, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=25552 epoch 016: 605 / 1689 loss=4.068, nll_loss=2.438, ppl=5.42, wps=457009, ups=1.05, wpb=433880, bsz=16444, num_updates=25900, lr=0.000392989, gnorm=0.208, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=25552 epoch 016: 605 / 1689 loss=4.068, nll_loss=2.438, ppl=5.42, wps=457009, ups=1.05, wpb=433880, bsz=16444, num_updates=25900, lr=0.000392989, gnorm=0.208, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=25552 epoch 016: 605 / 1689 loss=4.068, nll_loss=2.438, ppl=5.42, wps=457009, ups=1.05, wpb=433880, bsz=16444, num_updates=25900, lr=0.000392989, gnorm=0.208, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=25552 epoch 016: 605 / 1689 loss=4.068, nll_loss=2.438, ppl=5.42, wps=457009, ups=1.05, wpb=433880, bsz=16444, num_updates=25900, lr=0.000392989, gnorm=0.208, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=25552 epoch 016: 605 / 1689 loss=4.068, nll_loss=2.438, ppl=5.42, wps=457009, ups=1.05, wpb=433880, bsz=16444, num_updates=25900, lr=0.000392989, gnorm=0.208, clip=0, loss_scale=2, train_wall=94, gb_free=19.2, wall=25552 epoch 016: 706 / 1689 loss=4.057, nll_loss=2.427, ppl=5.38, wps=454709, ups=1.05, wpb=434069, bsz=16538.4, num_updates=26000, lr=0.000392232, gnorm=0.225, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=25647 epoch 016: 706 / 1689 loss=4.057, nll_loss=2.427, ppl=5.38, wps=454709, ups=1.05, wpb=434069, bsz=16538.4, num_updates=26000, lr=0.000392232, gnorm=0.225, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=25647 epoch 016: 706 / 1689 loss=4.057, nll_loss=2.427, ppl=5.38, wps=454709, ups=1.05, wpb=434069, bsz=16538.4, num_updates=26000, lr=0.000392232, gnorm=0.225, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=25647 epoch 016: 706 / 1689 loss=4.057, nll_loss=2.427, ppl=5.38, wps=454709, ups=1.05, wpb=434069, bsz=16538.4, num_updates=26000, lr=0.000392232, gnorm=0.225, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=25647 epoch 016: 706 / 1689 loss=4.057, nll_loss=2.427, ppl=5.38, wps=454709, ups=1.05, wpb=434069, bsz=16538.4, num_updates=26000, lr=0.000392232, gnorm=0.225, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=25647 epoch 016: 706 / 1689 loss=4.057, nll_loss=2.427, ppl=5.38, wps=454709, ups=1.05, wpb=434069, bsz=16538.4, num_updates=26000, lr=0.000392232, gnorm=0.225, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=25647 epoch 016: 706 / 1689 loss=4.057, nll_loss=2.427, ppl=5.38, wps=454709, ups=1.05, wpb=434069, bsz=16538.4, num_updates=26000, lr=0.000392232, gnorm=0.225, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=25647 epoch 016: 706 / 1689 loss=4.057, nll_loss=2.427, ppl=5.38, wps=454709, ups=1.05, wpb=434069, bsz=16538.4, num_updates=26000, lr=0.000392232, gnorm=0.225, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=25647 epoch 016: 706 / 1689 loss=4.057, nll_loss=2.427, ppl=5.38, wps=454709, ups=1.05, wpb=434069, bsz=16538.4, num_updates=26000, lr=0.000392232, gnorm=0.225, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=25647 epoch 016: 706 / 1689 loss=4.057, nll_loss=2.427, ppl=5.38, wps=454709, ups=1.05, wpb=434069, bsz=16538.4, num_updates=26000, lr=0.000392232, gnorm=0.225, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=25647 epoch 016: 706 / 1689 loss=4.057, nll_loss=2.427, ppl=5.38, wps=454709, ups=1.05, wpb=434069, bsz=16538.4, num_updates=26000, lr=0.000392232, gnorm=0.225, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=25647 epoch 016: 706 / 1689 loss=4.057, nll_loss=2.427, ppl=5.38, wps=454709, ups=1.05, wpb=434069, bsz=16538.4, num_updates=26000, lr=0.000392232, gnorm=0.225, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=25647 epoch 016: 706 / 1689 loss=4.057, nll_loss=2.427, ppl=5.38, wps=454709, ups=1.05, wpb=434069, bsz=16538.4, num_updates=26000, lr=0.000392232, gnorm=0.225, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=25647 epoch 016: 706 / 1689 loss=4.057, nll_loss=2.427, ppl=5.38, wps=454709, ups=1.05, wpb=434069, bsz=16538.4, num_updates=26000, lr=0.000392232, gnorm=0.225, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=25647 epoch 016: 706 / 1689 loss=4.057, nll_loss=2.427, ppl=5.38, wps=454709, ups=1.05, wpb=434069, bsz=16538.4, num_updates=26000, lr=0.000392232, gnorm=0.225, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=25647 epoch 016: 706 / 1689 loss=4.057, nll_loss=2.427, ppl=5.38, wps=454709, ups=1.05, wpb=434069, bsz=16538.4, num_updates=26000, lr=0.000392232, gnorm=0.225, clip=0, loss_scale=1, train_wall=94, gb_free=19.3, wall=25647 begin validation on "valid" subset epoch 016 | valid on 'valid' subset | loss 4.267 | nll_loss 2.645 | ppl 6.25 | wps 0 | wpb 42662 | bsz 2032 | num_updates 26000 | best_loss 4.263 epoch 016 | valid on 'valid' subset | loss 4.267 | nll_loss 2.645 | ppl 6.25 | wps 0 | wpb 42662 | bsz 2032 | num_updates 26000 | best_loss 4.263 epoch 016 | valid on 'valid' subset | loss 4.267 | nll_loss 2.645 | ppl 6.25 | wps 0 | wpb 42662 | bsz 2032 | num_updates 26000 | best_loss 4.263 epoch 016 | valid on 'valid' subset | loss 4.267 | nll_loss 2.645 | ppl 6.25 | wps 0 | wpb 42662 | bsz 2032 | num_updates 26000 | best_loss 4.263 epoch 016 | valid on 'valid' subset | loss 4.267 | nll_loss 2.645 | ppl 6.25 | wps 0 | wpb 42662 | bsz 2032 | num_updates 26000 | best_loss 4.263 epoch 016 | valid on 'valid' subset | loss 4.267 | nll_loss 2.645 | ppl 6.25 | wps 0 | wpb 42662 | bsz 2032 | num_updates 26000 | best_loss 4.263 epoch 016 | valid on 'valid' subset | loss 4.267 | nll_loss 2.645 | ppl 6.25 | wps 0 | wpb 42662 | bsz 2032 | num_updates 26000 | best_loss 4.263 epoch 016 | valid on 'valid' subset | loss 4.267 | nll_loss 2.645 | ppl 6.25 | wps 0 | wpb 42662 | bsz 2032 | num_updates 26000 | best_loss 4.263 epoch 016 | valid on 'valid' subset | loss 4.267 | nll_loss 2.645 | ppl 6.25 | wps 0 | wpb 42662 | bsz 2032 | num_updates 26000 | best_loss 4.263 epoch 016 | valid on 'valid' subset | loss 4.267 | nll_loss 2.645 | ppl 6.25 | wps 0 | wpb 42662 | bsz 2032 | num_updates 26000 | best_loss 4.263 epoch 016 | valid on 'valid' subset | loss 4.267 | nll_loss 2.645 | ppl 6.25 | wps 0 | wpb 42662 | bsz 2032 | num_updates 26000 | best_loss 4.263 epoch 016 | valid on 'valid' subset | loss 4.267 | nll_loss 2.645 | ppl 6.25 | wps 0 | wpb 42662 | bsz 2032 | num_updates 26000 | best_loss 4.263 epoch 016 | valid on 'valid' subset | loss 4.267 | nll_loss 2.645 | ppl 6.25 | wps 0 | wpb 42662 | bsz 2032 | num_updates 26000 | best_loss 4.263 epoch 016 | valid on 'valid' subset | loss 4.267 | nll_loss 2.645 | ppl 6.25 | wps 0 | wpb 42662 | bsz 2032 | num_updates 26000 | best_loss 4.263 epoch 016 | valid on 'valid' subset | loss 4.267 | nll_loss 2.645 | ppl 6.25 | wps 0 | wpb 42662 | bsz 2032 | num_updates 26000 | best_loss 4.263 epoch 016 | valid on 'valid' subset | loss 4.267 | nll_loss 2.645 | ppl 6.25 | wps 0 | wpb 42662 | bsz 2032 | num_updates 26000 | best_loss 4.263 epoch 016: 806 / 1689 loss=4.054, nll_loss=2.424, ppl=5.36, wps=409684, ups=0.95, wpb=432262, bsz=16586.2, num_updates=26100, lr=0.00039148, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=25753 epoch 016: 806 / 1689 loss=4.054, nll_loss=2.424, ppl=5.36, wps=409684, ups=0.95, wpb=432262, bsz=16586.2, num_updates=26100, lr=0.00039148, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=25753 epoch 016: 806 / 1689 loss=4.054, nll_loss=2.424, ppl=5.36, wps=409684, ups=0.95, wpb=432262, bsz=16586.2, num_updates=26100, lr=0.00039148, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=25753 epoch 016: 806 / 1689 loss=4.054, nll_loss=2.424, ppl=5.36, wps=409684, ups=0.95, wpb=432262, bsz=16586.2, num_updates=26100, lr=0.00039148, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=25753 epoch 016: 806 / 1689 loss=4.054, nll_loss=2.424, ppl=5.36, wps=409684, ups=0.95, wpb=432262, bsz=16586.2, num_updates=26100, lr=0.00039148, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=25753 epoch 016: 806 / 1689 loss=4.054, nll_loss=2.424, ppl=5.36, wps=409684, ups=0.95, wpb=432262, bsz=16586.2, num_updates=26100, lr=0.00039148, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=25753 epoch 016: 806 / 1689 loss=4.054, nll_loss=2.424, ppl=5.36, wps=409684, ups=0.95, wpb=432262, bsz=16586.2, num_updates=26100, lr=0.00039148, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=25753 epoch 016: 806 / 1689 loss=4.054, nll_loss=2.424, ppl=5.36, wps=409684, ups=0.95, wpb=432262, bsz=16586.2, num_updates=26100, lr=0.00039148, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=25753 epoch 016: 806 / 1689 loss=4.054, nll_loss=2.424, ppl=5.36, wps=409684, ups=0.95, wpb=432262, bsz=16586.2, num_updates=26100, lr=0.00039148, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=25753 epoch 016: 806 / 1689 loss=4.054, nll_loss=2.424, ppl=5.36, wps=409684, ups=0.95, wpb=432262, bsz=16586.2, num_updates=26100, lr=0.00039148, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=25753 epoch 016: 806 / 1689 loss=4.054, nll_loss=2.424, ppl=5.36, wps=409684, ups=0.95, wpb=432262, bsz=16586.2, num_updates=26100, lr=0.00039148, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=25753 epoch 016: 806 / 1689 loss=4.054, nll_loss=2.424, ppl=5.36, wps=409684, ups=0.95, wpb=432262, bsz=16586.2, num_updates=26100, lr=0.00039148, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=25753 epoch 016: 806 / 1689 loss=4.054, nll_loss=2.424, ppl=5.36, wps=409684, ups=0.95, wpb=432262, bsz=16586.2, num_updates=26100, lr=0.00039148, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=25753 epoch 016: 806 / 1689 loss=4.054, nll_loss=2.424, ppl=5.36, wps=409684, ups=0.95, wpb=432262, bsz=16586.2, num_updates=26100, lr=0.00039148, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=25753 epoch 016: 806 / 1689 loss=4.054, nll_loss=2.424, ppl=5.36, wps=409684, ups=0.95, wpb=432262, bsz=16586.2, num_updates=26100, lr=0.00039148, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=25753 epoch 016: 806 / 1689 loss=4.054, nll_loss=2.424, ppl=5.36, wps=409684, ups=0.95, wpb=432262, bsz=16586.2, num_updates=26100, lr=0.00039148, gnorm=0.219, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=25753 epoch 016: 906 / 1689 loss=4.066, nll_loss=2.436, ppl=5.41, wps=465035, ups=1.07, wpb=434865, bsz=16746.4, num_updates=26200, lr=0.000390732, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=25846 epoch 016: 906 / 1689 loss=4.066, nll_loss=2.436, ppl=5.41, wps=465035, ups=1.07, wpb=434865, bsz=16746.4, num_updates=26200, lr=0.000390732, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=25846 epoch 016: 906 / 1689 loss=4.066, nll_loss=2.436, ppl=5.41, wps=465035, ups=1.07, wpb=434865, bsz=16746.4, num_updates=26200, lr=0.000390732, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=25846 epoch 016: 906 / 1689 loss=4.066, nll_loss=2.436, ppl=5.41, wps=465035, ups=1.07, wpb=434865, bsz=16746.4, num_updates=26200, lr=0.000390732, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=25846 epoch 016: 906 / 1689 loss=4.066, nll_loss=2.436, ppl=5.41, wps=465035, ups=1.07, wpb=434865, bsz=16746.4, num_updates=26200, lr=0.000390732, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=25846 epoch 016: 906 / 1689 loss=4.066, nll_loss=2.436, ppl=5.41, wps=465035, ups=1.07, wpb=434865, bsz=16746.4, num_updates=26200, lr=0.000390732, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=25846 epoch 016: 906 / 1689 loss=4.066, nll_loss=2.436, ppl=5.41, wps=465035, ups=1.07, wpb=434865, bsz=16746.4, num_updates=26200, lr=0.000390732, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=25846 epoch 016: 906 / 1689 loss=4.066, nll_loss=2.436, ppl=5.41, wps=465035, ups=1.07, wpb=434865, bsz=16746.4, num_updates=26200, lr=0.000390732, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=25846 epoch 016: 906 / 1689 loss=4.066, nll_loss=2.436, ppl=5.41, wps=465035, ups=1.07, wpb=434865, bsz=16746.4, num_updates=26200, lr=0.000390732, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=25846 epoch 016: 906 / 1689 loss=4.066, nll_loss=2.436, ppl=5.41, wps=465035, ups=1.07, wpb=434865, bsz=16746.4, num_updates=26200, lr=0.000390732, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=25846 epoch 016: 906 / 1689 loss=4.066, nll_loss=2.436, ppl=5.41, wps=465035, ups=1.07, wpb=434865, bsz=16746.4, num_updates=26200, lr=0.000390732, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=25846 epoch 016: 906 / 1689 loss=4.066, nll_loss=2.436, ppl=5.41, wps=465035, ups=1.07, wpb=434865, bsz=16746.4, num_updates=26200, lr=0.000390732, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=25846 epoch 016: 906 / 1689 loss=4.066, nll_loss=2.436, ppl=5.41, wps=465035, ups=1.07, wpb=434865, bsz=16746.4, num_updates=26200, lr=0.000390732, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=25846 epoch 016: 906 / 1689 loss=4.066, nll_loss=2.436, ppl=5.41, wps=465035, ups=1.07, wpb=434865, bsz=16746.4, num_updates=26200, lr=0.000390732, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=25846 epoch 016: 906 / 1689 loss=4.066, nll_loss=2.436, ppl=5.41, wps=465035, ups=1.07, wpb=434865, bsz=16746.4, num_updates=26200, lr=0.000390732, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=25846 epoch 016: 906 / 1689 loss=4.066, nll_loss=2.436, ppl=5.41, wps=465035, ups=1.07, wpb=434865, bsz=16746.4, num_updates=26200, lr=0.000390732, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=25846 epoch 016: 1006 / 1689 loss=4.066, nll_loss=2.437, ppl=5.41, wps=465470, ups=1.07, wpb=433897, bsz=16322, num_updates=26300, lr=0.000389989, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=25939 epoch 016: 1006 / 1689 loss=4.066, nll_loss=2.437, ppl=5.41, wps=465470, ups=1.07, wpb=433897, bsz=16322, num_updates=26300, lr=0.000389989, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=25939 epoch 016: 1006 / 1689 loss=4.066, nll_loss=2.437, ppl=5.41, wps=465470, ups=1.07, wpb=433897, bsz=16322, num_updates=26300, lr=0.000389989, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=25939 epoch 016: 1006 / 1689 loss=4.066, nll_loss=2.437, ppl=5.41, wps=465470, ups=1.07, wpb=433897, bsz=16322, num_updates=26300, lr=0.000389989, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=25939 epoch 016: 1006 / 1689 loss=4.066, nll_loss=2.437, ppl=5.41, wps=465470, ups=1.07, wpb=433897, bsz=16322, num_updates=26300, lr=0.000389989, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=25939 epoch 016: 1006 / 1689 loss=4.066, nll_loss=2.437, ppl=5.41, wps=465470, ups=1.07, wpb=433897, bsz=16322, num_updates=26300, lr=0.000389989, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=25939 epoch 016: 1006 / 1689 loss=4.066, nll_loss=2.437, ppl=5.41, wps=465470, ups=1.07, wpb=433897, bsz=16322, num_updates=26300, lr=0.000389989, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=25939 epoch 016: 1006 / 1689 loss=4.066, nll_loss=2.437, ppl=5.41, wps=465470, ups=1.07, wpb=433897, bsz=16322, num_updates=26300, lr=0.000389989, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=25939 epoch 016: 1006 / 1689 loss=4.066, nll_loss=2.437, ppl=5.41, wps=465470, ups=1.07, wpb=433897, bsz=16322, num_updates=26300, lr=0.000389989, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=25939 epoch 016: 1006 / 1689 loss=4.066, nll_loss=2.437, ppl=5.41, wps=465470, ups=1.07, wpb=433897, bsz=16322, num_updates=26300, lr=0.000389989, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=25939 epoch 016: 1006 / 1689 loss=4.066, nll_loss=2.437, ppl=5.41, wps=465470, ups=1.07, wpb=433897, bsz=16322, num_updates=26300, lr=0.000389989, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=25939 epoch 016: 1006 / 1689 loss=4.066, nll_loss=2.437, ppl=5.41, wps=465470, ups=1.07, wpb=433897, bsz=16322, num_updates=26300, lr=0.000389989, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=25939 epoch 016: 1006 / 1689 loss=4.066, nll_loss=2.437, ppl=5.41, wps=465470, ups=1.07, wpb=433897, bsz=16322, num_updates=26300, lr=0.000389989, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=25939 epoch 016: 1006 / 1689 loss=4.066, nll_loss=2.437, ppl=5.41, wps=465470, ups=1.07, wpb=433897, bsz=16322, num_updates=26300, lr=0.000389989, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=25939 epoch 016: 1006 / 1689 loss=4.066, nll_loss=2.437, ppl=5.41, wps=465470, ups=1.07, wpb=433897, bsz=16322, num_updates=26300, lr=0.000389989, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=25939 epoch 016: 1006 / 1689 loss=4.066, nll_loss=2.437, ppl=5.41, wps=465470, ups=1.07, wpb=433897, bsz=16322, num_updates=26300, lr=0.000389989, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=25939 epoch 016: 1106 / 1689 loss=4.072, nll_loss=2.443, ppl=5.44, wps=465229, ups=1.07, wpb=433440, bsz=16557.8, num_updates=26400, lr=0.000389249, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=26032 epoch 016: 1106 / 1689 loss=4.072, nll_loss=2.443, ppl=5.44, wps=465229, ups=1.07, wpb=433440, bsz=16557.8, num_updates=26400, lr=0.000389249, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=26032 epoch 016: 1106 / 1689 loss=4.072, nll_loss=2.443, ppl=5.44, wps=465229, ups=1.07, wpb=433440, bsz=16557.8, num_updates=26400, lr=0.000389249, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=26032 epoch 016: 1106 / 1689 loss=4.072, nll_loss=2.443, ppl=5.44, wps=465229, ups=1.07, wpb=433440, bsz=16557.8, num_updates=26400, lr=0.000389249, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=26032 epoch 016: 1106 / 1689 loss=4.072, nll_loss=2.443, ppl=5.44, wps=465229, ups=1.07, wpb=433440, bsz=16557.8, num_updates=26400, lr=0.000389249, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=26032 epoch 016: 1106 / 1689 loss=4.072, nll_loss=2.443, ppl=5.44, wps=465229, ups=1.07, wpb=433440, bsz=16557.8, num_updates=26400, lr=0.000389249, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=26032 epoch 016: 1106 / 1689 loss=4.072, nll_loss=2.443, ppl=5.44, wps=465229, ups=1.07, wpb=433440, bsz=16557.8, num_updates=26400, lr=0.000389249, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=26032 epoch 016: 1106 / 1689 loss=4.072, nll_loss=2.443, ppl=5.44, wps=465229, ups=1.07, wpb=433440, bsz=16557.8, num_updates=26400, lr=0.000389249, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=26032 epoch 016: 1106 / 1689 loss=4.072, nll_loss=2.443, ppl=5.44, wps=465229, ups=1.07, wpb=433440, bsz=16557.8, num_updates=26400, lr=0.000389249, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=26032 epoch 016: 1106 / 1689 loss=4.072, nll_loss=2.443, ppl=5.44, wps=465229, ups=1.07, wpb=433440, bsz=16557.8, num_updates=26400, lr=0.000389249, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=26032 epoch 016: 1106 / 1689 loss=4.072, nll_loss=2.443, ppl=5.44, wps=465229, ups=1.07, wpb=433440, bsz=16557.8, num_updates=26400, lr=0.000389249, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=26032 epoch 016: 1106 / 1689 loss=4.072, nll_loss=2.443, ppl=5.44, wps=465229, ups=1.07, wpb=433440, bsz=16557.8, num_updates=26400, lr=0.000389249, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=26032 epoch 016: 1106 / 1689 loss=4.072, nll_loss=2.443, ppl=5.44, wps=465229, ups=1.07, wpb=433440, bsz=16557.8, num_updates=26400, lr=0.000389249, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=26032 epoch 016: 1106 / 1689 loss=4.072, nll_loss=2.443, ppl=5.44, wps=465229, ups=1.07, wpb=433440, bsz=16557.8, num_updates=26400, lr=0.000389249, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=26032 epoch 016: 1106 / 1689 loss=4.072, nll_loss=2.443, ppl=5.44, wps=465229, ups=1.07, wpb=433440, bsz=16557.8, num_updates=26400, lr=0.000389249, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=26032 epoch 016: 1106 / 1689 loss=4.072, nll_loss=2.443, ppl=5.44, wps=465229, ups=1.07, wpb=433440, bsz=16557.8, num_updates=26400, lr=0.000389249, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=26032 epoch 016: 1206 / 1689 loss=4.072, nll_loss=2.444, ppl=5.44, wps=462946, ups=1.07, wpb=433887, bsz=17106.2, num_updates=26500, lr=0.000388514, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=26126 epoch 016: 1206 / 1689 loss=4.072, nll_loss=2.444, ppl=5.44, wps=462946, ups=1.07, wpb=433887, bsz=17106.2, num_updates=26500, lr=0.000388514, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=26126 epoch 016: 1206 / 1689 loss=4.072, nll_loss=2.444, ppl=5.44, wps=462946, ups=1.07, wpb=433887, bsz=17106.2, num_updates=26500, lr=0.000388514, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=26126 epoch 016: 1206 / 1689 loss=4.072, nll_loss=2.444, ppl=5.44, wps=462946, ups=1.07, wpb=433887, bsz=17106.2, num_updates=26500, lr=0.000388514, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=26126 epoch 016: 1206 / 1689 loss=4.072, nll_loss=2.444, ppl=5.44, wps=462946, ups=1.07, wpb=433887, bsz=17106.2, num_updates=26500, lr=0.000388514, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=26126 epoch 016: 1206 / 1689 loss=4.072, nll_loss=2.444, ppl=5.44, wps=462946, ups=1.07, wpb=433887, bsz=17106.2, num_updates=26500, lr=0.000388514, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=26126 epoch 016: 1206 / 1689 loss=4.072, nll_loss=2.444, ppl=5.44, wps=462946, ups=1.07, wpb=433887, bsz=17106.2, num_updates=26500, lr=0.000388514, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=26126 epoch 016: 1206 / 1689 loss=4.072, nll_loss=2.444, ppl=5.44, wps=462946, ups=1.07, wpb=433887, bsz=17106.2, num_updates=26500, lr=0.000388514, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=26126 epoch 016: 1206 / 1689 loss=4.072, nll_loss=2.444, ppl=5.44, wps=462946, ups=1.07, wpb=433887, bsz=17106.2, num_updates=26500, lr=0.000388514, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=26126 epoch 016: 1206 / 1689 loss=4.072, nll_loss=2.444, ppl=5.44, wps=462946, ups=1.07, wpb=433887, bsz=17106.2, num_updates=26500, lr=0.000388514, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=26126 epoch 016: 1206 / 1689 loss=4.072, nll_loss=2.444, ppl=5.44, wps=462946, ups=1.07, wpb=433887, bsz=17106.2, num_updates=26500, lr=0.000388514, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=26126 epoch 016: 1206 / 1689 loss=4.072, nll_loss=2.444, ppl=5.44, wps=462946, ups=1.07, wpb=433887, bsz=17106.2, num_updates=26500, lr=0.000388514, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=26126 epoch 016: 1206 / 1689 loss=4.072, nll_loss=2.444, ppl=5.44, wps=462946, ups=1.07, wpb=433887, bsz=17106.2, num_updates=26500, lr=0.000388514, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=26126 epoch 016: 1206 / 1689 loss=4.072, nll_loss=2.444, ppl=5.44, wps=462946, ups=1.07, wpb=433887, bsz=17106.2, num_updates=26500, lr=0.000388514, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=26126 epoch 016: 1206 / 1689 loss=4.072, nll_loss=2.444, ppl=5.44, wps=462946, ups=1.07, wpb=433887, bsz=17106.2, num_updates=26500, lr=0.000388514, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=26126 epoch 016: 1206 / 1689 loss=4.072, nll_loss=2.444, ppl=5.44, wps=462946, ups=1.07, wpb=433887, bsz=17106.2, num_updates=26500, lr=0.000388514, gnorm=0.211, clip=0, loss_scale=2, train_wall=93, gb_free=19.6, wall=26126 epoch 016: 1307 / 1689 loss=4.072, nll_loss=2.444, ppl=5.44, wps=454765, ups=1.05, wpb=433282, bsz=16695.1, num_updates=26600, lr=0.000387783, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=26222 epoch 016: 1307 / 1689 loss=4.072, nll_loss=2.444, ppl=5.44, wps=454765, ups=1.05, wpb=433282, bsz=16695.1, num_updates=26600, lr=0.000387783, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=26222 epoch 016: 1307 / 1689 loss=4.072, nll_loss=2.444, ppl=5.44, wps=454765, ups=1.05, wpb=433282, bsz=16695.1, num_updates=26600, lr=0.000387783, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=26222 epoch 016: 1307 / 1689 loss=4.072, nll_loss=2.444, ppl=5.44, wps=454765, ups=1.05, wpb=433282, bsz=16695.1, num_updates=26600, lr=0.000387783, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=26222 epoch 016: 1307 / 1689 loss=4.072, nll_loss=2.444, ppl=5.44, wps=454765, ups=1.05, wpb=433282, bsz=16695.1, num_updates=26600, lr=0.000387783, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=26222 epoch 016: 1307 / 1689 loss=4.072, nll_loss=2.444, ppl=5.44, wps=454765, ups=1.05, wpb=433282, bsz=16695.1, num_updates=26600, lr=0.000387783, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=26222 epoch 016: 1307 / 1689 loss=4.072, nll_loss=2.444, ppl=5.44, wps=454765, ups=1.05, wpb=433282, bsz=16695.1, num_updates=26600, lr=0.000387783, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=26222 epoch 016: 1307 / 1689 loss=4.072, nll_loss=2.444, ppl=5.44, wps=454765, ups=1.05, wpb=433282, bsz=16695.1, num_updates=26600, lr=0.000387783, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=26222 epoch 016: 1307 / 1689 loss=4.072, nll_loss=2.444, ppl=5.44, wps=454765, ups=1.05, wpb=433282, bsz=16695.1, num_updates=26600, lr=0.000387783, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=26222 epoch 016: 1307 / 1689 loss=4.072, nll_loss=2.444, ppl=5.44, wps=454765, ups=1.05, wpb=433282, bsz=16695.1, num_updates=26600, lr=0.000387783, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=26222 epoch 016: 1307 / 1689 loss=4.072, nll_loss=2.444, ppl=5.44, wps=454765, ups=1.05, wpb=433282, bsz=16695.1, num_updates=26600, lr=0.000387783, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=26222 epoch 016: 1307 / 1689 loss=4.072, nll_loss=2.444, ppl=5.44, wps=454765, ups=1.05, wpb=433282, bsz=16695.1, num_updates=26600, lr=0.000387783, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=26222 epoch 016: 1307 / 1689 loss=4.072, nll_loss=2.444, ppl=5.44, wps=454765, ups=1.05, wpb=433282, bsz=16695.1, num_updates=26600, lr=0.000387783, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=26222 epoch 016: 1307 / 1689 loss=4.072, nll_loss=2.444, ppl=5.44, wps=454765, ups=1.05, wpb=433282, bsz=16695.1, num_updates=26600, lr=0.000387783, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=26222 epoch 016: 1307 / 1689 loss=4.072, nll_loss=2.444, ppl=5.44, wps=454765, ups=1.05, wpb=433282, bsz=16695.1, num_updates=26600, lr=0.000387783, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=26222 epoch 016: 1307 / 1689 loss=4.072, nll_loss=2.444, ppl=5.44, wps=454765, ups=1.05, wpb=433282, bsz=16695.1, num_updates=26600, lr=0.000387783, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=26222 epoch 016: 1407 / 1689 loss=4.063, nll_loss=2.434, ppl=5.4, wps=458403, ups=1.06, wpb=433360, bsz=16513.4, num_updates=26700, lr=0.000387056, gnorm=0.213, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=26316 epoch 016: 1407 / 1689 loss=4.063, nll_loss=2.434, ppl=5.4, wps=458403, ups=1.06, wpb=433360, bsz=16513.4, num_updates=26700, lr=0.000387056, gnorm=0.213, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=26316 epoch 016: 1407 / 1689 loss=4.063, nll_loss=2.434, ppl=5.4, wps=458403, ups=1.06, wpb=433360, bsz=16513.4, num_updates=26700, lr=0.000387056, gnorm=0.213, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=26316 epoch 016: 1407 / 1689 loss=4.063, nll_loss=2.434, ppl=5.4, wps=458403, ups=1.06, wpb=433360, bsz=16513.4, num_updates=26700, lr=0.000387056, gnorm=0.213, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=26316 epoch 016: 1407 / 1689 loss=4.063, nll_loss=2.434, ppl=5.4, wps=458403, ups=1.06, wpb=433360, bsz=16513.4, num_updates=26700, lr=0.000387056, gnorm=0.213, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=26316 epoch 016: 1407 / 1689 loss=4.063, nll_loss=2.434, ppl=5.4, wps=458403, ups=1.06, wpb=433360, bsz=16513.4, num_updates=26700, lr=0.000387056, gnorm=0.213, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=26316 epoch 016: 1407 / 1689 loss=4.063, nll_loss=2.434, ppl=5.4, wps=458403, ups=1.06, wpb=433360, bsz=16513.4, num_updates=26700, lr=0.000387056, gnorm=0.213, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=26316 epoch 016: 1407 / 1689 loss=4.063, nll_loss=2.434, ppl=5.4, wps=458403, ups=1.06, wpb=433360, bsz=16513.4, num_updates=26700, lr=0.000387056, gnorm=0.213, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=26316 epoch 016: 1407 / 1689 loss=4.063, nll_loss=2.434, ppl=5.4, wps=458403, ups=1.06, wpb=433360, bsz=16513.4, num_updates=26700, lr=0.000387056, gnorm=0.213, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=26316 epoch 016: 1407 / 1689 loss=4.063, nll_loss=2.434, ppl=5.4, wps=458403, ups=1.06, wpb=433360, bsz=16513.4, num_updates=26700, lr=0.000387056, gnorm=0.213, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=26316 epoch 016: 1407 / 1689 loss=4.063, nll_loss=2.434, ppl=5.4, wps=458403, ups=1.06, wpb=433360, bsz=16513.4, num_updates=26700, lr=0.000387056, gnorm=0.213, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=26316 epoch 016: 1407 / 1689 loss=4.063, nll_loss=2.434, ppl=5.4, wps=458403, ups=1.06, wpb=433360, bsz=16513.4, num_updates=26700, lr=0.000387056, gnorm=0.213, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=26316 epoch 016: 1407 / 1689 loss=4.063, nll_loss=2.434, ppl=5.4, wps=458403, ups=1.06, wpb=433360, bsz=16513.4, num_updates=26700, lr=0.000387056, gnorm=0.213, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=26316 epoch 016: 1407 / 1689 loss=4.063, nll_loss=2.434, ppl=5.4, wps=458403, ups=1.06, wpb=433360, bsz=16513.4, num_updates=26700, lr=0.000387056, gnorm=0.213, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=26316 epoch 016: 1407 / 1689 loss=4.063, nll_loss=2.434, ppl=5.4, wps=458403, ups=1.06, wpb=433360, bsz=16513.4, num_updates=26700, lr=0.000387056, gnorm=0.213, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=26316 epoch 016: 1407 / 1689 loss=4.063, nll_loss=2.434, ppl=5.4, wps=458403, ups=1.06, wpb=433360, bsz=16513.4, num_updates=26700, lr=0.000387056, gnorm=0.213, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=26316 epoch 016: 1507 / 1689 loss=4.067, nll_loss=2.438, ppl=5.42, wps=458731, ups=1.06, wpb=432748, bsz=16473, num_updates=26800, lr=0.000386334, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=26410 epoch 016: 1507 / 1689 loss=4.067, nll_loss=2.438, ppl=5.42, wps=458731, ups=1.06, wpb=432748, bsz=16473, num_updates=26800, lr=0.000386334, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=26410 epoch 016: 1507 / 1689 loss=4.067, nll_loss=2.438, ppl=5.42, wps=458731, ups=1.06, wpb=432748, bsz=16473, num_updates=26800, lr=0.000386334, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=26410 epoch 016: 1507 / 1689 loss=4.067, nll_loss=2.438, ppl=5.42, wps=458731, ups=1.06, wpb=432748, bsz=16473, num_updates=26800, lr=0.000386334, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=26410 epoch 016: 1507 / 1689 loss=4.067, nll_loss=2.438, ppl=5.42, wps=458731, ups=1.06, wpb=432748, bsz=16473, num_updates=26800, lr=0.000386334, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=26410 epoch 016: 1507 / 1689 loss=4.067, nll_loss=2.438, ppl=5.42, wps=458731, ups=1.06, wpb=432748, bsz=16473, num_updates=26800, lr=0.000386334, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=26410 epoch 016: 1507 / 1689 loss=4.067, nll_loss=2.438, ppl=5.42, wps=458731, ups=1.06, wpb=432748, bsz=16473, num_updates=26800, lr=0.000386334, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=26410 epoch 016: 1507 / 1689 loss=4.067, nll_loss=2.438, ppl=5.42, wps=458731, ups=1.06, wpb=432748, bsz=16473, num_updates=26800, lr=0.000386334, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=26410 epoch 016: 1507 / 1689 loss=4.067, nll_loss=2.438, ppl=5.42, wps=458731, ups=1.06, wpb=432748, bsz=16473, num_updates=26800, lr=0.000386334, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=26410 epoch 016: 1507 / 1689 loss=4.067, nll_loss=2.438, ppl=5.42, wps=458731, ups=1.06, wpb=432748, bsz=16473, num_updates=26800, lr=0.000386334, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=26410 epoch 016: 1507 / 1689 loss=4.067, nll_loss=2.438, ppl=5.42, wps=458731, ups=1.06, wpb=432748, bsz=16473, num_updates=26800, lr=0.000386334, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=26410 epoch 016: 1507 / 1689 loss=4.067, nll_loss=2.438, ppl=5.42, wps=458731, ups=1.06, wpb=432748, bsz=16473, num_updates=26800, lr=0.000386334, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=26410 epoch 016: 1507 / 1689 loss=4.067, nll_loss=2.438, ppl=5.42, wps=458731, ups=1.06, wpb=432748, bsz=16473, num_updates=26800, lr=0.000386334, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=26410 epoch 016: 1507 / 1689 loss=4.067, nll_loss=2.438, ppl=5.42, wps=458731, ups=1.06, wpb=432748, bsz=16473, num_updates=26800, lr=0.000386334, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=26410 epoch 016: 1507 / 1689 loss=4.067, nll_loss=2.438, ppl=5.42, wps=458731, ups=1.06, wpb=432748, bsz=16473, num_updates=26800, lr=0.000386334, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=26410 epoch 016: 1507 / 1689 loss=4.067, nll_loss=2.438, ppl=5.42, wps=458731, ups=1.06, wpb=432748, bsz=16473, num_updates=26800, lr=0.000386334, gnorm=0.216, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=26410 epoch 016: 1607 / 1689 loss=4.078, nll_loss=2.451, ppl=5.47, wps=454903, ups=1.05, wpb=432678, bsz=16404.6, num_updates=26900, lr=0.000385615, gnorm=0.228, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=26505 epoch 016: 1607 / 1689 loss=4.078, nll_loss=2.451, ppl=5.47, wps=454903, ups=1.05, wpb=432678, bsz=16404.6, num_updates=26900, lr=0.000385615, gnorm=0.228, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=26505 epoch 016: 1607 / 1689 loss=4.078, nll_loss=2.451, ppl=5.47, wps=454903, ups=1.05, wpb=432678, bsz=16404.6, num_updates=26900, lr=0.000385615, gnorm=0.228, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=26505 epoch 016: 1607 / 1689 loss=4.078, nll_loss=2.451, ppl=5.47, wps=454903, ups=1.05, wpb=432678, bsz=16404.6, num_updates=26900, lr=0.000385615, gnorm=0.228, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=26505 epoch 016: 1607 / 1689 loss=4.078, nll_loss=2.451, ppl=5.47, wps=454903, ups=1.05, wpb=432678, bsz=16404.6, num_updates=26900, lr=0.000385615, gnorm=0.228, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=26505 epoch 016: 1607 / 1689 loss=4.078, nll_loss=2.451, ppl=5.47, wps=454903, ups=1.05, wpb=432678, bsz=16404.6, num_updates=26900, lr=0.000385615, gnorm=0.228, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=26505 epoch 016: 1607 / 1689 loss=4.078, nll_loss=2.451, ppl=5.47, wps=454903, ups=1.05, wpb=432678, bsz=16404.6, num_updates=26900, lr=0.000385615, gnorm=0.228, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=26505 epoch 016: 1607 / 1689 loss=4.078, nll_loss=2.451, ppl=5.47, wps=454903, ups=1.05, wpb=432678, bsz=16404.6, num_updates=26900, lr=0.000385615, gnorm=0.228, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=26505 epoch 016: 1607 / 1689 loss=4.078, nll_loss=2.451, ppl=5.47, wps=454903, ups=1.05, wpb=432678, bsz=16404.6, num_updates=26900, lr=0.000385615, gnorm=0.228, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=26505 epoch 016: 1607 / 1689 loss=4.078, nll_loss=2.451, ppl=5.47, wps=454903, ups=1.05, wpb=432678, bsz=16404.6, num_updates=26900, lr=0.000385615, gnorm=0.228, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=26505 epoch 016: 1607 / 1689 loss=4.078, nll_loss=2.451, ppl=5.47, wps=454903, ups=1.05, wpb=432678, bsz=16404.6, num_updates=26900, lr=0.000385615, gnorm=0.228, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=26505 epoch 016: 1607 / 1689 loss=4.078, nll_loss=2.451, ppl=5.47, wps=454903, ups=1.05, wpb=432678, bsz=16404.6, num_updates=26900, lr=0.000385615, gnorm=0.228, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=26505 epoch 016: 1607 / 1689 loss=4.078, nll_loss=2.451, ppl=5.47, wps=454903, ups=1.05, wpb=432678, bsz=16404.6, num_updates=26900, lr=0.000385615, gnorm=0.228, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=26505 epoch 016: 1607 / 1689 loss=4.078, nll_loss=2.451, ppl=5.47, wps=454903, ups=1.05, wpb=432678, bsz=16404.6, num_updates=26900, lr=0.000385615, gnorm=0.228, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=26505 epoch 016: 1607 / 1689 loss=4.078, nll_loss=2.451, ppl=5.47, wps=454903, ups=1.05, wpb=432678, bsz=16404.6, num_updates=26900, lr=0.000385615, gnorm=0.228, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=26505 epoch 016: 1607 / 1689 loss=4.078, nll_loss=2.451, ppl=5.47, wps=454903, ups=1.05, wpb=432678, bsz=16404.6, num_updates=26900, lr=0.000385615, gnorm=0.228, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=26505 end of epoch 16 (average epoch stats below) epoch 016 | loss 4.063 | nll_loss 2.433 | ppl 5.4 | wps 456008 | ups 1.05 | wpb 433566 | bsz 16508.6 | num_updates 26982 | lr 0.000385029 | gnorm 0.213 | clip 0 | loss_scale 1 | train_wall 1568 | gb_free 20.2 | wall 26582 epoch 016 | loss 4.063 | nll_loss 2.433 | ppl 5.4 | wps 456008 | ups 1.05 | wpb 433566 | bsz 16508.6 | num_updates 26982 | lr 0.000385029 | gnorm 0.213 | clip 0 | loss_scale 1 | train_wall 1568 | gb_free 20.2 | wall 26582 epoch 016 | loss 4.063 | nll_loss 2.433 | ppl 5.4 | wps 456008 | ups 1.05 | wpb 433566 | bsz 16508.6 | num_updates 26982 | lr 0.000385029 | gnorm 0.213 | clip 0 | loss_scale 1 | train_wall 1568 | gb_free 20.2 | wall 26582 epoch 016 | loss 4.063 | nll_loss 2.433 | ppl 5.4 | wps 456008 | ups 1.05 | wpb 433566 | bsz 16508.6 | num_updates 26982 | lr 0.000385029 | gnorm 0.213 | clip 0 | loss_scale 1 | train_wall 1568 | gb_free 20.2 | wall 26582 epoch 016 | loss 4.063 | nll_loss 2.433 | ppl 5.4 | wps 456008 | ups 1.05 | wpb 433566 | bsz 16508.6 | num_updates 26982 | lr 0.000385029 | gnorm 0.213 | clip 0 | loss_scale 1 | train_wall 1568 | gb_free 20.2 | wall 26582 epoch 016 | loss 4.063 | nll_loss 2.433 | ppl 5.4 | wps 456008 | ups 1.05 | wpb 433566 | bsz 16508.6 | num_updates 26982 | lr 0.000385029 | gnorm 0.213 | clip 0 | loss_scale 1 | train_wall 1568 | gb_free 20.2 | wall 26582 epoch 016 | loss 4.063 | nll_loss 2.433 | ppl 5.4 | wps 456008 | ups 1.05 | wpb 433566 | bsz 16508.6 | num_updates 26982 | lr 0.000385029 | gnorm 0.213 | clip 0 | loss_scale 1 | train_wall 1568 | gb_free 20.2 | wall 26582 epoch 016 | loss 4.063 | nll_loss 2.433 | ppl 5.4 | wps 456008 | ups 1.05 | wpb 433566 | bsz 16508.6 | num_updates 26982 | lr 0.000385029 | gnorm 0.213 | clip 0 | loss_scale 1 | train_wall 1568 | gb_free 20.2 | wall 26582 epoch 016 | loss 4.063 | nll_loss 2.433 | ppl 5.4 | wps 456008 | ups 1.05 | wpb 433566 | bsz 16508.6 | num_updates 26982 | lr 0.000385029 | gnorm 0.213 | clip 0 | loss_scale 1 | train_wall 1568 | gb_free 20.2 | wall 26582 epoch 016 | loss 4.063 | nll_loss 2.433 | ppl 5.4 | wps 456008 | ups 1.05 | wpb 433566 | bsz 16508.6 | num_updates 26982 | lr 0.000385029 | gnorm 0.213 | clip 0 | loss_scale 1 | train_wall 1568 | gb_free 20.2 | wall 26582 epoch 016 | loss 4.063 | nll_loss 2.433 | ppl 5.4 | wps 456008 | ups 1.05 | wpb 433566 | bsz 16508.6 | num_updates 26982 | lr 0.000385029 | gnorm 0.213 | clip 0 | loss_scale 1 | train_wall 1568 | gb_free 20.2 | wall 26582 epoch 016 | loss 4.063 | nll_loss 2.433 | ppl 5.4 | wps 456008 | ups 1.05 | wpb 433566 | bsz 16508.6 | num_updates 26982 | lr 0.000385029 | gnorm 0.213 | clip 0 | loss_scale 1 | train_wall 1568 | gb_free 20.2 | wall 26582 epoch 016 | loss 4.063 | nll_loss 2.433 | ppl 5.4 | wps 456008 | ups 1.05 | wpb 433566 | bsz 16508.6 | num_updates 26982 | lr 0.000385029 | gnorm 0.213 | clip 0 | loss_scale 1 | train_wall 1568 | gb_free 20.2 | wall 26582 epoch 016 | loss 4.063 | nll_loss 2.433 | ppl 5.4 | wps 456008 | ups 1.05 | wpb 433566 | bsz 16508.6 | num_updates 26982 | lr 0.000385029 | gnorm 0.213 | clip 0 | loss_scale 1 | train_wall 1568 | gb_free 20.2 | wall 26582 epoch 016 | loss 4.063 | nll_loss 2.433 | ppl 5.4 | wps 456008 | ups 1.05 | wpb 433566 | bsz 16508.6 | num_updates 26982 | lr 0.000385029 | gnorm 0.213 | clip 0 | loss_scale 1 | train_wall 1568 | gb_free 20.2 | wall 26582 epoch 016 | loss 4.063 | nll_loss 2.433 | ppl 5.4 | wps 456008 | ups 1.05 | wpb 433566 | bsz 16508.6 | num_updates 26982 | lr 0.000385029 | gnorm 0.213 | clip 0 | loss_scale 1 | train_wall 1568 | gb_free 20.2 | wall 26582 Start iterating over samples epoch 017: 18 / 1689 loss=4.081, nll_loss=2.454, ppl=5.48, wps=451158, ups=1.04, wpb=432358, bsz=16604.9, num_updates=27000, lr=0.0003849, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=26601 epoch 017: 18 / 1689 loss=4.081, nll_loss=2.454, ppl=5.48, wps=451158, ups=1.04, wpb=432358, bsz=16604.9, num_updates=27000, lr=0.0003849, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=26601 epoch 017: 18 / 1689 loss=4.081, nll_loss=2.454, ppl=5.48, wps=451158, ups=1.04, wpb=432358, bsz=16604.9, num_updates=27000, lr=0.0003849, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=26601 epoch 017: 18 / 1689 loss=4.081, nll_loss=2.454, ppl=5.48, wps=451158, ups=1.04, wpb=432358, bsz=16604.9, num_updates=27000, lr=0.0003849, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=26601 epoch 017: 18 / 1689 loss=4.081, nll_loss=2.454, ppl=5.48, wps=451158, ups=1.04, wpb=432358, bsz=16604.9, num_updates=27000, lr=0.0003849, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=26601 epoch 017: 18 / 1689 loss=4.081, nll_loss=2.454, ppl=5.48, wps=451158, ups=1.04, wpb=432358, bsz=16604.9, num_updates=27000, lr=0.0003849, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=26601 epoch 017: 18 / 1689 loss=4.081, nll_loss=2.454, ppl=5.48, wps=451158, ups=1.04, wpb=432358, bsz=16604.9, num_updates=27000, lr=0.0003849, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=26601 epoch 017: 18 / 1689 loss=4.081, nll_loss=2.454, ppl=5.48, wps=451158, ups=1.04, wpb=432358, bsz=16604.9, num_updates=27000, lr=0.0003849, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=26601 epoch 017: 18 / 1689 loss=4.081, nll_loss=2.454, ppl=5.48, wps=451158, ups=1.04, wpb=432358, bsz=16604.9, num_updates=27000, lr=0.0003849, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=26601 epoch 017: 18 / 1689 loss=4.081, nll_loss=2.454, ppl=5.48, wps=451158, ups=1.04, wpb=432358, bsz=16604.9, num_updates=27000, lr=0.0003849, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=26601 epoch 017: 18 / 1689 loss=4.081, nll_loss=2.454, ppl=5.48, wps=451158, ups=1.04, wpb=432358, bsz=16604.9, num_updates=27000, lr=0.0003849, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=26601 epoch 017: 18 / 1689 loss=4.081, nll_loss=2.454, ppl=5.48, wps=451158, ups=1.04, wpb=432358, bsz=16604.9, num_updates=27000, lr=0.0003849, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=26601 epoch 017: 18 / 1689 loss=4.081, nll_loss=2.454, ppl=5.48, wps=451158, ups=1.04, wpb=432358, bsz=16604.9, num_updates=27000, lr=0.0003849, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=26601 epoch 017: 18 / 1689 loss=4.081, nll_loss=2.454, ppl=5.48, wps=451158, ups=1.04, wpb=432358, bsz=16604.9, num_updates=27000, lr=0.0003849, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=26601 epoch 017: 18 / 1689 loss=4.081, nll_loss=2.454, ppl=5.48, wps=451158, ups=1.04, wpb=432358, bsz=16604.9, num_updates=27000, lr=0.0003849, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=26601 epoch 017: 18 / 1689 loss=4.081, nll_loss=2.454, ppl=5.48, wps=451158, ups=1.04, wpb=432358, bsz=16604.9, num_updates=27000, lr=0.0003849, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=26601 epoch 017: 18 / 1689 loss=4.081, nll_loss=2.454, ppl=5.48, wps=451158, ups=1.04, wpb=432358, bsz=16604.9, num_updates=27000, lr=0.0003849, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=26601 begin validation on "valid" subset epoch 017 | valid on 'valid' subset | loss 4.262 | nll_loss 2.637 | ppl 6.22 | wps 0 | wpb 42662 | bsz 2032 | num_updates 27000 | best_loss 4.262 epoch 017 | valid on 'valid' subset | loss 4.262 | nll_loss 2.637 | ppl 6.22 | wps 0 | wpb 42662 | bsz 2032 | num_updates 27000 | best_loss 4.262 epoch 017 | valid on 'valid' subset | loss 4.262 | nll_loss 2.637 | ppl 6.22 | wps 0 | wpb 42662 | bsz 2032 | num_updates 27000 | best_loss 4.262 epoch 017 | valid on 'valid' subset | loss 4.262 | nll_loss 2.637 | ppl 6.22 | wps 0 | wpb 42662 | bsz 2032 | num_updates 27000 | best_loss 4.262 epoch 017 | valid on 'valid' subset | loss 4.262 | nll_loss 2.637 | ppl 6.22 | wps 0 | wpb 42662 | bsz 2032 | num_updates 27000 | best_loss 4.262 epoch 017 | valid on 'valid' subset | loss 4.262 | nll_loss 2.637 | ppl 6.22 | wps 0 | wpb 42662 | bsz 2032 | num_updates 27000 | best_loss 4.262 epoch 017 | valid on 'valid' subset | loss 4.262 | nll_loss 2.637 | ppl 6.22 | wps 0 | wpb 42662 | bsz 2032 | num_updates 27000 | best_loss 4.262 epoch 017 | valid on 'valid' subset | loss 4.262 | nll_loss 2.637 | ppl 6.22 | wps 0 | wpb 42662 | bsz 2032 | num_updates 27000 | best_loss 4.262 epoch 017 | valid on 'valid' subset | loss 4.262 | nll_loss 2.637 | ppl 6.22 | wps 0 | wpb 42662 | bsz 2032 | num_updates 27000 | best_loss 4.262 epoch 017 | valid on 'valid' subset | loss 4.262 | nll_loss 2.637 | ppl 6.22 | wps 0 | wpb 42662 | bsz 2032 | num_updates 27000 | best_loss 4.262 epoch 017 | valid on 'valid' subset | loss 4.262 | nll_loss 2.637 | ppl 6.22 | wps 0 | wpb 42662 | bsz 2032 | num_updates 27000 | best_loss 4.262 epoch 017 | valid on 'valid' subset | loss 4.262 | nll_loss 2.637 | ppl 6.22 | wps 0 | wpb 42662 | bsz 2032 | num_updates 27000 | best_loss 4.262 epoch 017 | valid on 'valid' subset | loss 4.262 | nll_loss 2.637 | ppl 6.22 | wps 0 | wpb 42662 | bsz 2032 | num_updates 27000 | best_loss 4.262 epoch 017 | valid on 'valid' subset | loss 4.262 | nll_loss 2.637 | ppl 6.22 | wps 0 | wpb 42662 | bsz 2032 | num_updates 27000 | best_loss 4.262 epoch 017 | valid on 'valid' subset | loss 4.262 | nll_loss 2.637 | ppl 6.22 | wps 0 | wpb 42662 | bsz 2032 | num_updates 27000 | best_loss 4.262 epoch 017 | valid on 'valid' subset | loss 4.262 | nll_loss 2.637 | ppl 6.22 | wps 0 | wpb 42662 | bsz 2032 | num_updates 27000 | best_loss 4.262 epoch 017 | valid on 'valid' subset | loss 4.262 | nll_loss 2.637 | ppl 6.22 | wps 0 | wpb 42662 | bsz 2032 | num_updates 27000 | best_loss 4.262 epoch 017: 118 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=382825, ups=0.88, wpb=436382, bsz=16620.5, num_updates=27100, lr=0.000384189, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=26715 epoch 017: 118 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=382825, ups=0.88, wpb=436382, bsz=16620.5, num_updates=27100, lr=0.000384189, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=26715 epoch 017: 118 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=382825, ups=0.88, wpb=436382, bsz=16620.5, num_updates=27100, lr=0.000384189, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=26715 epoch 017: 118 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=382825, ups=0.88, wpb=436382, bsz=16620.5, num_updates=27100, lr=0.000384189, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=26715 epoch 017: 118 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=382825, ups=0.88, wpb=436382, bsz=16620.5, num_updates=27100, lr=0.000384189, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=26715 epoch 017: 118 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=382825, ups=0.88, wpb=436382, bsz=16620.5, num_updates=27100, lr=0.000384189, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=26715 epoch 017: 118 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=382825, ups=0.88, wpb=436382, bsz=16620.5, num_updates=27100, lr=0.000384189, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=26715 epoch 017: 118 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=382825, ups=0.88, wpb=436382, bsz=16620.5, num_updates=27100, lr=0.000384189, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=26715 epoch 017: 118 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=382825, ups=0.88, wpb=436382, bsz=16620.5, num_updates=27100, lr=0.000384189, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=26715 epoch 017: 118 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=382825, ups=0.88, wpb=436382, bsz=16620.5, num_updates=27100, lr=0.000384189, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=26715 epoch 017: 118 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=382825, ups=0.88, wpb=436382, bsz=16620.5, num_updates=27100, lr=0.000384189, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=26715 epoch 017: 118 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=382825, ups=0.88, wpb=436382, bsz=16620.5, num_updates=27100, lr=0.000384189, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=26715 epoch 017: 118 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=382825, ups=0.88, wpb=436382, bsz=16620.5, num_updates=27100, lr=0.000384189, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=26715 epoch 017: 118 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=382825, ups=0.88, wpb=436382, bsz=16620.5, num_updates=27100, lr=0.000384189, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=26715 epoch 017: 118 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=382825, ups=0.88, wpb=436382, bsz=16620.5, num_updates=27100, lr=0.000384189, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=26715 epoch 017: 118 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=382825, ups=0.88, wpb=436382, bsz=16620.5, num_updates=27100, lr=0.000384189, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=26715 epoch 017: 118 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=382825, ups=0.88, wpb=436382, bsz=16620.5, num_updates=27100, lr=0.000384189, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=26715 epoch 017: 218 / 1689 loss=4.031, nll_loss=2.396, ppl=5.26, wps=460400, ups=1.06, wpb=432979, bsz=16446.4, num_updates=27200, lr=0.000383482, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=20, wall=26809 epoch 017: 218 / 1689 loss=4.031, nll_loss=2.396, ppl=5.26, wps=460400, ups=1.06, wpb=432979, bsz=16446.4, num_updates=27200, lr=0.000383482, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=20, wall=26809 epoch 017: 218 / 1689 loss=4.031, nll_loss=2.396, ppl=5.26, wps=460400, ups=1.06, wpb=432979, bsz=16446.4, num_updates=27200, lr=0.000383482, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=20, wall=26809 epoch 017: 218 / 1689 loss=4.031, nll_loss=2.396, ppl=5.26, wps=460400, ups=1.06, wpb=432979, bsz=16446.4, num_updates=27200, lr=0.000383482, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=20, wall=26809 epoch 017: 218 / 1689 loss=4.031, nll_loss=2.396, ppl=5.26, wps=460400, ups=1.06, wpb=432979, bsz=16446.4, num_updates=27200, lr=0.000383482, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=20, wall=26809 epoch 017: 218 / 1689 loss=4.031, nll_loss=2.396, ppl=5.26, wps=460400, ups=1.06, wpb=432979, bsz=16446.4, num_updates=27200, lr=0.000383482, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=20, wall=26809 epoch 017: 218 / 1689 loss=4.031, nll_loss=2.396, ppl=5.26, wps=460400, ups=1.06, wpb=432979, bsz=16446.4, num_updates=27200, lr=0.000383482, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=20, wall=26809 epoch 017: 218 / 1689 loss=4.031, nll_loss=2.396, ppl=5.26, wps=460400, ups=1.06, wpb=432979, bsz=16446.4, num_updates=27200, lr=0.000383482, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=20, wall=26809 epoch 017: 218 / 1689 loss=4.031, nll_loss=2.396, ppl=5.26, wps=460400, ups=1.06, wpb=432979, bsz=16446.4, num_updates=27200, lr=0.000383482, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=20, wall=26809 epoch 017: 218 / 1689 loss=4.031, nll_loss=2.396, ppl=5.26, wps=460400, ups=1.06, wpb=432979, bsz=16446.4, num_updates=27200, lr=0.000383482, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=20, wall=26809 epoch 017: 218 / 1689 loss=4.031, nll_loss=2.396, ppl=5.26, wps=460400, ups=1.06, wpb=432979, bsz=16446.4, num_updates=27200, lr=0.000383482, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=20, wall=26809 epoch 017: 218 / 1689 loss=4.031, nll_loss=2.396, ppl=5.26, wps=460400, ups=1.06, wpb=432979, bsz=16446.4, num_updates=27200, lr=0.000383482, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=20, wall=26809 epoch 017: 218 / 1689 loss=4.031, nll_loss=2.396, ppl=5.26, wps=460400, ups=1.06, wpb=432979, bsz=16446.4, num_updates=27200, lr=0.000383482, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=20, wall=26809 epoch 017: 218 / 1689 loss=4.031, nll_loss=2.396, ppl=5.26, wps=460400, ups=1.06, wpb=432979, bsz=16446.4, num_updates=27200, lr=0.000383482, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=20, wall=26809 epoch 017: 218 / 1689 loss=4.031, nll_loss=2.396, ppl=5.26, wps=460400, ups=1.06, wpb=432979, bsz=16446.4, num_updates=27200, lr=0.000383482, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=20, wall=26809 epoch 017: 218 / 1689 loss=4.031, nll_loss=2.396, ppl=5.26, wps=460400, ups=1.06, wpb=432979, bsz=16446.4, num_updates=27200, lr=0.000383482, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=20, wall=26809 epoch 017: 218 / 1689 loss=4.031, nll_loss=2.396, ppl=5.26, wps=460400, ups=1.06, wpb=432979, bsz=16446.4, num_updates=27200, lr=0.000383482, gnorm=0.227, clip=0, loss_scale=2, train_wall=93, gb_free=20, wall=26809 epoch 017: 319 / 1689 loss=4.036, nll_loss=2.402, ppl=5.28, wps=455335, ups=1.05, wpb=431976, bsz=16539.5, num_updates=27300, lr=0.00038278, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=26904 epoch 017: 319 / 1689 loss=4.036, nll_loss=2.402, ppl=5.28, wps=455335, ups=1.05, wpb=431976, bsz=16539.5, num_updates=27300, lr=0.00038278, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=26904 epoch 017: 319 / 1689 loss=4.036, nll_loss=2.402, ppl=5.28, wps=455335, ups=1.05, wpb=431976, bsz=16539.5, num_updates=27300, lr=0.00038278, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=26904 epoch 017: 319 / 1689 loss=4.036, nll_loss=2.402, ppl=5.28, wps=455335, ups=1.05, wpb=431976, bsz=16539.5, num_updates=27300, lr=0.00038278, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=26904 epoch 017: 319 / 1689 loss=4.036, nll_loss=2.402, ppl=5.28, wps=455335, ups=1.05, wpb=431976, bsz=16539.5, num_updates=27300, lr=0.00038278, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=26904 epoch 017: 319 / 1689 loss=4.036, nll_loss=2.402, ppl=5.28, wps=455335, ups=1.05, wpb=431976, bsz=16539.5, num_updates=27300, lr=0.00038278, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=26904 epoch 017: 319 / 1689 loss=4.036, nll_loss=2.402, ppl=5.28, wps=455335, ups=1.05, wpb=431976, bsz=16539.5, num_updates=27300, lr=0.00038278, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=26904 epoch 017: 319 / 1689 loss=4.036, nll_loss=2.402, ppl=5.28, wps=455335, ups=1.05, wpb=431976, bsz=16539.5, num_updates=27300, lr=0.00038278, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=26904 epoch 017: 319 / 1689 loss=4.036, nll_loss=2.402, ppl=5.28, wps=455335, ups=1.05, wpb=431976, bsz=16539.5, num_updates=27300, lr=0.00038278, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=26904 epoch 017: 319 / 1689 loss=4.036, nll_loss=2.402, ppl=5.28, wps=455335, ups=1.05, wpb=431976, bsz=16539.5, num_updates=27300, lr=0.00038278, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=26904 epoch 017: 319 / 1689 loss=4.036, nll_loss=2.402, ppl=5.28, wps=455335, ups=1.05, wpb=431976, bsz=16539.5, num_updates=27300, lr=0.00038278, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=26904 epoch 017: 319 / 1689 loss=4.036, nll_loss=2.402, ppl=5.28, wps=455335, ups=1.05, wpb=431976, bsz=16539.5, num_updates=27300, lr=0.00038278, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=26904 epoch 017: 319 / 1689 loss=4.036, nll_loss=2.402, ppl=5.28, wps=455335, ups=1.05, wpb=431976, bsz=16539.5, num_updates=27300, lr=0.00038278, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=26904 epoch 017: 319 / 1689 loss=4.036, nll_loss=2.402, ppl=5.28, wps=455335, ups=1.05, wpb=431976, bsz=16539.5, num_updates=27300, lr=0.00038278, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=26904 epoch 017: 319 / 1689 loss=4.036, nll_loss=2.402, ppl=5.28, wps=455335, ups=1.05, wpb=431976, bsz=16539.5, num_updates=27300, lr=0.00038278, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=26904 epoch 017: 319 / 1689 loss=4.036, nll_loss=2.402, ppl=5.28, wps=455335, ups=1.05, wpb=431976, bsz=16539.5, num_updates=27300, lr=0.00038278, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=26904 epoch 017: 319 / 1689 loss=4.036, nll_loss=2.402, ppl=5.28, wps=455335, ups=1.05, wpb=431976, bsz=16539.5, num_updates=27300, lr=0.00038278, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=20, wall=26904 epoch 017: 419 / 1689 loss=4.05, nll_loss=2.418, ppl=5.35, wps=460924, ups=1.06, wpb=434999, bsz=16543.1, num_updates=27400, lr=0.00038208, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=26999 epoch 017: 419 / 1689 loss=4.05, nll_loss=2.418, ppl=5.35, wps=460924, ups=1.06, wpb=434999, bsz=16543.1, num_updates=27400, lr=0.00038208, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=26999 epoch 017: 419 / 1689 loss=4.05, nll_loss=2.418, ppl=5.35, wps=460924, ups=1.06, wpb=434999, bsz=16543.1, num_updates=27400, lr=0.00038208, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=26999 epoch 017: 419 / 1689 loss=4.05, nll_loss=2.418, ppl=5.35, wps=460924, ups=1.06, wpb=434999, bsz=16543.1, num_updates=27400, lr=0.00038208, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=26999 epoch 017: 419 / 1689 loss=4.05, nll_loss=2.418, ppl=5.35, wps=460924, ups=1.06, wpb=434999, bsz=16543.1, num_updates=27400, lr=0.00038208, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=26999 epoch 017: 419 / 1689 loss=4.05, nll_loss=2.418, ppl=5.35, wps=460924, ups=1.06, wpb=434999, bsz=16543.1, num_updates=27400, lr=0.00038208, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=26999 epoch 017: 419 / 1689 loss=4.05, nll_loss=2.418, ppl=5.35, wps=460924, ups=1.06, wpb=434999, bsz=16543.1, num_updates=27400, lr=0.00038208, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=26999 epoch 017: 419 / 1689 loss=4.05, nll_loss=2.418, ppl=5.35, wps=460924, ups=1.06, wpb=434999, bsz=16543.1, num_updates=27400, lr=0.00038208, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=26999 epoch 017: 419 / 1689 loss=4.05, nll_loss=2.418, ppl=5.35, wps=460924, ups=1.06, wpb=434999, bsz=16543.1, num_updates=27400, lr=0.00038208, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=26999 epoch 017: 419 / 1689 loss=4.05, nll_loss=2.418, ppl=5.35, wps=460924, ups=1.06, wpb=434999, bsz=16543.1, num_updates=27400, lr=0.00038208, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=26999 epoch 017: 419 / 1689 loss=4.05, nll_loss=2.418, ppl=5.35, wps=460924, ups=1.06, wpb=434999, bsz=16543.1, num_updates=27400, lr=0.00038208, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=26999 epoch 017: 419 / 1689 loss=4.05, nll_loss=2.418, ppl=5.35, wps=460924, ups=1.06, wpb=434999, bsz=16543.1, num_updates=27400, lr=0.00038208, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=26999 epoch 017: 419 / 1689 loss=4.05, nll_loss=2.418, ppl=5.35, wps=460924, ups=1.06, wpb=434999, bsz=16543.1, num_updates=27400, lr=0.00038208, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=26999 epoch 017: 419 / 1689 loss=4.05, nll_loss=2.418, ppl=5.35, wps=460924, ups=1.06, wpb=434999, bsz=16543.1, num_updates=27400, lr=0.00038208, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=26999 epoch 017: 419 / 1689 loss=4.05, nll_loss=2.418, ppl=5.35, wps=460924, ups=1.06, wpb=434999, bsz=16543.1, num_updates=27400, lr=0.00038208, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=26999 epoch 017: 419 / 1689 loss=4.05, nll_loss=2.418, ppl=5.35, wps=460924, ups=1.06, wpb=434999, bsz=16543.1, num_updates=27400, lr=0.00038208, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=26999 epoch 017: 419 / 1689 loss=4.05, nll_loss=2.418, ppl=5.35, wps=460924, ups=1.06, wpb=434999, bsz=16543.1, num_updates=27400, lr=0.00038208, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=26999 epoch 017: 519 / 1689 loss=4.046, nll_loss=2.414, ppl=5.33, wps=462415, ups=1.06, wpb=435091, bsz=16369.8, num_updates=27500, lr=0.000381385, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=27093 epoch 017: 519 / 1689 loss=4.046, nll_loss=2.414, ppl=5.33, wps=462415, ups=1.06, wpb=435091, bsz=16369.8, num_updates=27500, lr=0.000381385, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=27093 epoch 017: 519 / 1689 loss=4.046, nll_loss=2.414, ppl=5.33, wps=462415, ups=1.06, wpb=435091, bsz=16369.8, num_updates=27500, lr=0.000381385, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=27093 epoch 017: 519 / 1689 loss=4.046, nll_loss=2.414, ppl=5.33, wps=462415, ups=1.06, wpb=435091, bsz=16369.8, num_updates=27500, lr=0.000381385, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=27093 epoch 017: 519 / 1689 loss=4.046, nll_loss=2.414, ppl=5.33, wps=462415, ups=1.06, wpb=435091, bsz=16369.8, num_updates=27500, lr=0.000381385, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=27093 epoch 017: 519 / 1689 loss=4.046, nll_loss=2.414, ppl=5.33, wps=462415, ups=1.06, wpb=435091, bsz=16369.8, num_updates=27500, lr=0.000381385, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=27093 epoch 017: 519 / 1689 loss=4.046, nll_loss=2.414, ppl=5.33, wps=462415, ups=1.06, wpb=435091, bsz=16369.8, num_updates=27500, lr=0.000381385, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=27093 epoch 017: 519 / 1689 loss=4.046, nll_loss=2.414, ppl=5.33, wps=462415, ups=1.06, wpb=435091, bsz=16369.8, num_updates=27500, lr=0.000381385, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=27093 epoch 017: 519 / 1689 loss=4.046, nll_loss=2.414, ppl=5.33, wps=462415, ups=1.06, wpb=435091, bsz=16369.8, num_updates=27500, lr=0.000381385, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=27093 epoch 017: 519 / 1689 loss=4.046, nll_loss=2.414, ppl=5.33, wps=462415, ups=1.06, wpb=435091, bsz=16369.8, num_updates=27500, lr=0.000381385, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=27093 epoch 017: 519 / 1689 loss=4.046, nll_loss=2.414, ppl=5.33, wps=462415, ups=1.06, wpb=435091, bsz=16369.8, num_updates=27500, lr=0.000381385, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=27093 epoch 017: 519 / 1689 loss=4.046, nll_loss=2.414, ppl=5.33, wps=462415, ups=1.06, wpb=435091, bsz=16369.8, num_updates=27500, lr=0.000381385, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=27093 epoch 017: 519 / 1689 loss=4.046, nll_loss=2.414, ppl=5.33, wps=462415, ups=1.06, wpb=435091, bsz=16369.8, num_updates=27500, lr=0.000381385, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=27093 epoch 017: 519 / 1689 loss=4.046, nll_loss=2.414, ppl=5.33, wps=462415, ups=1.06, wpb=435091, bsz=16369.8, num_updates=27500, lr=0.000381385, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=27093 epoch 017: 519 / 1689 loss=4.046, nll_loss=2.414, ppl=5.33, wps=462415, ups=1.06, wpb=435091, bsz=16369.8, num_updates=27500, lr=0.000381385, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=27093 epoch 017: 519 / 1689 loss=4.046, nll_loss=2.414, ppl=5.33, wps=462415, ups=1.06, wpb=435091, bsz=16369.8, num_updates=27500, lr=0.000381385, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=27093 epoch 017: 519 / 1689 loss=4.046, nll_loss=2.414, ppl=5.33, wps=462415, ups=1.06, wpb=435091, bsz=16369.8, num_updates=27500, lr=0.000381385, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=27093 epoch 017: 619 / 1689 loss=4.045, nll_loss=2.413, ppl=5.32, wps=459463, ups=1.06, wpb=432496, bsz=16180.1, num_updates=27600, lr=0.000380693, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=27187 epoch 017: 619 / 1689 loss=4.045, nll_loss=2.413, ppl=5.32, wps=459463, ups=1.06, wpb=432496, bsz=16180.1, num_updates=27600, lr=0.000380693, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=27187 epoch 017: 619 / 1689 loss=4.045, nll_loss=2.413, ppl=5.32, wps=459463, ups=1.06, wpb=432496, bsz=16180.1, num_updates=27600, lr=0.000380693, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=27187 epoch 017: 619 / 1689 loss=4.045, nll_loss=2.413, ppl=5.32, wps=459463, ups=1.06, wpb=432496, bsz=16180.1, num_updates=27600, lr=0.000380693, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=27187 epoch 017: 619 / 1689 loss=4.045, nll_loss=2.413, ppl=5.32, wps=459463, ups=1.06, wpb=432496, bsz=16180.1, num_updates=27600, lr=0.000380693, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=27187 epoch 017: 619 / 1689 loss=4.045, nll_loss=2.413, ppl=5.32, wps=459463, ups=1.06, wpb=432496, bsz=16180.1, num_updates=27600, lr=0.000380693, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=27187 epoch 017: 619 / 1689 loss=4.045, nll_loss=2.413, ppl=5.32, wps=459463, ups=1.06, wpb=432496, bsz=16180.1, num_updates=27600, lr=0.000380693, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=27187 epoch 017: 619 / 1689 loss=4.045, nll_loss=2.413, ppl=5.32, wps=459463, ups=1.06, wpb=432496, bsz=16180.1, num_updates=27600, lr=0.000380693, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=27187 epoch 017: 619 / 1689 loss=4.045, nll_loss=2.413, ppl=5.32, wps=459463, ups=1.06, wpb=432496, bsz=16180.1, num_updates=27600, lr=0.000380693, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=27187 epoch 017: 619 / 1689 loss=4.045, nll_loss=2.413, ppl=5.32, wps=459463, ups=1.06, wpb=432496, bsz=16180.1, num_updates=27600, lr=0.000380693, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=27187 epoch 017: 619 / 1689 loss=4.045, nll_loss=2.413, ppl=5.32, wps=459463, ups=1.06, wpb=432496, bsz=16180.1, num_updates=27600, lr=0.000380693, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=27187 epoch 017: 619 / 1689 loss=4.045, nll_loss=2.413, ppl=5.32, wps=459463, ups=1.06, wpb=432496, bsz=16180.1, num_updates=27600, lr=0.000380693, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=27187 epoch 017: 619 / 1689 loss=4.045, nll_loss=2.413, ppl=5.32, wps=459463, ups=1.06, wpb=432496, bsz=16180.1, num_updates=27600, lr=0.000380693, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=27187 epoch 017: 619 / 1689 loss=4.045, nll_loss=2.413, ppl=5.32, wps=459463, ups=1.06, wpb=432496, bsz=16180.1, num_updates=27600, lr=0.000380693, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=27187 epoch 017: 619 / 1689 loss=4.045, nll_loss=2.413, ppl=5.32, wps=459463, ups=1.06, wpb=432496, bsz=16180.1, num_updates=27600, lr=0.000380693, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=27187 epoch 017: 619 / 1689 loss=4.045, nll_loss=2.413, ppl=5.32, wps=459463, ups=1.06, wpb=432496, bsz=16180.1, num_updates=27600, lr=0.000380693, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=27187 epoch 017: 619 / 1689 loss=4.045, nll_loss=2.413, ppl=5.32, wps=459463, ups=1.06, wpb=432496, bsz=16180.1, num_updates=27600, lr=0.000380693, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=27187 epoch 017: 719 / 1689 loss=4.046, nll_loss=2.413, ppl=5.33, wps=457189, ups=1.06, wpb=432565, bsz=16407.8, num_updates=27700, lr=0.000380006, gnorm=0.207, clip=0, loss_scale=1, train_wall=94, gb_free=19.7, wall=27281 epoch 017: 719 / 1689 loss=4.046, nll_loss=2.413, ppl=5.33, wps=457189, ups=1.06, wpb=432565, bsz=16407.8, num_updates=27700, lr=0.000380006, gnorm=0.207, clip=0, loss_scale=1, train_wall=94, gb_free=19.7, wall=27281 epoch 017: 719 / 1689 loss=4.046, nll_loss=2.413, ppl=5.33, wps=457189, ups=1.06, wpb=432565, bsz=16407.8, num_updates=27700, lr=0.000380006, gnorm=0.207, clip=0, loss_scale=1, train_wall=94, gb_free=19.7, wall=27281 epoch 017: 719 / 1689 loss=4.046, nll_loss=2.413, ppl=5.33, wps=457189, ups=1.06, wpb=432565, bsz=16407.8, num_updates=27700, lr=0.000380006, gnorm=0.207, clip=0, loss_scale=1, train_wall=94, gb_free=19.7, wall=27281 epoch 017: 719 / 1689 loss=4.046, nll_loss=2.413, ppl=5.33, wps=457189, ups=1.06, wpb=432565, bsz=16407.8, num_updates=27700, lr=0.000380006, gnorm=0.207, clip=0, loss_scale=1, train_wall=94, gb_free=19.7, wall=27281 epoch 017: 719 / 1689 loss=4.046, nll_loss=2.413, ppl=5.33, wps=457189, ups=1.06, wpb=432565, bsz=16407.8, num_updates=27700, lr=0.000380006, gnorm=0.207, clip=0, loss_scale=1, train_wall=94, gb_free=19.7, wall=27281 epoch 017: 719 / 1689 loss=4.046, nll_loss=2.413, ppl=5.33, wps=457189, ups=1.06, wpb=432565, bsz=16407.8, num_updates=27700, lr=0.000380006, gnorm=0.207, clip=0, loss_scale=1, train_wall=94, gb_free=19.7, wall=27281 epoch 017: 719 / 1689 loss=4.046, nll_loss=2.413, ppl=5.33, wps=457189, ups=1.06, wpb=432565, bsz=16407.8, num_updates=27700, lr=0.000380006, gnorm=0.207, clip=0, loss_scale=1, train_wall=94, gb_free=19.7, wall=27281 epoch 017: 719 / 1689 loss=4.046, nll_loss=2.413, ppl=5.33, wps=457189, ups=1.06, wpb=432565, bsz=16407.8, num_updates=27700, lr=0.000380006, gnorm=0.207, clip=0, loss_scale=1, train_wall=94, gb_free=19.7, wall=27281 epoch 017: 719 / 1689 loss=4.046, nll_loss=2.413, ppl=5.33, wps=457189, ups=1.06, wpb=432565, bsz=16407.8, num_updates=27700, lr=0.000380006, gnorm=0.207, clip=0, loss_scale=1, train_wall=94, gb_free=19.7, wall=27281 epoch 017: 719 / 1689 loss=4.046, nll_loss=2.413, ppl=5.33, wps=457189, ups=1.06, wpb=432565, bsz=16407.8, num_updates=27700, lr=0.000380006, gnorm=0.207, clip=0, loss_scale=1, train_wall=94, gb_free=19.7, wall=27281 epoch 017: 719 / 1689 loss=4.046, nll_loss=2.413, ppl=5.33, wps=457189, ups=1.06, wpb=432565, bsz=16407.8, num_updates=27700, lr=0.000380006, gnorm=0.207, clip=0, loss_scale=1, train_wall=94, gb_free=19.7, wall=27281 epoch 017: 719 / 1689 loss=4.046, nll_loss=2.413, ppl=5.33, wps=457189, ups=1.06, wpb=432565, bsz=16407.8, num_updates=27700, lr=0.000380006, gnorm=0.207, clip=0, loss_scale=1, train_wall=94, gb_free=19.7, wall=27281 epoch 017: 719 / 1689 loss=4.046, nll_loss=2.413, ppl=5.33, wps=457189, ups=1.06, wpb=432565, bsz=16407.8, num_updates=27700, lr=0.000380006, gnorm=0.207, clip=0, loss_scale=1, train_wall=94, gb_free=19.7, wall=27281 epoch 017: 719 / 1689 loss=4.046, nll_loss=2.413, ppl=5.33, wps=457189, ups=1.06, wpb=432565, bsz=16407.8, num_updates=27700, lr=0.000380006, gnorm=0.207, clip=0, loss_scale=1, train_wall=94, gb_free=19.7, wall=27281 epoch 017: 719 / 1689 loss=4.046, nll_loss=2.413, ppl=5.33, wps=457189, ups=1.06, wpb=432565, bsz=16407.8, num_updates=27700, lr=0.000380006, gnorm=0.207, clip=0, loss_scale=1, train_wall=94, gb_free=19.7, wall=27281 epoch 017: 719 / 1689 loss=4.046, nll_loss=2.413, ppl=5.33, wps=457189, ups=1.06, wpb=432565, bsz=16407.8, num_updates=27700, lr=0.000380006, gnorm=0.207, clip=0, loss_scale=1, train_wall=94, gb_free=19.7, wall=27281 epoch 017: 819 / 1689 loss=4.051, nll_loss=2.42, ppl=5.35, wps=461047, ups=1.06, wpb=433233, bsz=17147.7, num_updates=27800, lr=0.000379322, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=27375 epoch 017: 819 / 1689 loss=4.051, nll_loss=2.42, ppl=5.35, wps=461047, ups=1.06, wpb=433233, bsz=17147.7, num_updates=27800, lr=0.000379322, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=27375 epoch 017: 819 / 1689 loss=4.051, nll_loss=2.42, ppl=5.35, wps=461047, ups=1.06, wpb=433233, bsz=17147.7, num_updates=27800, lr=0.000379322, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=27375 epoch 017: 819 / 1689 loss=4.051, nll_loss=2.42, ppl=5.35, wps=461047, ups=1.06, wpb=433233, bsz=17147.7, num_updates=27800, lr=0.000379322, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=27375 epoch 017: 819 / 1689 loss=4.051, nll_loss=2.42, ppl=5.35, wps=461047, ups=1.06, wpb=433233, bsz=17147.7, num_updates=27800, lr=0.000379322, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=27375 epoch 017: 819 / 1689 loss=4.051, nll_loss=2.42, ppl=5.35, wps=461047, ups=1.06, wpb=433233, bsz=17147.7, num_updates=27800, lr=0.000379322, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=27375 epoch 017: 819 / 1689 loss=4.051, nll_loss=2.42, ppl=5.35, wps=461047, ups=1.06, wpb=433233, bsz=17147.7, num_updates=27800, lr=0.000379322, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=27375 epoch 017: 819 / 1689 loss=4.051, nll_loss=2.42, ppl=5.35, wps=461047, ups=1.06, wpb=433233, bsz=17147.7, num_updates=27800, lr=0.000379322, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=27375 epoch 017: 819 / 1689 loss=4.051, nll_loss=2.42, ppl=5.35, wps=461047, ups=1.06, wpb=433233, bsz=17147.7, num_updates=27800, lr=0.000379322, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=27375 epoch 017: 819 / 1689 loss=4.051, nll_loss=2.42, ppl=5.35, wps=461047, ups=1.06, wpb=433233, bsz=17147.7, num_updates=27800, lr=0.000379322, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=27375 epoch 017: 819 / 1689 loss=4.051, nll_loss=2.42, ppl=5.35, wps=461047, ups=1.06, wpb=433233, bsz=17147.7, num_updates=27800, lr=0.000379322, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=27375 epoch 017: 819 / 1689 loss=4.051, nll_loss=2.42, ppl=5.35, wps=461047, ups=1.06, wpb=433233, bsz=17147.7, num_updates=27800, lr=0.000379322, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=27375 epoch 017: 819 / 1689 loss=4.051, nll_loss=2.42, ppl=5.35, wps=461047, ups=1.06, wpb=433233, bsz=17147.7, num_updates=27800, lr=0.000379322, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=27375 epoch 017: 819 / 1689 loss=4.051, nll_loss=2.42, ppl=5.35, wps=461047, ups=1.06, wpb=433233, bsz=17147.7, num_updates=27800, lr=0.000379322, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=27375 epoch 017: 819 / 1689 loss=4.051, nll_loss=2.42, ppl=5.35, wps=461047, ups=1.06, wpb=433233, bsz=17147.7, num_updates=27800, lr=0.000379322, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=27375 epoch 017: 819 / 1689 loss=4.051, nll_loss=2.42, ppl=5.35, wps=461047, ups=1.06, wpb=433233, bsz=17147.7, num_updates=27800, lr=0.000379322, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=27375 epoch 017: 819 / 1689 loss=4.051, nll_loss=2.42, ppl=5.35, wps=461047, ups=1.06, wpb=433233, bsz=17147.7, num_updates=27800, lr=0.000379322, gnorm=0.207, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=27375 epoch 017: 920 / 1689 loss=4.068, nll_loss=2.439, ppl=5.42, wps=454367, ups=1.05, wpb=434324, bsz=16714.4, num_updates=27900, lr=0.000378641, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.8, wall=27471 epoch 017: 920 / 1689 loss=4.068, nll_loss=2.439, ppl=5.42, wps=454367, ups=1.05, wpb=434324, bsz=16714.4, num_updates=27900, lr=0.000378641, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.8, wall=27471 epoch 017: 920 / 1689 loss=4.068, nll_loss=2.439, ppl=5.42, wps=454367, ups=1.05, wpb=434324, bsz=16714.4, num_updates=27900, lr=0.000378641, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.8, wall=27471 epoch 017: 920 / 1689 loss=4.068, nll_loss=2.439, ppl=5.42, wps=454367, ups=1.05, wpb=434324, bsz=16714.4, num_updates=27900, lr=0.000378641, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.8, wall=27471 epoch 017: 920 / 1689 loss=4.068, nll_loss=2.439, ppl=5.42, wps=454367, ups=1.05, wpb=434324, bsz=16714.4, num_updates=27900, lr=0.000378641, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.8, wall=27471 epoch 017: 920 / 1689 loss=4.068, nll_loss=2.439, ppl=5.42, wps=454367, ups=1.05, wpb=434324, bsz=16714.4, num_updates=27900, lr=0.000378641, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.8, wall=27471 epoch 017: 920 / 1689 loss=4.068, nll_loss=2.439, ppl=5.42, wps=454367, ups=1.05, wpb=434324, bsz=16714.4, num_updates=27900, lr=0.000378641, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.8, wall=27471 epoch 017: 920 / 1689 loss=4.068, nll_loss=2.439, ppl=5.42, wps=454367, ups=1.05, wpb=434324, bsz=16714.4, num_updates=27900, lr=0.000378641, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.8, wall=27471 epoch 017: 920 / 1689 loss=4.068, nll_loss=2.439, ppl=5.42, wps=454367, ups=1.05, wpb=434324, bsz=16714.4, num_updates=27900, lr=0.000378641, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.8, wall=27471 epoch 017: 920 / 1689 loss=4.068, nll_loss=2.439, ppl=5.42, wps=454367, ups=1.05, wpb=434324, bsz=16714.4, num_updates=27900, lr=0.000378641, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.8, wall=27471 epoch 017: 920 / 1689 loss=4.068, nll_loss=2.439, ppl=5.42, wps=454367, ups=1.05, wpb=434324, bsz=16714.4, num_updates=27900, lr=0.000378641, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.8, wall=27471 epoch 017: 920 / 1689 loss=4.068, nll_loss=2.439, ppl=5.42, wps=454367, ups=1.05, wpb=434324, bsz=16714.4, num_updates=27900, lr=0.000378641, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.8, wall=27471 epoch 017: 920 / 1689 loss=4.068, nll_loss=2.439, ppl=5.42, wps=454367, ups=1.05, wpb=434324, bsz=16714.4, num_updates=27900, lr=0.000378641, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.8, wall=27471 epoch 017: 920 / 1689 loss=4.068, nll_loss=2.439, ppl=5.42, wps=454367, ups=1.05, wpb=434324, bsz=16714.4, num_updates=27900, lr=0.000378641, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.8, wall=27471 epoch 017: 920 / 1689 loss=4.068, nll_loss=2.439, ppl=5.42, wps=454367, ups=1.05, wpb=434324, bsz=16714.4, num_updates=27900, lr=0.000378641, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.8, wall=27471 epoch 017: 920 / 1689 loss=4.068, nll_loss=2.439, ppl=5.42, wps=454367, ups=1.05, wpb=434324, bsz=16714.4, num_updates=27900, lr=0.000378641, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.8, wall=27471 epoch 017: 920 / 1689 loss=4.068, nll_loss=2.439, ppl=5.42, wps=454367, ups=1.05, wpb=434324, bsz=16714.4, num_updates=27900, lr=0.000378641, gnorm=0.212, clip=0, loss_scale=1, train_wall=94, gb_free=19.8, wall=27471 epoch 017: 1020 / 1689 loss=4.053, nll_loss=2.422, ppl=5.36, wps=458068, ups=1.06, wpb=433419, bsz=16239.5, num_updates=28000, lr=0.000377964, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=27566 epoch 017: 1020 / 1689 loss=4.053, nll_loss=2.422, ppl=5.36, wps=458068, ups=1.06, wpb=433419, bsz=16239.5, num_updates=28000, lr=0.000377964, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=27566 epoch 017: 1020 / 1689 loss=4.053, nll_loss=2.422, ppl=5.36, wps=458068, ups=1.06, wpb=433419, bsz=16239.5, num_updates=28000, lr=0.000377964, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=27566 epoch 017: 1020 / 1689 loss=4.053, nll_loss=2.422, ppl=5.36, wps=458068, ups=1.06, wpb=433419, bsz=16239.5, num_updates=28000, lr=0.000377964, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=27566 epoch 017: 1020 / 1689 loss=4.053, nll_loss=2.422, ppl=5.36, wps=458068, ups=1.06, wpb=433419, bsz=16239.5, num_updates=28000, lr=0.000377964, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=27566 epoch 017: 1020 / 1689 loss=4.053, nll_loss=2.422, ppl=5.36, wps=458068, ups=1.06, wpb=433419, bsz=16239.5, num_updates=28000, lr=0.000377964, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=27566 epoch 017: 1020 / 1689 loss=4.053, nll_loss=2.422, ppl=5.36, wps=458068, ups=1.06, wpb=433419, bsz=16239.5, num_updates=28000, lr=0.000377964, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=27566 epoch 017: 1020 / 1689 loss=4.053, nll_loss=2.422, ppl=5.36, wps=458068, ups=1.06, wpb=433419, bsz=16239.5, num_updates=28000, lr=0.000377964, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=27566 epoch 017: 1020 / 1689 loss=4.053, nll_loss=2.422, ppl=5.36, wps=458068, ups=1.06, wpb=433419, bsz=16239.5, num_updates=28000, lr=0.000377964, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=27566 epoch 017: 1020 / 1689 loss=4.053, nll_loss=2.422, ppl=5.36, wps=458068, ups=1.06, wpb=433419, bsz=16239.5, num_updates=28000, lr=0.000377964, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=27566 epoch 017: 1020 / 1689 loss=4.053, nll_loss=2.422, ppl=5.36, wps=458068, ups=1.06, wpb=433419, bsz=16239.5, num_updates=28000, lr=0.000377964, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=27566 epoch 017: 1020 / 1689 loss=4.053, nll_loss=2.422, ppl=5.36, wps=458068, ups=1.06, wpb=433419, bsz=16239.5, num_updates=28000, lr=0.000377964, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=27566 epoch 017: 1020 / 1689 loss=4.053, nll_loss=2.422, ppl=5.36, wps=458068, ups=1.06, wpb=433419, bsz=16239.5, num_updates=28000, lr=0.000377964, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=27566 epoch 017: 1020 / 1689 loss=4.053, nll_loss=2.422, ppl=5.36, wps=458068, ups=1.06, wpb=433419, bsz=16239.5, num_updates=28000, lr=0.000377964, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=27566 epoch 017: 1020 / 1689 loss=4.053, nll_loss=2.422, ppl=5.36, wps=458068, ups=1.06, wpb=433419, bsz=16239.5, num_updates=28000, lr=0.000377964, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=27566 epoch 017: 1020 / 1689 loss=4.053, nll_loss=2.422, ppl=5.36, wps=458068, ups=1.06, wpb=433419, bsz=16239.5, num_updates=28000, lr=0.000377964, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=27566 epoch 017: 1020 / 1689 loss=4.053, nll_loss=2.422, ppl=5.36, wps=458068, ups=1.06, wpb=433419, bsz=16239.5, num_updates=28000, lr=0.000377964, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=27566 begin validation on "valid" subset epoch 017 | valid on 'valid' subset | loss 4.262 | nll_loss 2.644 | ppl 6.25 | wps 0 | wpb 42662 | bsz 2032 | num_updates 28000 | best_loss 4.262 epoch 017 | valid on 'valid' subset | loss 4.262 | nll_loss 2.644 | ppl 6.25 | wps 0 | wpb 42662 | bsz 2032 | num_updates 28000 | best_loss 4.262 epoch 017 | valid on 'valid' subset | loss 4.262 | nll_loss 2.644 | ppl 6.25 | wps 0 | wpb 42662 | bsz 2032 | num_updates 28000 | best_loss 4.262 epoch 017 | valid on 'valid' subset | loss 4.262 | nll_loss 2.644 | ppl 6.25 | wps 0 | wpb 42662 | bsz 2032 | num_updates 28000 | best_loss 4.262 epoch 017 | valid on 'valid' subset | loss 4.262 | nll_loss 2.644 | ppl 6.25 | wps 0 | wpb 42662 | bsz 2032 | num_updates 28000 | best_loss 4.262 epoch 017 | valid on 'valid' subset | loss 4.262 | nll_loss 2.644 | ppl 6.25 | wps 0 | wpb 42662 | bsz 2032 | num_updates 28000 | best_loss 4.262 epoch 017 | valid on 'valid' subset | loss 4.262 | nll_loss 2.644 | ppl 6.25 | wps 0 | wpb 42662 | bsz 2032 | num_updates 28000 | best_loss 4.262 epoch 017 | valid on 'valid' subset | loss 4.262 | nll_loss 2.644 | ppl 6.25 | wps 0 | wpb 42662 | bsz 2032 | num_updates 28000 | best_loss 4.262 epoch 017 | valid on 'valid' subset | loss 4.262 | nll_loss 2.644 | ppl 6.25 | wps 0 | wpb 42662 | bsz 2032 | num_updates 28000 | best_loss 4.262 epoch 017 | valid on 'valid' subset | loss 4.262 | nll_loss 2.644 | ppl 6.25 | wps 0 | wpb 42662 | bsz 2032 | num_updates 28000 | best_loss 4.262 epoch 017 | valid on 'valid' subset | loss 4.262 | nll_loss 2.644 | ppl 6.25 | wps 0 | wpb 42662 | bsz 2032 | num_updates 28000 | best_loss 4.262 epoch 017 | valid on 'valid' subset | loss 4.262 | nll_loss 2.644 | ppl 6.25 | wps 0 | wpb 42662 | bsz 2032 | num_updates 28000 | best_loss 4.262 epoch 017 | valid on 'valid' subset | loss 4.262 | nll_loss 2.644 | ppl 6.25 | wps 0 | wpb 42662 | bsz 2032 | num_updates 28000 | best_loss 4.262 epoch 017 | valid on 'valid' subset | loss 4.262 | nll_loss 2.644 | ppl 6.25 | wps 0 | wpb 42662 | bsz 2032 | num_updates 28000 | best_loss 4.262 epoch 017 | valid on 'valid' subset | loss 4.262 | nll_loss 2.644 | ppl 6.25 | wps 0 | wpb 42662 | bsz 2032 | num_updates 28000 | best_loss 4.262 epoch 017 | valid on 'valid' subset | loss 4.262 | nll_loss 2.644 | ppl 6.25 | wps 0 | wpb 42662 | bsz 2032 | num_updates 28000 | best_loss 4.262 epoch 017 | valid on 'valid' subset | loss 4.262 | nll_loss 2.644 | ppl 6.25 | wps 0 | wpb 42662 | bsz 2032 | num_updates 28000 | best_loss 4.262 epoch 017: 1120 / 1689 loss=4.064, nll_loss=2.435, ppl=5.41, wps=318547, ups=0.73, wpb=435527, bsz=16713.7, num_updates=28100, lr=0.000377291, gnorm=0.213, clip=0, loss_scale=1, train_wall=111, gb_free=18.8, wall=27702 epoch 017: 1120 / 1689 loss=4.064, nll_loss=2.435, ppl=5.41, wps=318547, ups=0.73, wpb=435527, bsz=16713.7, num_updates=28100, lr=0.000377291, gnorm=0.213, clip=0, loss_scale=1, train_wall=111, gb_free=18.8, wall=27702 epoch 017: 1120 / 1689 loss=4.064, nll_loss=2.435, ppl=5.41, wps=318547, ups=0.73, wpb=435527, bsz=16713.7, num_updates=28100, lr=0.000377291, gnorm=0.213, clip=0, loss_scale=1, train_wall=111, gb_free=18.8, wall=27702 epoch 017: 1120 / 1689 loss=4.064, nll_loss=2.435, ppl=5.41, wps=318547, ups=0.73, wpb=435527, bsz=16713.7, num_updates=28100, lr=0.000377291, gnorm=0.213, clip=0, loss_scale=1, train_wall=111, gb_free=18.8, wall=27702 epoch 017: 1120 / 1689 loss=4.064, nll_loss=2.435, ppl=5.41, wps=318547, ups=0.73, wpb=435527, bsz=16713.7, num_updates=28100, lr=0.000377291, gnorm=0.213, clip=0, loss_scale=1, train_wall=111, gb_free=18.8, wall=27702 epoch 017: 1120 / 1689 loss=4.064, nll_loss=2.435, ppl=5.41, wps=318547, ups=0.73, wpb=435527, bsz=16713.7, num_updates=28100, lr=0.000377291, gnorm=0.213, clip=0, loss_scale=1, train_wall=111, gb_free=18.8, wall=27702 epoch 017: 1120 / 1689 loss=4.064, nll_loss=2.435, ppl=5.41, wps=318547, ups=0.73, wpb=435527, bsz=16713.7, num_updates=28100, lr=0.000377291, gnorm=0.213, clip=0, loss_scale=1, train_wall=111, gb_free=18.8, wall=27702 epoch 017: 1120 / 1689 loss=4.064, nll_loss=2.435, ppl=5.41, wps=318547, ups=0.73, wpb=435527, bsz=16713.7, num_updates=28100, lr=0.000377291, gnorm=0.213, clip=0, loss_scale=1, train_wall=111, gb_free=18.8, wall=27702 epoch 017: 1120 / 1689 loss=4.064, nll_loss=2.435, ppl=5.41, wps=318547, ups=0.73, wpb=435527, bsz=16713.7, num_updates=28100, lr=0.000377291, gnorm=0.213, clip=0, loss_scale=1, train_wall=111, gb_free=18.8, wall=27702 epoch 017: 1120 / 1689 loss=4.064, nll_loss=2.435, ppl=5.41, wps=318547, ups=0.73, wpb=435527, bsz=16713.7, num_updates=28100, lr=0.000377291, gnorm=0.213, clip=0, loss_scale=1, train_wall=111, gb_free=18.8, wall=27702 epoch 017: 1120 / 1689 loss=4.064, nll_loss=2.435, ppl=5.41, wps=318547, ups=0.73, wpb=435527, bsz=16713.7, num_updates=28100, lr=0.000377291, gnorm=0.213, clip=0, loss_scale=1, train_wall=111, gb_free=18.8, wall=27702 epoch 017: 1120 / 1689 loss=4.064, nll_loss=2.435, ppl=5.41, wps=318547, ups=0.73, wpb=435527, bsz=16713.7, num_updates=28100, lr=0.000377291, gnorm=0.213, clip=0, loss_scale=1, train_wall=111, gb_free=18.8, wall=27702 epoch 017: 1120 / 1689 loss=4.064, nll_loss=2.435, ppl=5.41, wps=318547, ups=0.73, wpb=435527, bsz=16713.7, num_updates=28100, lr=0.000377291, gnorm=0.213, clip=0, loss_scale=1, train_wall=111, gb_free=18.8, wall=27702 epoch 017: 1120 / 1689 loss=4.064, nll_loss=2.435, ppl=5.41, wps=318547, ups=0.73, wpb=435527, bsz=16713.7, num_updates=28100, lr=0.000377291, gnorm=0.213, clip=0, loss_scale=1, train_wall=111, gb_free=18.8, wall=27702 epoch 017: 1120 / 1689 loss=4.064, nll_loss=2.435, ppl=5.41, wps=318547, ups=0.73, wpb=435527, bsz=16713.7, num_updates=28100, lr=0.000377291, gnorm=0.213, clip=0, loss_scale=1, train_wall=111, gb_free=18.8, wall=27702 epoch 017: 1120 / 1689 loss=4.064, nll_loss=2.435, ppl=5.41, wps=318547, ups=0.73, wpb=435527, bsz=16713.7, num_updates=28100, lr=0.000377291, gnorm=0.213, clip=0, loss_scale=1, train_wall=111, gb_free=18.8, wall=27702 epoch 017: 1120 / 1689 loss=4.064, nll_loss=2.435, ppl=5.41, wps=318547, ups=0.73, wpb=435527, bsz=16713.7, num_updates=28100, lr=0.000377291, gnorm=0.213, clip=0, loss_scale=1, train_wall=111, gb_free=18.8, wall=27702 epoch 017: 1220 / 1689 loss=4.056, nll_loss=2.426, ppl=5.37, wps=462570, ups=1.07, wpb=433896, bsz=16273.1, num_updates=28200, lr=0.000376622, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=27796 epoch 017: 1220 / 1689 loss=4.056, nll_loss=2.426, ppl=5.37, wps=462570, ups=1.07, wpb=433896, bsz=16273.1, num_updates=28200, lr=0.000376622, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=27796 epoch 017: 1220 / 1689 loss=4.056, nll_loss=2.426, ppl=5.37, wps=462570, ups=1.07, wpb=433896, bsz=16273.1, num_updates=28200, lr=0.000376622, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=27796 epoch 017: 1220 / 1689 loss=4.056, nll_loss=2.426, ppl=5.37, wps=462570, ups=1.07, wpb=433896, bsz=16273.1, num_updates=28200, lr=0.000376622, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=27796 epoch 017: 1220 / 1689 loss=4.056, nll_loss=2.426, ppl=5.37, wps=462570, ups=1.07, wpb=433896, bsz=16273.1, num_updates=28200, lr=0.000376622, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=27796 epoch 017: 1220 / 1689 loss=4.056, nll_loss=2.426, ppl=5.37, wps=462570, ups=1.07, wpb=433896, bsz=16273.1, num_updates=28200, lr=0.000376622, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=27796 epoch 017: 1220 / 1689 loss=4.056, nll_loss=2.426, ppl=5.37, wps=462570, ups=1.07, wpb=433896, bsz=16273.1, num_updates=28200, lr=0.000376622, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=27796 epoch 017: 1220 / 1689 loss=4.056, nll_loss=2.426, ppl=5.37, wps=462570, ups=1.07, wpb=433896, bsz=16273.1, num_updates=28200, lr=0.000376622, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=27796 epoch 017: 1220 / 1689 loss=4.056, nll_loss=2.426, ppl=5.37, wps=462570, ups=1.07, wpb=433896, bsz=16273.1, num_updates=28200, lr=0.000376622, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=27796 epoch 017: 1220 / 1689 loss=4.056, nll_loss=2.426, ppl=5.37, wps=462570, ups=1.07, wpb=433896, bsz=16273.1, num_updates=28200, lr=0.000376622, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=27796 epoch 017: 1220 / 1689 loss=4.056, nll_loss=2.426, ppl=5.37, wps=462570, ups=1.07, wpb=433896, bsz=16273.1, num_updates=28200, lr=0.000376622, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=27796 epoch 017: 1220 / 1689 loss=4.056, nll_loss=2.426, ppl=5.37, wps=462570, ups=1.07, wpb=433896, bsz=16273.1, num_updates=28200, lr=0.000376622, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=27796 epoch 017: 1220 / 1689 loss=4.056, nll_loss=2.426, ppl=5.37, wps=462570, ups=1.07, wpb=433896, bsz=16273.1, num_updates=28200, lr=0.000376622, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=27796 epoch 017: 1220 / 1689 loss=4.056, nll_loss=2.426, ppl=5.37, wps=462570, ups=1.07, wpb=433896, bsz=16273.1, num_updates=28200, lr=0.000376622, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=27796 epoch 017: 1220 / 1689 loss=4.056, nll_loss=2.426, ppl=5.37, wps=462570, ups=1.07, wpb=433896, bsz=16273.1, num_updates=28200, lr=0.000376622, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=27796 epoch 017: 1220 / 1689 loss=4.056, nll_loss=2.426, ppl=5.37, wps=462570, ups=1.07, wpb=433896, bsz=16273.1, num_updates=28200, lr=0.000376622, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=27796 epoch 017: 1220 / 1689 loss=4.056, nll_loss=2.426, ppl=5.37, wps=462570, ups=1.07, wpb=433896, bsz=16273.1, num_updates=28200, lr=0.000376622, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=27796 epoch 017: 1320 / 1689 loss=4.063, nll_loss=2.433, ppl=5.4, wps=462183, ups=1.07, wpb=431844, bsz=16283.3, num_updates=28300, lr=0.000375956, gnorm=0.21, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=27890 epoch 017: 1320 / 1689 loss=4.063, nll_loss=2.433, ppl=5.4, wps=462183, ups=1.07, wpb=431844, bsz=16283.3, num_updates=28300, lr=0.000375956, gnorm=0.21, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=27890 epoch 017: 1320 / 1689 loss=4.063, nll_loss=2.433, ppl=5.4, wps=462183, ups=1.07, wpb=431844, bsz=16283.3, num_updates=28300, lr=0.000375956, gnorm=0.21, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=27890 epoch 017: 1320 / 1689 loss=4.063, nll_loss=2.433, ppl=5.4, wps=462183, ups=1.07, wpb=431844, bsz=16283.3, num_updates=28300, lr=0.000375956, gnorm=0.21, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=27890 epoch 017: 1320 / 1689 loss=4.063, nll_loss=2.433, ppl=5.4, wps=462183, ups=1.07, wpb=431844, bsz=16283.3, num_updates=28300, lr=0.000375956, gnorm=0.21, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=27890 epoch 017: 1320 / 1689 loss=4.063, nll_loss=2.433, ppl=5.4, wps=462183, ups=1.07, wpb=431844, bsz=16283.3, num_updates=28300, lr=0.000375956, gnorm=0.21, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=27890 epoch 017: 1320 / 1689 loss=4.063, nll_loss=2.433, ppl=5.4, wps=462183, ups=1.07, wpb=431844, bsz=16283.3, num_updates=28300, lr=0.000375956, gnorm=0.21, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=27890 epoch 017: 1320 / 1689 loss=4.063, nll_loss=2.433, ppl=5.4, wps=462183, ups=1.07, wpb=431844, bsz=16283.3, num_updates=28300, lr=0.000375956, gnorm=0.21, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=27890 epoch 017: 1320 / 1689 loss=4.063, nll_loss=2.433, ppl=5.4, wps=462183, ups=1.07, wpb=431844, bsz=16283.3, num_updates=28300, lr=0.000375956, gnorm=0.21, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=27890 epoch 017: 1320 / 1689 loss=4.063, nll_loss=2.433, ppl=5.4, wps=462183, ups=1.07, wpb=431844, bsz=16283.3, num_updates=28300, lr=0.000375956, gnorm=0.21, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=27890 epoch 017: 1320 / 1689 loss=4.063, nll_loss=2.433, ppl=5.4, wps=462183, ups=1.07, wpb=431844, bsz=16283.3, num_updates=28300, lr=0.000375956, gnorm=0.21, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=27890 epoch 017: 1320 / 1689 loss=4.063, nll_loss=2.433, ppl=5.4, wps=462183, ups=1.07, wpb=431844, bsz=16283.3, num_updates=28300, lr=0.000375956, gnorm=0.21, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=27890 epoch 017: 1320 / 1689 loss=4.063, nll_loss=2.433, ppl=5.4, wps=462183, ups=1.07, wpb=431844, bsz=16283.3, num_updates=28300, lr=0.000375956, gnorm=0.21, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=27890 epoch 017: 1320 / 1689 loss=4.063, nll_loss=2.433, ppl=5.4, wps=462183, ups=1.07, wpb=431844, bsz=16283.3, num_updates=28300, lr=0.000375956, gnorm=0.21, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=27890 epoch 017: 1320 / 1689 loss=4.063, nll_loss=2.433, ppl=5.4, wps=462183, ups=1.07, wpb=431844, bsz=16283.3, num_updates=28300, lr=0.000375956, gnorm=0.21, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=27890 epoch 017: 1320 / 1689 loss=4.063, nll_loss=2.433, ppl=5.4, wps=462183, ups=1.07, wpb=431844, bsz=16283.3, num_updates=28300, lr=0.000375956, gnorm=0.21, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=27890 epoch 017: 1320 / 1689 loss=4.063, nll_loss=2.433, ppl=5.4, wps=462183, ups=1.07, wpb=431844, bsz=16283.3, num_updates=28300, lr=0.000375956, gnorm=0.21, clip=0, loss_scale=1, train_wall=92, gb_free=19.2, wall=27890 epoch 017: 1420 / 1689 loss=4.052, nll_loss=2.422, ppl=5.36, wps=459050, ups=1.06, wpb=432795, bsz=16562.6, num_updates=28400, lr=0.000375293, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=27984 epoch 017: 1420 / 1689 loss=4.052, nll_loss=2.422, ppl=5.36, wps=459050, ups=1.06, wpb=432795, bsz=16562.6, num_updates=28400, lr=0.000375293, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=27984 epoch 017: 1420 / 1689 loss=4.052, nll_loss=2.422, ppl=5.36, wps=459050, ups=1.06, wpb=432795, bsz=16562.6, num_updates=28400, lr=0.000375293, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=27984 epoch 017: 1420 / 1689 loss=4.052, nll_loss=2.422, ppl=5.36, wps=459050, ups=1.06, wpb=432795, bsz=16562.6, num_updates=28400, lr=0.000375293, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=27984 epoch 017: 1420 / 1689 loss=4.052, nll_loss=2.422, ppl=5.36, wps=459050, ups=1.06, wpb=432795, bsz=16562.6, num_updates=28400, lr=0.000375293, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=27984 epoch 017: 1420 / 1689 loss=4.052, nll_loss=2.422, ppl=5.36, wps=459050, ups=1.06, wpb=432795, bsz=16562.6, num_updates=28400, lr=0.000375293, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=27984 epoch 017: 1420 / 1689 loss=4.052, nll_loss=2.422, ppl=5.36, wps=459050, ups=1.06, wpb=432795, bsz=16562.6, num_updates=28400, lr=0.000375293, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=27984 epoch 017: 1420 / 1689 loss=4.052, nll_loss=2.422, ppl=5.36, wps=459050, ups=1.06, wpb=432795, bsz=16562.6, num_updates=28400, lr=0.000375293, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=27984 epoch 017: 1420 / 1689 loss=4.052, nll_loss=2.422, ppl=5.36, wps=459050, ups=1.06, wpb=432795, bsz=16562.6, num_updates=28400, lr=0.000375293, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=27984 epoch 017: 1420 / 1689 loss=4.052, nll_loss=2.422, ppl=5.36, wps=459050, ups=1.06, wpb=432795, bsz=16562.6, num_updates=28400, lr=0.000375293, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=27984 epoch 017: 1420 / 1689 loss=4.052, nll_loss=2.422, ppl=5.36, wps=459050, ups=1.06, wpb=432795, bsz=16562.6, num_updates=28400, lr=0.000375293, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=27984 epoch 017: 1420 / 1689 loss=4.052, nll_loss=2.422, ppl=5.36, wps=459050, ups=1.06, wpb=432795, bsz=16562.6, num_updates=28400, lr=0.000375293, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=27984 epoch 017: 1420 / 1689 loss=4.052, nll_loss=2.422, ppl=5.36, wps=459050, ups=1.06, wpb=432795, bsz=16562.6, num_updates=28400, lr=0.000375293, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=27984 epoch 017: 1420 / 1689 loss=4.052, nll_loss=2.422, ppl=5.36, wps=459050, ups=1.06, wpb=432795, bsz=16562.6, num_updates=28400, lr=0.000375293, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=27984 epoch 017: 1420 / 1689 loss=4.052, nll_loss=2.422, ppl=5.36, wps=459050, ups=1.06, wpb=432795, bsz=16562.6, num_updates=28400, lr=0.000375293, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=27984 epoch 017: 1420 / 1689 loss=4.052, nll_loss=2.422, ppl=5.36, wps=459050, ups=1.06, wpb=432795, bsz=16562.6, num_updates=28400, lr=0.000375293, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=27984 epoch 017: 1420 / 1689 loss=4.052, nll_loss=2.422, ppl=5.36, wps=459050, ups=1.06, wpb=432795, bsz=16562.6, num_updates=28400, lr=0.000375293, gnorm=0.206, clip=0, loss_scale=2, train_wall=93, gb_free=18.9, wall=27984 epoch 017: 1522 / 1689 loss=4.066, nll_loss=2.437, ppl=5.41, wps=448320, ups=1.04, wpb=432106, bsz=16650.6, num_updates=28500, lr=0.000374634, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=95, gb_free=20.1, wall=28080 epoch 017: 1522 / 1689 loss=4.066, nll_loss=2.437, ppl=5.41, wps=448320, ups=1.04, wpb=432106, bsz=16650.6, num_updates=28500, lr=0.000374634, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=95, gb_free=20.1, wall=28080 epoch 017: 1522 / 1689 loss=4.066, nll_loss=2.437, ppl=5.41, wps=448320, ups=1.04, wpb=432106, bsz=16650.6, num_updates=28500, lr=0.000374634, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=95, gb_free=20.1, wall=28080 epoch 017: 1522 / 1689 loss=4.066, nll_loss=2.437, ppl=5.41, wps=448320, ups=1.04, wpb=432106, bsz=16650.6, num_updates=28500, lr=0.000374634, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=95, gb_free=20.1, wall=28080 epoch 017: 1522 / 1689 loss=4.066, nll_loss=2.437, ppl=5.41, wps=448320, ups=1.04, wpb=432106, bsz=16650.6, num_updates=28500, lr=0.000374634, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=95, gb_free=20.1, wall=28080 epoch 017: 1522 / 1689 loss=4.066, nll_loss=2.437, ppl=5.41, wps=448320, ups=1.04, wpb=432106, bsz=16650.6, num_updates=28500, lr=0.000374634, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=95, gb_free=20.1, wall=28080 epoch 017: 1522 / 1689 loss=4.066, nll_loss=2.437, ppl=5.41, wps=448320, ups=1.04, wpb=432106, bsz=16650.6, num_updates=28500, lr=0.000374634, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=95, gb_free=20.1, wall=28080 epoch 017: 1522 / 1689 loss=4.066, nll_loss=2.437, ppl=5.41, wps=448320, ups=1.04, wpb=432106, bsz=16650.6, num_updates=28500, lr=0.000374634, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=95, gb_free=20.1, wall=28080 epoch 017: 1522 / 1689 loss=4.066, nll_loss=2.437, ppl=5.41, wps=448320, ups=1.04, wpb=432106, bsz=16650.6, num_updates=28500, lr=0.000374634, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=95, gb_free=20.1, wall=28080 epoch 017: 1522 / 1689 loss=4.066, nll_loss=2.437, ppl=5.41, wps=448320, ups=1.04, wpb=432106, bsz=16650.6, num_updates=28500, lr=0.000374634, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=95, gb_free=20.1, wall=28080 epoch 017: 1522 / 1689 loss=4.066, nll_loss=2.437, ppl=5.41, wps=448320, ups=1.04, wpb=432106, bsz=16650.6, num_updates=28500, lr=0.000374634, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=95, gb_free=20.1, wall=28080 epoch 017: 1522 / 1689 loss=4.066, nll_loss=2.437, ppl=5.41, wps=448320, ups=1.04, wpb=432106, bsz=16650.6, num_updates=28500, lr=0.000374634, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=95, gb_free=20.1, wall=28080 epoch 017: 1522 / 1689 loss=4.066, nll_loss=2.437, ppl=5.41, wps=448320, ups=1.04, wpb=432106, bsz=16650.6, num_updates=28500, lr=0.000374634, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=95, gb_free=20.1, wall=28080 epoch 017: 1522 / 1689 loss=4.066, nll_loss=2.437, ppl=5.41, wps=448320, ups=1.04, wpb=432106, bsz=16650.6, num_updates=28500, lr=0.000374634, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=95, gb_free=20.1, wall=28080 epoch 017: 1522 / 1689 loss=4.066, nll_loss=2.437, ppl=5.41, wps=448320, ups=1.04, wpb=432106, bsz=16650.6, num_updates=28500, lr=0.000374634, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=95, gb_free=20.1, wall=28080 epoch 017: 1522 / 1689 loss=4.066, nll_loss=2.437, ppl=5.41, wps=448320, ups=1.04, wpb=432106, bsz=16650.6, num_updates=28500, lr=0.000374634, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=95, gb_free=20.1, wall=28080 epoch 017: 1522 / 1689 loss=4.066, nll_loss=2.437, ppl=5.41, wps=448320, ups=1.04, wpb=432106, bsz=16650.6, num_updates=28500, lr=0.000374634, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=95, gb_free=20.1, wall=28080 epoch 017: 1622 / 1689 loss=4.077, nll_loss=2.45, ppl=5.46, wps=462252, ups=1.06, wpb=435740, bsz=16711.5, num_updates=28600, lr=0.000373979, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.9, wall=28175 epoch 017: 1622 / 1689 loss=4.077, nll_loss=2.45, ppl=5.46, wps=462252, ups=1.06, wpb=435740, bsz=16711.5, num_updates=28600, lr=0.000373979, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.9, wall=28175 epoch 017: 1622 / 1689 loss=4.077, nll_loss=2.45, ppl=5.46, wps=462252, ups=1.06, wpb=435740, bsz=16711.5, num_updates=28600, lr=0.000373979, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.9, wall=28175 epoch 017: 1622 / 1689 loss=4.077, nll_loss=2.45, ppl=5.46, wps=462252, ups=1.06, wpb=435740, bsz=16711.5, num_updates=28600, lr=0.000373979, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.9, wall=28175 epoch 017: 1622 / 1689 loss=4.077, nll_loss=2.45, ppl=5.46, wps=462252, ups=1.06, wpb=435740, bsz=16711.5, num_updates=28600, lr=0.000373979, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.9, wall=28175 epoch 017: 1622 / 1689 loss=4.077, nll_loss=2.45, ppl=5.46, wps=462252, ups=1.06, wpb=435740, bsz=16711.5, num_updates=28600, lr=0.000373979, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.9, wall=28175 epoch 017: 1622 / 1689 loss=4.077, nll_loss=2.45, ppl=5.46, wps=462252, ups=1.06, wpb=435740, bsz=16711.5, num_updates=28600, lr=0.000373979, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.9, wall=28175 epoch 017: 1622 / 1689 loss=4.077, nll_loss=2.45, ppl=5.46, wps=462252, ups=1.06, wpb=435740, bsz=16711.5, num_updates=28600, lr=0.000373979, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.9, wall=28175 epoch 017: 1622 / 1689 loss=4.077, nll_loss=2.45, ppl=5.46, wps=462252, ups=1.06, wpb=435740, bsz=16711.5, num_updates=28600, lr=0.000373979, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.9, wall=28175 epoch 017: 1622 / 1689 loss=4.077, nll_loss=2.45, ppl=5.46, wps=462252, ups=1.06, wpb=435740, bsz=16711.5, num_updates=28600, lr=0.000373979, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.9, wall=28175 epoch 017: 1622 / 1689 loss=4.077, nll_loss=2.45, ppl=5.46, wps=462252, ups=1.06, wpb=435740, bsz=16711.5, num_updates=28600, lr=0.000373979, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.9, wall=28175 epoch 017: 1622 / 1689 loss=4.077, nll_loss=2.45, ppl=5.46, wps=462252, ups=1.06, wpb=435740, bsz=16711.5, num_updates=28600, lr=0.000373979, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.9, wall=28175 epoch 017: 1622 / 1689 loss=4.077, nll_loss=2.45, ppl=5.46, wps=462252, ups=1.06, wpb=435740, bsz=16711.5, num_updates=28600, lr=0.000373979, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.9, wall=28175 epoch 017: 1622 / 1689 loss=4.077, nll_loss=2.45, ppl=5.46, wps=462252, ups=1.06, wpb=435740, bsz=16711.5, num_updates=28600, lr=0.000373979, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.9, wall=28175 epoch 017: 1622 / 1689 loss=4.077, nll_loss=2.45, ppl=5.46, wps=462252, ups=1.06, wpb=435740, bsz=16711.5, num_updates=28600, lr=0.000373979, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.9, wall=28175 epoch 017: 1622 / 1689 loss=4.077, nll_loss=2.45, ppl=5.46, wps=462252, ups=1.06, wpb=435740, bsz=16711.5, num_updates=28600, lr=0.000373979, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.9, wall=28175 epoch 017: 1622 / 1689 loss=4.077, nll_loss=2.45, ppl=5.46, wps=462252, ups=1.06, wpb=435740, bsz=16711.5, num_updates=28600, lr=0.000373979, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.9, wall=28175 end of epoch 17 (average epoch stats below) epoch 017 | loss 4.052 | nll_loss 2.421 | ppl 5.36 | wps 441342 | ups 1.02 | wpb 433555 | bsz 16504.7 | num_updates 28667 | lr 0.000373542 | gnorm 0.212 | clip 0 | loss_scale 0.5 | train_wall 1589 | gb_free 19.9 | wall 28237 epoch 017 | loss 4.052 | nll_loss 2.421 | ppl 5.36 | wps 441342 | ups 1.02 | wpb 433555 | bsz 16504.7 | num_updates 28667 | lr 0.000373542 | gnorm 0.212 | clip 0 | loss_scale 0.5 | train_wall 1589 | gb_free 19.9 | wall 28237 epoch 017 | loss 4.052 | nll_loss 2.421 | ppl 5.36 | wps 441342 | ups 1.02 | wpb 433555 | bsz 16504.7 | num_updates 28667 | lr 0.000373542 | gnorm 0.212 | clip 0 | loss_scale 0.5 | train_wall 1589 | gb_free 19.9 | wall 28237 epoch 017 | loss 4.052 | nll_loss 2.421 | ppl 5.36 | wps 441342 | ups 1.02 | wpb 433555 | bsz 16504.7 | num_updates 28667 | lr 0.000373542 | gnorm 0.212 | clip 0 | loss_scale 0.5 | train_wall 1589 | gb_free 19.9 | wall 28237 epoch 017 | loss 4.052 | nll_loss 2.421 | ppl 5.36 | wps 441342 | ups 1.02 | wpb 433555 | bsz 16504.7 | num_updates 28667 | lr 0.000373542 | gnorm 0.212 | clip 0 | loss_scale 0.5 | train_wall 1589 | gb_free 19.9 | wall 28237 epoch 017 | loss 4.052 | nll_loss 2.421 | ppl 5.36 | wps 441342 | ups 1.02 | wpb 433555 | bsz 16504.7 | num_updates 28667 | lr 0.000373542 | gnorm 0.212 | clip 0 | loss_scale 0.5 | train_wall 1589 | gb_free 19.9 | wall 28237 epoch 017 | loss 4.052 | nll_loss 2.421 | ppl 5.36 | wps 441342 | ups 1.02 | wpb 433555 | bsz 16504.7 | num_updates 28667 | lr 0.000373542 | gnorm 0.212 | clip 0 | loss_scale 0.5 | train_wall 1589 | gb_free 19.9 | wall 28237 epoch 017 | loss 4.052 | nll_loss 2.421 | ppl 5.36 | wps 441342 | ups 1.02 | wpb 433555 | bsz 16504.7 | num_updates 28667 | lr 0.000373542 | gnorm 0.212 | clip 0 | loss_scale 0.5 | train_wall 1589 | gb_free 19.9 | wall 28237 epoch 017 | loss 4.052 | nll_loss 2.421 | ppl 5.36 | wps 441342 | ups 1.02 | wpb 433555 | bsz 16504.7 | num_updates 28667 | lr 0.000373542 | gnorm 0.212 | clip 0 | loss_scale 0.5 | train_wall 1589 | gb_free 19.9 | wall 28237 epoch 017 | loss 4.052 | nll_loss 2.421 | ppl 5.36 | wps 441342 | ups 1.02 | wpb 433555 | bsz 16504.7 | num_updates 28667 | lr 0.000373542 | gnorm 0.212 | clip 0 | loss_scale 0.5 | train_wall 1589 | gb_free 19.9 | wall 28237 epoch 017 | loss 4.052 | nll_loss 2.421 | ppl 5.36 | wps 441342 | ups 1.02 | wpb 433555 | bsz 16504.7 | num_updates 28667 | lr 0.000373542 | gnorm 0.212 | clip 0 | loss_scale 0.5 | train_wall 1589 | gb_free 19.9 | wall 28237 epoch 017 | loss 4.052 | nll_loss 2.421 | ppl 5.36 | wps 441342 | ups 1.02 | wpb 433555 | bsz 16504.7 | num_updates 28667 | lr 0.000373542 | gnorm 0.212 | clip 0 | loss_scale 0.5 | train_wall 1589 | gb_free 19.9 | wall 28237 epoch 017 | loss 4.052 | nll_loss 2.421 | ppl 5.36 | wps 441342 | ups 1.02 | wpb 433555 | bsz 16504.7 | num_updates 28667 | lr 0.000373542 | gnorm 0.212 | clip 0 | loss_scale 0.5 | train_wall 1589 | gb_free 19.9 | wall 28237 epoch 017 | loss 4.052 | nll_loss 2.421 | ppl 5.36 | wps 441342 | ups 1.02 | wpb 433555 | bsz 16504.7 | num_updates 28667 | lr 0.000373542 | gnorm 0.212 | clip 0 | loss_scale 0.5 | train_wall 1589 | gb_free 19.9 | wall 28237 epoch 017 | loss 4.052 | nll_loss 2.421 | ppl 5.36 | wps 441342 | ups 1.02 | wpb 433555 | bsz 16504.7 | num_updates 28667 | lr 0.000373542 | gnorm 0.212 | clip 0 | loss_scale 0.5 | train_wall 1589 | gb_free 19.9 | wall 28237 epoch 017 | loss 4.052 | nll_loss 2.421 | ppl 5.36 | wps 441342 | ups 1.02 | wpb 433555 | bsz 16504.7 | num_updates 28667 | lr 0.000373542 | gnorm 0.212 | clip 0 | loss_scale 0.5 | train_wall 1589 | gb_free 19.9 | wall 28237 epoch 017 | loss 4.052 | nll_loss 2.421 | ppl 5.36 | wps 441342 | ups 1.02 | wpb 433555 | bsz 16504.7 | num_updates 28667 | lr 0.000373542 | gnorm 0.212 | clip 0 | loss_scale 0.5 | train_wall 1589 | gb_free 19.9 | wall 28237 Start iterating over samples epoch 018: 33 / 1689 loss=4.045, nll_loss=2.413, ppl=5.32, wps=451552, ups=1.05, wpb=431316, bsz=16082.6, num_updates=28700, lr=0.000373327, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.3, wall=28270 epoch 018: 33 / 1689 loss=4.045, nll_loss=2.413, ppl=5.32, wps=451552, ups=1.05, wpb=431316, bsz=16082.6, num_updates=28700, lr=0.000373327, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.3, wall=28270 epoch 018: 33 / 1689 loss=4.045, nll_loss=2.413, ppl=5.32, wps=451552, ups=1.05, wpb=431316, bsz=16082.6, num_updates=28700, lr=0.000373327, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.3, wall=28270 epoch 018: 33 / 1689 loss=4.045, nll_loss=2.413, ppl=5.32, wps=451552, ups=1.05, wpb=431316, bsz=16082.6, num_updates=28700, lr=0.000373327, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.3, wall=28270 epoch 018: 33 / 1689 loss=4.045, nll_loss=2.413, ppl=5.32, wps=451552, ups=1.05, wpb=431316, bsz=16082.6, num_updates=28700, lr=0.000373327, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.3, wall=28270 epoch 018: 33 / 1689 loss=4.045, nll_loss=2.413, ppl=5.32, wps=451552, ups=1.05, wpb=431316, bsz=16082.6, num_updates=28700, lr=0.000373327, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.3, wall=28270 epoch 018: 33 / 1689 loss=4.045, nll_loss=2.413, ppl=5.32, wps=451552, ups=1.05, wpb=431316, bsz=16082.6, num_updates=28700, lr=0.000373327, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.3, wall=28270 epoch 018: 33 / 1689 loss=4.045, nll_loss=2.413, ppl=5.32, wps=451552, ups=1.05, wpb=431316, bsz=16082.6, num_updates=28700, lr=0.000373327, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.3, wall=28270 epoch 018: 33 / 1689 loss=4.045, nll_loss=2.413, ppl=5.32, wps=451552, ups=1.05, wpb=431316, bsz=16082.6, num_updates=28700, lr=0.000373327, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.3, wall=28270 epoch 018: 33 / 1689 loss=4.045, nll_loss=2.413, ppl=5.32, wps=451552, ups=1.05, wpb=431316, bsz=16082.6, num_updates=28700, lr=0.000373327, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.3, wall=28270 epoch 018: 33 / 1689 loss=4.045, nll_loss=2.413, ppl=5.32, wps=451552, ups=1.05, wpb=431316, bsz=16082.6, num_updates=28700, lr=0.000373327, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.3, wall=28270 epoch 018: 33 / 1689 loss=4.045, nll_loss=2.413, ppl=5.32, wps=451552, ups=1.05, wpb=431316, bsz=16082.6, num_updates=28700, lr=0.000373327, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.3, wall=28270 epoch 018: 33 / 1689 loss=4.045, nll_loss=2.413, ppl=5.32, wps=451552, ups=1.05, wpb=431316, bsz=16082.6, num_updates=28700, lr=0.000373327, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.3, wall=28270 epoch 018: 33 / 1689 loss=4.045, nll_loss=2.413, ppl=5.32, wps=451552, ups=1.05, wpb=431316, bsz=16082.6, num_updates=28700, lr=0.000373327, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.3, wall=28270 epoch 018: 33 / 1689 loss=4.045, nll_loss=2.413, ppl=5.32, wps=451552, ups=1.05, wpb=431316, bsz=16082.6, num_updates=28700, lr=0.000373327, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.3, wall=28270 epoch 018: 33 / 1689 loss=4.045, nll_loss=2.413, ppl=5.32, wps=451552, ups=1.05, wpb=431316, bsz=16082.6, num_updates=28700, lr=0.000373327, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.3, wall=28270 epoch 018: 33 / 1689 loss=4.045, nll_loss=2.413, ppl=5.32, wps=451552, ups=1.05, wpb=431316, bsz=16082.6, num_updates=28700, lr=0.000373327, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.3, wall=28270 epoch 018: 33 / 1689 loss=4.045, nll_loss=2.413, ppl=5.32, wps=451552, ups=1.05, wpb=431316, bsz=16082.6, num_updates=28700, lr=0.000373327, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.3, wall=28270 epoch 018: 133 / 1689 loss=4.019, nll_loss=2.383, ppl=5.22, wps=462372, ups=1.06, wpb=434754, bsz=16415.5, num_updates=28800, lr=0.000372678, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=28364 epoch 018: 133 / 1689 loss=4.019, nll_loss=2.383, ppl=5.22, wps=462372, ups=1.06, wpb=434754, bsz=16415.5, num_updates=28800, lr=0.000372678, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=28364 epoch 018: 133 / 1689 loss=4.019, nll_loss=2.383, ppl=5.22, wps=462372, ups=1.06, wpb=434754, bsz=16415.5, num_updates=28800, lr=0.000372678, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=28364 epoch 018: 133 / 1689 loss=4.019, nll_loss=2.383, ppl=5.22, wps=462372, ups=1.06, wpb=434754, bsz=16415.5, num_updates=28800, lr=0.000372678, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=28364 epoch 018: 133 / 1689 loss=4.019, nll_loss=2.383, ppl=5.22, wps=462372, ups=1.06, wpb=434754, bsz=16415.5, num_updates=28800, lr=0.000372678, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=28364 epoch 018: 133 / 1689 loss=4.019, nll_loss=2.383, ppl=5.22, wps=462372, ups=1.06, wpb=434754, bsz=16415.5, num_updates=28800, lr=0.000372678, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=28364 epoch 018: 133 / 1689 loss=4.019, nll_loss=2.383, ppl=5.22, wps=462372, ups=1.06, wpb=434754, bsz=16415.5, num_updates=28800, lr=0.000372678, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=28364 epoch 018: 133 / 1689 loss=4.019, nll_loss=2.383, ppl=5.22, wps=462372, ups=1.06, wpb=434754, bsz=16415.5, num_updates=28800, lr=0.000372678, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=28364 epoch 018: 133 / 1689 loss=4.019, nll_loss=2.383, ppl=5.22, wps=462372, ups=1.06, wpb=434754, bsz=16415.5, num_updates=28800, lr=0.000372678, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=28364 epoch 018: 133 / 1689 loss=4.019, nll_loss=2.383, ppl=5.22, wps=462372, ups=1.06, wpb=434754, bsz=16415.5, num_updates=28800, lr=0.000372678, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=28364 epoch 018: 133 / 1689 loss=4.019, nll_loss=2.383, ppl=5.22, wps=462372, ups=1.06, wpb=434754, bsz=16415.5, num_updates=28800, lr=0.000372678, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=28364 epoch 018: 133 / 1689 loss=4.019, nll_loss=2.383, ppl=5.22, wps=462372, ups=1.06, wpb=434754, bsz=16415.5, num_updates=28800, lr=0.000372678, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=28364 epoch 018: 133 / 1689 loss=4.019, nll_loss=2.383, ppl=5.22, wps=462372, ups=1.06, wpb=434754, bsz=16415.5, num_updates=28800, lr=0.000372678, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=28364 epoch 018: 133 / 1689 loss=4.019, nll_loss=2.383, ppl=5.22, wps=462372, ups=1.06, wpb=434754, bsz=16415.5, num_updates=28800, lr=0.000372678, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=28364 epoch 018: 133 / 1689 loss=4.019, nll_loss=2.383, ppl=5.22, wps=462372, ups=1.06, wpb=434754, bsz=16415.5, num_updates=28800, lr=0.000372678, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=28364 epoch 018: 133 / 1689 loss=4.019, nll_loss=2.383, ppl=5.22, wps=462372, ups=1.06, wpb=434754, bsz=16415.5, num_updates=28800, lr=0.000372678, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=28364 epoch 018: 133 / 1689 loss=4.019, nll_loss=2.383, ppl=5.22, wps=462372, ups=1.06, wpb=434754, bsz=16415.5, num_updates=28800, lr=0.000372678, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=28364 epoch 018: 133 / 1689 loss=4.019, nll_loss=2.383, ppl=5.22, wps=462372, ups=1.06, wpb=434754, bsz=16415.5, num_updates=28800, lr=0.000372678, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=28364 epoch 018: 233 / 1689 loss=4.032, nll_loss=2.398, ppl=5.27, wps=461940, ups=1.06, wpb=436422, bsz=16453.7, num_updates=28900, lr=0.000372033, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=28459 epoch 018: 233 / 1689 loss=4.032, nll_loss=2.398, ppl=5.27, wps=461940, ups=1.06, wpb=436422, bsz=16453.7, num_updates=28900, lr=0.000372033, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=28459 epoch 018: 233 / 1689 loss=4.032, nll_loss=2.398, ppl=5.27, wps=461940, ups=1.06, wpb=436422, bsz=16453.7, num_updates=28900, lr=0.000372033, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=28459 epoch 018: 233 / 1689 loss=4.032, nll_loss=2.398, ppl=5.27, wps=461940, ups=1.06, wpb=436422, bsz=16453.7, num_updates=28900, lr=0.000372033, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=28459 epoch 018: 233 / 1689 loss=4.032, nll_loss=2.398, ppl=5.27, wps=461940, ups=1.06, wpb=436422, bsz=16453.7, num_updates=28900, lr=0.000372033, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=28459 epoch 018: 233 / 1689 loss=4.032, nll_loss=2.398, ppl=5.27, wps=461940, ups=1.06, wpb=436422, bsz=16453.7, num_updates=28900, lr=0.000372033, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=28459 epoch 018: 233 / 1689 loss=4.032, nll_loss=2.398, ppl=5.27, wps=461940, ups=1.06, wpb=436422, bsz=16453.7, num_updates=28900, lr=0.000372033, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=28459 epoch 018: 233 / 1689 loss=4.032, nll_loss=2.398, ppl=5.27, wps=461940, ups=1.06, wpb=436422, bsz=16453.7, num_updates=28900, lr=0.000372033, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=28459 epoch 018: 233 / 1689 loss=4.032, nll_loss=2.398, ppl=5.27, wps=461940, ups=1.06, wpb=436422, bsz=16453.7, num_updates=28900, lr=0.000372033, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=28459 epoch 018: 233 / 1689 loss=4.032, nll_loss=2.398, ppl=5.27, wps=461940, ups=1.06, wpb=436422, bsz=16453.7, num_updates=28900, lr=0.000372033, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=28459 epoch 018: 233 / 1689 loss=4.032, nll_loss=2.398, ppl=5.27, wps=461940, ups=1.06, wpb=436422, bsz=16453.7, num_updates=28900, lr=0.000372033, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=28459 epoch 018: 233 / 1689 loss=4.032, nll_loss=2.398, ppl=5.27, wps=461940, ups=1.06, wpb=436422, bsz=16453.7, num_updates=28900, lr=0.000372033, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=28459 epoch 018: 233 / 1689 loss=4.032, nll_loss=2.398, ppl=5.27, wps=461940, ups=1.06, wpb=436422, bsz=16453.7, num_updates=28900, lr=0.000372033, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=28459 epoch 018: 233 / 1689 loss=4.032, nll_loss=2.398, ppl=5.27, wps=461940, ups=1.06, wpb=436422, bsz=16453.7, num_updates=28900, lr=0.000372033, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=28459 epoch 018: 233 / 1689 loss=4.032, nll_loss=2.398, ppl=5.27, wps=461940, ups=1.06, wpb=436422, bsz=16453.7, num_updates=28900, lr=0.000372033, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=28459 epoch 018: 233 / 1689 loss=4.032, nll_loss=2.398, ppl=5.27, wps=461940, ups=1.06, wpb=436422, bsz=16453.7, num_updates=28900, lr=0.000372033, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=28459 epoch 018: 233 / 1689 loss=4.032, nll_loss=2.398, ppl=5.27, wps=461940, ups=1.06, wpb=436422, bsz=16453.7, num_updates=28900, lr=0.000372033, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=28459 epoch 018: 233 / 1689 loss=4.032, nll_loss=2.398, ppl=5.27, wps=461940, ups=1.06, wpb=436422, bsz=16453.7, num_updates=28900, lr=0.000372033, gnorm=0.194, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=28459 epoch 018: 333 / 1689 loss=4.041, nll_loss=2.408, ppl=5.31, wps=459920, ups=1.06, wpb=433687, bsz=16557.2, num_updates=29000, lr=0.000371391, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=28553 epoch 018: 333 / 1689 loss=4.041, nll_loss=2.408, ppl=5.31, wps=459920, ups=1.06, wpb=433687, bsz=16557.2, num_updates=29000, lr=0.000371391, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=28553 epoch 018: 333 / 1689 loss=4.041, nll_loss=2.408, ppl=5.31, wps=459920, ups=1.06, wpb=433687, bsz=16557.2, num_updates=29000, lr=0.000371391, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=28553 epoch 018: 333 / 1689 loss=4.041, nll_loss=2.408, ppl=5.31, wps=459920, ups=1.06, wpb=433687, bsz=16557.2, num_updates=29000, lr=0.000371391, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=28553 epoch 018: 333 / 1689 loss=4.041, nll_loss=2.408, ppl=5.31, wps=459920, ups=1.06, wpb=433687, bsz=16557.2, num_updates=29000, lr=0.000371391, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=28553 epoch 018: 333 / 1689 loss=4.041, nll_loss=2.408, ppl=5.31, wps=459920, ups=1.06, wpb=433687, bsz=16557.2, num_updates=29000, lr=0.000371391, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=28553 epoch 018: 333 / 1689 loss=4.041, nll_loss=2.408, ppl=5.31, wps=459920, ups=1.06, wpb=433687, bsz=16557.2, num_updates=29000, lr=0.000371391, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=28553 epoch 018: 333 / 1689 loss=4.041, nll_loss=2.408, ppl=5.31, wps=459920, ups=1.06, wpb=433687, bsz=16557.2, num_updates=29000, lr=0.000371391, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=28553 epoch 018: 333 / 1689 loss=4.041, nll_loss=2.408, ppl=5.31, wps=459920, ups=1.06, wpb=433687, bsz=16557.2, num_updates=29000, lr=0.000371391, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=28553 epoch 018: 333 / 1689 loss=4.041, nll_loss=2.408, ppl=5.31, wps=459920, ups=1.06, wpb=433687, bsz=16557.2, num_updates=29000, lr=0.000371391, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=28553 epoch 018: 333 / 1689 loss=4.041, nll_loss=2.408, ppl=5.31, wps=459920, ups=1.06, wpb=433687, bsz=16557.2, num_updates=29000, lr=0.000371391, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=28553 epoch 018: 333 / 1689 loss=4.041, nll_loss=2.408, ppl=5.31, wps=459920, ups=1.06, wpb=433687, bsz=16557.2, num_updates=29000, lr=0.000371391, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=28553 epoch 018: 333 / 1689 loss=4.041, nll_loss=2.408, ppl=5.31, wps=459920, ups=1.06, wpb=433687, bsz=16557.2, num_updates=29000, lr=0.000371391, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=28553 epoch 018: 333 / 1689 loss=4.041, nll_loss=2.408, ppl=5.31, wps=459920, ups=1.06, wpb=433687, bsz=16557.2, num_updates=29000, lr=0.000371391, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=28553 epoch 018: 333 / 1689 loss=4.041, nll_loss=2.408, ppl=5.31, wps=459920, ups=1.06, wpb=433687, bsz=16557.2, num_updates=29000, lr=0.000371391, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=28553 epoch 018: 333 / 1689 loss=4.041, nll_loss=2.408, ppl=5.31, wps=459920, ups=1.06, wpb=433687, bsz=16557.2, num_updates=29000, lr=0.000371391, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=28553 epoch 018: 333 / 1689 loss=4.041, nll_loss=2.408, ppl=5.31, wps=459920, ups=1.06, wpb=433687, bsz=16557.2, num_updates=29000, lr=0.000371391, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=28553 epoch 018: 333 / 1689 loss=4.041, nll_loss=2.408, ppl=5.31, wps=459920, ups=1.06, wpb=433687, bsz=16557.2, num_updates=29000, lr=0.000371391, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=19.1, wall=28553 begin validation on "valid" subset epoch 018 | valid on 'valid' subset | loss 4.261 | nll_loss 2.633 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.261 epoch 018 | valid on 'valid' subset | loss 4.261 | nll_loss 2.633 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.261 epoch 018 | valid on 'valid' subset | loss 4.261 | nll_loss 2.633 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.261 epoch 018 | valid on 'valid' subset | loss 4.261 | nll_loss 2.633 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.261 epoch 018 | valid on 'valid' subset | loss 4.261 | nll_loss 2.633 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.261 epoch 018 | valid on 'valid' subset | loss 4.261 | nll_loss 2.633 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.261 epoch 018 | valid on 'valid' subset | loss 4.261 | nll_loss 2.633 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.261 epoch 018 | valid on 'valid' subset | loss 4.261 | nll_loss 2.633 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.261 epoch 018 | valid on 'valid' subset | loss 4.261 | nll_loss 2.633 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.261 epoch 018 | valid on 'valid' subset | loss 4.261 | nll_loss 2.633 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.261 epoch 018 | valid on 'valid' subset | loss 4.261 | nll_loss 2.633 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.261 epoch 018 | valid on 'valid' subset | loss 4.261 | nll_loss 2.633 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.261 epoch 018 | valid on 'valid' subset | loss 4.261 | nll_loss 2.633 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.261 epoch 018 | valid on 'valid' subset | loss 4.261 | nll_loss 2.633 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.261 epoch 018 | valid on 'valid' subset | loss 4.261 | nll_loss 2.633 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.261 epoch 018 | valid on 'valid' subset | loss 4.261 | nll_loss 2.633 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.261 epoch 018 | valid on 'valid' subset | loss 4.261 | nll_loss 2.633 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.261 epoch 018 | valid on 'valid' subset | loss 4.261 | nll_loss 2.633 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 29000 | best_loss 4.261 epoch 018: 433 / 1689 loss=4.035, nll_loss=2.402, ppl=5.29, wps=111272, ups=0.26, wpb=433017, bsz=16721, num_updates=29100, lr=0.000370752, gnorm=0.211, clip=0, loss_scale=1, train_wall=324, gb_free=19.8, wall=28942 epoch 018: 433 / 1689 loss=4.035, nll_loss=2.402, ppl=5.29, wps=111272, ups=0.26, wpb=433017, bsz=16721, num_updates=29100, lr=0.000370752, gnorm=0.211, clip=0, loss_scale=1, train_wall=324, gb_free=19.8, wall=28942 epoch 018: 433 / 1689 loss=4.035, nll_loss=2.402, ppl=5.29, wps=111272, ups=0.26, wpb=433017, bsz=16721, num_updates=29100, lr=0.000370752, gnorm=0.211, clip=0, loss_scale=1, train_wall=324, gb_free=19.8, wall=28942 epoch 018: 433 / 1689 loss=4.035, nll_loss=2.402, ppl=5.29, wps=111272, ups=0.26, wpb=433017, bsz=16721, num_updates=29100, lr=0.000370752, gnorm=0.211, clip=0, loss_scale=1, train_wall=324, gb_free=19.8, wall=28942 epoch 018: 433 / 1689 loss=4.035, nll_loss=2.402, ppl=5.29, wps=111272, ups=0.26, wpb=433017, bsz=16721, num_updates=29100, lr=0.000370752, gnorm=0.211, clip=0, loss_scale=1, train_wall=324, gb_free=19.8, wall=28942 epoch 018: 433 / 1689 loss=4.035, nll_loss=2.402, ppl=5.29, wps=111272, ups=0.26, wpb=433017, bsz=16721, num_updates=29100, lr=0.000370752, gnorm=0.211, clip=0, loss_scale=1, train_wall=324, gb_free=19.8, wall=28942 epoch 018: 433 / 1689 loss=4.035, nll_loss=2.402, ppl=5.29, wps=111272, ups=0.26, wpb=433017, bsz=16721, num_updates=29100, lr=0.000370752, gnorm=0.211, clip=0, loss_scale=1, train_wall=324, gb_free=19.8, wall=28942 epoch 018: 433 / 1689 loss=4.035, nll_loss=2.402, ppl=5.29, wps=111272, ups=0.26, wpb=433017, bsz=16721, num_updates=29100, lr=0.000370752, gnorm=0.211, clip=0, loss_scale=1, train_wall=324, gb_free=19.8, wall=28942 epoch 018: 433 / 1689 loss=4.035, nll_loss=2.402, ppl=5.29, wps=111272, ups=0.26, wpb=433017, bsz=16721, num_updates=29100, lr=0.000370752, gnorm=0.211, clip=0, loss_scale=1, train_wall=324, gb_free=19.8, wall=28942 epoch 018: 433 / 1689 loss=4.035, nll_loss=2.402, ppl=5.29, wps=111272, ups=0.26, wpb=433017, bsz=16721, num_updates=29100, lr=0.000370752, gnorm=0.211, clip=0, loss_scale=1, train_wall=324, gb_free=19.8, wall=28942 epoch 018: 433 / 1689 loss=4.035, nll_loss=2.402, ppl=5.29, wps=111272, ups=0.26, wpb=433017, bsz=16721, num_updates=29100, lr=0.000370752, gnorm=0.211, clip=0, loss_scale=1, train_wall=324, gb_free=19.8, wall=28942 epoch 018: 433 / 1689 loss=4.035, nll_loss=2.402, ppl=5.29, wps=111272, ups=0.26, wpb=433017, bsz=16721, num_updates=29100, lr=0.000370752, gnorm=0.211, clip=0, loss_scale=1, train_wall=324, gb_free=19.8, wall=28942 epoch 018: 433 / 1689 loss=4.035, nll_loss=2.402, ppl=5.29, wps=111272, ups=0.26, wpb=433017, bsz=16721, num_updates=29100, lr=0.000370752, gnorm=0.211, clip=0, loss_scale=1, train_wall=324, gb_free=19.8, wall=28942 epoch 018: 433 / 1689 loss=4.035, nll_loss=2.402, ppl=5.29, wps=111272, ups=0.26, wpb=433017, bsz=16721, num_updates=29100, lr=0.000370752, gnorm=0.211, clip=0, loss_scale=1, train_wall=324, gb_free=19.8, wall=28942 epoch 018: 433 / 1689 loss=4.035, nll_loss=2.402, ppl=5.29, wps=111272, ups=0.26, wpb=433017, bsz=16721, num_updates=29100, lr=0.000370752, gnorm=0.211, clip=0, loss_scale=1, train_wall=324, gb_free=19.8, wall=28942 epoch 018: 433 / 1689 loss=4.035, nll_loss=2.402, ppl=5.29, wps=111272, ups=0.26, wpb=433017, bsz=16721, num_updates=29100, lr=0.000370752, gnorm=0.211, clip=0, loss_scale=1, train_wall=324, gb_free=19.8, wall=28942 epoch 018: 433 / 1689 loss=4.035, nll_loss=2.402, ppl=5.29, wps=111272, ups=0.26, wpb=433017, bsz=16721, num_updates=29100, lr=0.000370752, gnorm=0.211, clip=0, loss_scale=1, train_wall=324, gb_free=19.8, wall=28942 epoch 018: 433 / 1689 loss=4.035, nll_loss=2.402, ppl=5.29, wps=111272, ups=0.26, wpb=433017, bsz=16721, num_updates=29100, lr=0.000370752, gnorm=0.211, clip=0, loss_scale=1, train_wall=324, gb_free=19.8, wall=28942 epoch 018: 533 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=467561, ups=1.08, wpb=431736, bsz=16477.6, num_updates=29200, lr=0.000370117, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=29034 epoch 018: 533 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=467561, ups=1.08, wpb=431736, bsz=16477.6, num_updates=29200, lr=0.000370117, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=29034 epoch 018: 533 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=467561, ups=1.08, wpb=431736, bsz=16477.6, num_updates=29200, lr=0.000370117, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=29034 epoch 018: 533 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=467561, ups=1.08, wpb=431736, bsz=16477.6, num_updates=29200, lr=0.000370117, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=29034 epoch 018: 533 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=467561, ups=1.08, wpb=431736, bsz=16477.6, num_updates=29200, lr=0.000370117, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=29034 epoch 018: 533 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=467561, ups=1.08, wpb=431736, bsz=16477.6, num_updates=29200, lr=0.000370117, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=29034 epoch 018: 533 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=467561, ups=1.08, wpb=431736, bsz=16477.6, num_updates=29200, lr=0.000370117, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=29034 epoch 018: 533 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=467561, ups=1.08, wpb=431736, bsz=16477.6, num_updates=29200, lr=0.000370117, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=29034 epoch 018: 533 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=467561, ups=1.08, wpb=431736, bsz=16477.6, num_updates=29200, lr=0.000370117, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=29034 epoch 018: 533 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=467561, ups=1.08, wpb=431736, bsz=16477.6, num_updates=29200, lr=0.000370117, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=29034 epoch 018: 533 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=467561, ups=1.08, wpb=431736, bsz=16477.6, num_updates=29200, lr=0.000370117, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=29034 epoch 018: 533 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=467561, ups=1.08, wpb=431736, bsz=16477.6, num_updates=29200, lr=0.000370117, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=29034 epoch 018: 533 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=467561, ups=1.08, wpb=431736, bsz=16477.6, num_updates=29200, lr=0.000370117, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=29034 epoch 018: 533 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=467561, ups=1.08, wpb=431736, bsz=16477.6, num_updates=29200, lr=0.000370117, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=29034 epoch 018: 533 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=467561, ups=1.08, wpb=431736, bsz=16477.6, num_updates=29200, lr=0.000370117, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=29034 epoch 018: 533 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=467561, ups=1.08, wpb=431736, bsz=16477.6, num_updates=29200, lr=0.000370117, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=29034 epoch 018: 533 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=467561, ups=1.08, wpb=431736, bsz=16477.6, num_updates=29200, lr=0.000370117, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=29034 epoch 018: 533 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=467561, ups=1.08, wpb=431736, bsz=16477.6, num_updates=29200, lr=0.000370117, gnorm=0.224, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=29034 epoch 018: 633 / 1689 loss=4.036, nll_loss=2.403, ppl=5.29, wps=465764, ups=1.07, wpb=433919, bsz=16259.5, num_updates=29300, lr=0.000369484, gnorm=0.198, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=29128 epoch 018: 633 / 1689 loss=4.036, nll_loss=2.403, ppl=5.29, wps=465764, ups=1.07, wpb=433919, bsz=16259.5, num_updates=29300, lr=0.000369484, gnorm=0.198, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=29128 epoch 018: 633 / 1689 loss=4.036, nll_loss=2.403, ppl=5.29, wps=465764, ups=1.07, wpb=433919, bsz=16259.5, num_updates=29300, lr=0.000369484, gnorm=0.198, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=29128 epoch 018: 633 / 1689 loss=4.036, nll_loss=2.403, ppl=5.29, wps=465764, ups=1.07, wpb=433919, bsz=16259.5, num_updates=29300, lr=0.000369484, gnorm=0.198, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=29128 epoch 018: 633 / 1689 loss=4.036, nll_loss=2.403, ppl=5.29, wps=465764, ups=1.07, wpb=433919, bsz=16259.5, num_updates=29300, lr=0.000369484, gnorm=0.198, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=29128 epoch 018: 633 / 1689 loss=4.036, nll_loss=2.403, ppl=5.29, wps=465764, ups=1.07, wpb=433919, bsz=16259.5, num_updates=29300, lr=0.000369484, gnorm=0.198, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=29128 epoch 018: 633 / 1689 loss=4.036, nll_loss=2.403, ppl=5.29, wps=465764, ups=1.07, wpb=433919, bsz=16259.5, num_updates=29300, lr=0.000369484, gnorm=0.198, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=29128 epoch 018: 633 / 1689 loss=4.036, nll_loss=2.403, ppl=5.29, wps=465764, ups=1.07, wpb=433919, bsz=16259.5, num_updates=29300, lr=0.000369484, gnorm=0.198, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=29128 epoch 018: 633 / 1689 loss=4.036, nll_loss=2.403, ppl=5.29, wps=465764, ups=1.07, wpb=433919, bsz=16259.5, num_updates=29300, lr=0.000369484, gnorm=0.198, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=29128 epoch 018: 633 / 1689 loss=4.036, nll_loss=2.403, ppl=5.29, wps=465764, ups=1.07, wpb=433919, bsz=16259.5, num_updates=29300, lr=0.000369484, gnorm=0.198, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=29128 epoch 018: 633 / 1689 loss=4.036, nll_loss=2.403, ppl=5.29, wps=465764, ups=1.07, wpb=433919, bsz=16259.5, num_updates=29300, lr=0.000369484, gnorm=0.198, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=29128 epoch 018: 633 / 1689 loss=4.036, nll_loss=2.403, ppl=5.29, wps=465764, ups=1.07, wpb=433919, bsz=16259.5, num_updates=29300, lr=0.000369484, gnorm=0.198, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=29128 epoch 018: 633 / 1689 loss=4.036, nll_loss=2.403, ppl=5.29, wps=465764, ups=1.07, wpb=433919, bsz=16259.5, num_updates=29300, lr=0.000369484, gnorm=0.198, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=29128 epoch 018: 633 / 1689 loss=4.036, nll_loss=2.403, ppl=5.29, wps=465764, ups=1.07, wpb=433919, bsz=16259.5, num_updates=29300, lr=0.000369484, gnorm=0.198, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=29128 epoch 018: 633 / 1689 loss=4.036, nll_loss=2.403, ppl=5.29, wps=465764, ups=1.07, wpb=433919, bsz=16259.5, num_updates=29300, lr=0.000369484, gnorm=0.198, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=29128 epoch 018: 633 / 1689 loss=4.036, nll_loss=2.403, ppl=5.29, wps=465764, ups=1.07, wpb=433919, bsz=16259.5, num_updates=29300, lr=0.000369484, gnorm=0.198, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=29128 epoch 018: 633 / 1689 loss=4.036, nll_loss=2.403, ppl=5.29, wps=465764, ups=1.07, wpb=433919, bsz=16259.5, num_updates=29300, lr=0.000369484, gnorm=0.198, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=29128 epoch 018: 633 / 1689 loss=4.036, nll_loss=2.403, ppl=5.29, wps=465764, ups=1.07, wpb=433919, bsz=16259.5, num_updates=29300, lr=0.000369484, gnorm=0.198, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=29128 epoch 018: 733 / 1689 loss=4.042, nll_loss=2.409, ppl=5.31, wps=461354, ups=1.06, wpb=434049, bsz=16590.1, num_updates=29400, lr=0.000368856, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=29222 epoch 018: 733 / 1689 loss=4.042, nll_loss=2.409, ppl=5.31, wps=461354, ups=1.06, wpb=434049, bsz=16590.1, num_updates=29400, lr=0.000368856, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=29222 epoch 018: 733 / 1689 loss=4.042, nll_loss=2.409, ppl=5.31, wps=461354, ups=1.06, wpb=434049, bsz=16590.1, num_updates=29400, lr=0.000368856, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=29222 epoch 018: 733 / 1689 loss=4.042, nll_loss=2.409, ppl=5.31, wps=461354, ups=1.06, wpb=434049, bsz=16590.1, num_updates=29400, lr=0.000368856, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=29222 epoch 018: 733 / 1689 loss=4.042, nll_loss=2.409, ppl=5.31, wps=461354, ups=1.06, wpb=434049, bsz=16590.1, num_updates=29400, lr=0.000368856, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=29222 epoch 018: 733 / 1689 loss=4.042, nll_loss=2.409, ppl=5.31, wps=461354, ups=1.06, wpb=434049, bsz=16590.1, num_updates=29400, lr=0.000368856, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=29222 epoch 018: 733 / 1689 loss=4.042, nll_loss=2.409, ppl=5.31, wps=461354, ups=1.06, wpb=434049, bsz=16590.1, num_updates=29400, lr=0.000368856, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=29222 epoch 018: 733 / 1689 loss=4.042, nll_loss=2.409, ppl=5.31, wps=461354, ups=1.06, wpb=434049, bsz=16590.1, num_updates=29400, lr=0.000368856, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=29222 epoch 018: 733 / 1689 loss=4.042, nll_loss=2.409, ppl=5.31, wps=461354, ups=1.06, wpb=434049, bsz=16590.1, num_updates=29400, lr=0.000368856, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=29222 epoch 018: 733 / 1689 loss=4.042, nll_loss=2.409, ppl=5.31, wps=461354, ups=1.06, wpb=434049, bsz=16590.1, num_updates=29400, lr=0.000368856, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=29222 epoch 018: 733 / 1689 loss=4.042, nll_loss=2.409, ppl=5.31, wps=461354, ups=1.06, wpb=434049, bsz=16590.1, num_updates=29400, lr=0.000368856, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=29222 epoch 018: 733 / 1689 loss=4.042, nll_loss=2.409, ppl=5.31, wps=461354, ups=1.06, wpb=434049, bsz=16590.1, num_updates=29400, lr=0.000368856, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=29222 epoch 018: 733 / 1689 loss=4.042, nll_loss=2.409, ppl=5.31, wps=461354, ups=1.06, wpb=434049, bsz=16590.1, num_updates=29400, lr=0.000368856, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=29222 epoch 018: 733 / 1689 loss=4.042, nll_loss=2.409, ppl=5.31, wps=461354, ups=1.06, wpb=434049, bsz=16590.1, num_updates=29400, lr=0.000368856, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=29222 epoch 018: 733 / 1689 loss=4.042, nll_loss=2.409, ppl=5.31, wps=461354, ups=1.06, wpb=434049, bsz=16590.1, num_updates=29400, lr=0.000368856, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=29222 epoch 018: 733 / 1689 loss=4.042, nll_loss=2.409, ppl=5.31, wps=461354, ups=1.06, wpb=434049, bsz=16590.1, num_updates=29400, lr=0.000368856, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=29222 epoch 018: 733 / 1689 loss=4.042, nll_loss=2.409, ppl=5.31, wps=461354, ups=1.06, wpb=434049, bsz=16590.1, num_updates=29400, lr=0.000368856, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=29222 epoch 018: 733 / 1689 loss=4.042, nll_loss=2.409, ppl=5.31, wps=461354, ups=1.06, wpb=434049, bsz=16590.1, num_updates=29400, lr=0.000368856, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=29222 epoch 018: 833 / 1689 loss=4.051, nll_loss=2.42, ppl=5.35, wps=461707, ups=1.06, wpb=434482, bsz=16925, num_updates=29500, lr=0.00036823, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=29316 epoch 018: 833 / 1689 loss=4.051, nll_loss=2.42, ppl=5.35, wps=461707, ups=1.06, wpb=434482, bsz=16925, num_updates=29500, lr=0.00036823, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=29316 epoch 018: 833 / 1689 loss=4.051, nll_loss=2.42, ppl=5.35, wps=461707, ups=1.06, wpb=434482, bsz=16925, num_updates=29500, lr=0.00036823, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=29316 epoch 018: 833 / 1689 loss=4.051, nll_loss=2.42, ppl=5.35, wps=461707, ups=1.06, wpb=434482, bsz=16925, num_updates=29500, lr=0.00036823, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=29316 epoch 018: 833 / 1689 loss=4.051, nll_loss=2.42, ppl=5.35, wps=461707, ups=1.06, wpb=434482, bsz=16925, num_updates=29500, lr=0.00036823, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=29316 epoch 018: 833 / 1689 loss=4.051, nll_loss=2.42, ppl=5.35, wps=461707, ups=1.06, wpb=434482, bsz=16925, num_updates=29500, lr=0.00036823, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=29316 epoch 018: 833 / 1689 loss=4.051, nll_loss=2.42, ppl=5.35, wps=461707, ups=1.06, wpb=434482, bsz=16925, num_updates=29500, lr=0.00036823, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=29316 epoch 018: 833 / 1689 loss=4.051, nll_loss=2.42, ppl=5.35, wps=461707, ups=1.06, wpb=434482, bsz=16925, num_updates=29500, lr=0.00036823, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=29316 epoch 018: 833 / 1689 loss=4.051, nll_loss=2.42, ppl=5.35, wps=461707, ups=1.06, wpb=434482, bsz=16925, num_updates=29500, lr=0.00036823, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=29316 epoch 018: 833 / 1689 loss=4.051, nll_loss=2.42, ppl=5.35, wps=461707, ups=1.06, wpb=434482, bsz=16925, num_updates=29500, lr=0.00036823, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=29316 epoch 018: 833 / 1689 loss=4.051, nll_loss=2.42, ppl=5.35, wps=461707, ups=1.06, wpb=434482, bsz=16925, num_updates=29500, lr=0.00036823, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=29316 epoch 018: 833 / 1689 loss=4.051, nll_loss=2.42, ppl=5.35, wps=461707, ups=1.06, wpb=434482, bsz=16925, num_updates=29500, lr=0.00036823, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=29316 epoch 018: 833 / 1689 loss=4.051, nll_loss=2.42, ppl=5.35, wps=461707, ups=1.06, wpb=434482, bsz=16925, num_updates=29500, lr=0.00036823, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=29316 epoch 018: 833 / 1689 loss=4.051, nll_loss=2.42, ppl=5.35, wps=461707, ups=1.06, wpb=434482, bsz=16925, num_updates=29500, lr=0.00036823, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=29316 epoch 018: 833 / 1689 loss=4.051, nll_loss=2.42, ppl=5.35, wps=461707, ups=1.06, wpb=434482, bsz=16925, num_updates=29500, lr=0.00036823, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=29316 epoch 018: 833 / 1689 loss=4.051, nll_loss=2.42, ppl=5.35, wps=461707, ups=1.06, wpb=434482, bsz=16925, num_updates=29500, lr=0.00036823, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=29316 epoch 018: 833 / 1689 loss=4.051, nll_loss=2.42, ppl=5.35, wps=461707, ups=1.06, wpb=434482, bsz=16925, num_updates=29500, lr=0.00036823, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=29316 epoch 018: 833 / 1689 loss=4.051, nll_loss=2.42, ppl=5.35, wps=461707, ups=1.06, wpb=434482, bsz=16925, num_updates=29500, lr=0.00036823, gnorm=0.212, clip=0, loss_scale=2, train_wall=93, gb_free=19, wall=29316 epoch 018: 934 / 1689 loss=4.047, nll_loss=2.416, ppl=5.34, wps=460002, ups=1.06, wpb=434048, bsz=16303.4, num_updates=29600, lr=0.000367607, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=29410 epoch 018: 934 / 1689 loss=4.047, nll_loss=2.416, ppl=5.34, wps=460002, ups=1.06, wpb=434048, bsz=16303.4, num_updates=29600, lr=0.000367607, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=29410 epoch 018: 934 / 1689 loss=4.047, nll_loss=2.416, ppl=5.34, wps=460002, ups=1.06, wpb=434048, bsz=16303.4, num_updates=29600, lr=0.000367607, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=29410 epoch 018: 934 / 1689 loss=4.047, nll_loss=2.416, ppl=5.34, wps=460002, ups=1.06, wpb=434048, bsz=16303.4, num_updates=29600, lr=0.000367607, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=29410 epoch 018: 934 / 1689 loss=4.047, nll_loss=2.416, ppl=5.34, wps=460002, ups=1.06, wpb=434048, bsz=16303.4, num_updates=29600, lr=0.000367607, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=29410 epoch 018: 934 / 1689 loss=4.047, nll_loss=2.416, ppl=5.34, wps=460002, ups=1.06, wpb=434048, bsz=16303.4, num_updates=29600, lr=0.000367607, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=29410 epoch 018: 934 / 1689 loss=4.047, nll_loss=2.416, ppl=5.34, wps=460002, ups=1.06, wpb=434048, bsz=16303.4, num_updates=29600, lr=0.000367607, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=29410 epoch 018: 934 / 1689 loss=4.047, nll_loss=2.416, ppl=5.34, wps=460002, ups=1.06, wpb=434048, bsz=16303.4, num_updates=29600, lr=0.000367607, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=29410 epoch 018: 934 / 1689 loss=4.047, nll_loss=2.416, ppl=5.34, wps=460002, ups=1.06, wpb=434048, bsz=16303.4, num_updates=29600, lr=0.000367607, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=29410 epoch 018: 934 / 1689 loss=4.047, nll_loss=2.416, ppl=5.34, wps=460002, ups=1.06, wpb=434048, bsz=16303.4, num_updates=29600, lr=0.000367607, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=29410 epoch 018: 934 / 1689 loss=4.047, nll_loss=2.416, ppl=5.34, wps=460002, ups=1.06, wpb=434048, bsz=16303.4, num_updates=29600, lr=0.000367607, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=29410 epoch 018: 934 / 1689 loss=4.047, nll_loss=2.416, ppl=5.34, wps=460002, ups=1.06, wpb=434048, bsz=16303.4, num_updates=29600, lr=0.000367607, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=29410 epoch 018: 934 / 1689 loss=4.047, nll_loss=2.416, ppl=5.34, wps=460002, ups=1.06, wpb=434048, bsz=16303.4, num_updates=29600, lr=0.000367607, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=29410 epoch 018: 934 / 1689 loss=4.047, nll_loss=2.416, ppl=5.34, wps=460002, ups=1.06, wpb=434048, bsz=16303.4, num_updates=29600, lr=0.000367607, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=29410 epoch 018: 934 / 1689 loss=4.047, nll_loss=2.416, ppl=5.34, wps=460002, ups=1.06, wpb=434048, bsz=16303.4, num_updates=29600, lr=0.000367607, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=29410 epoch 018: 934 / 1689 loss=4.047, nll_loss=2.416, ppl=5.34, wps=460002, ups=1.06, wpb=434048, bsz=16303.4, num_updates=29600, lr=0.000367607, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=29410 epoch 018: 934 / 1689 loss=4.047, nll_loss=2.416, ppl=5.34, wps=460002, ups=1.06, wpb=434048, bsz=16303.4, num_updates=29600, lr=0.000367607, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=29410 epoch 018: 934 / 1689 loss=4.047, nll_loss=2.416, ppl=5.34, wps=460002, ups=1.06, wpb=434048, bsz=16303.4, num_updates=29600, lr=0.000367607, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.9, wall=29410 epoch 018: 1034 / 1689 loss=4.05, nll_loss=2.419, ppl=5.35, wps=463565, ups=1.07, wpb=434268, bsz=16275.4, num_updates=29700, lr=0.000366988, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=29504 epoch 018: 1034 / 1689 loss=4.05, nll_loss=2.419, ppl=5.35, wps=463565, ups=1.07, wpb=434268, bsz=16275.4, num_updates=29700, lr=0.000366988, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=29504 epoch 018: 1034 / 1689 loss=4.05, nll_loss=2.419, ppl=5.35, wps=463565, ups=1.07, wpb=434268, bsz=16275.4, num_updates=29700, lr=0.000366988, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=29504 epoch 018: 1034 / 1689 loss=4.05, nll_loss=2.419, ppl=5.35, wps=463565, ups=1.07, wpb=434268, bsz=16275.4, num_updates=29700, lr=0.000366988, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=29504 epoch 018: 1034 / 1689 loss=4.05, nll_loss=2.419, ppl=5.35, wps=463565, ups=1.07, wpb=434268, bsz=16275.4, num_updates=29700, lr=0.000366988, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=29504 epoch 018: 1034 / 1689 loss=4.05, nll_loss=2.419, ppl=5.35, wps=463565, ups=1.07, wpb=434268, bsz=16275.4, num_updates=29700, lr=0.000366988, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=29504 epoch 018: 1034 / 1689 loss=4.05, nll_loss=2.419, ppl=5.35, wps=463565, ups=1.07, wpb=434268, bsz=16275.4, num_updates=29700, lr=0.000366988, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=29504 epoch 018: 1034 / 1689 loss=4.05, nll_loss=2.419, ppl=5.35, wps=463565, ups=1.07, wpb=434268, bsz=16275.4, num_updates=29700, lr=0.000366988, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=29504 epoch 018: 1034 / 1689 loss=4.05, nll_loss=2.419, ppl=5.35, wps=463565, ups=1.07, wpb=434268, bsz=16275.4, num_updates=29700, lr=0.000366988, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=29504 epoch 018: 1034 / 1689 loss=4.05, nll_loss=2.419, ppl=5.35, wps=463565, ups=1.07, wpb=434268, bsz=16275.4, num_updates=29700, lr=0.000366988, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=29504 epoch 018: 1034 / 1689 loss=4.05, nll_loss=2.419, ppl=5.35, wps=463565, ups=1.07, wpb=434268, bsz=16275.4, num_updates=29700, lr=0.000366988, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=29504 epoch 018: 1034 / 1689 loss=4.05, nll_loss=2.419, ppl=5.35, wps=463565, ups=1.07, wpb=434268, bsz=16275.4, num_updates=29700, lr=0.000366988, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=29504 epoch 018: 1034 / 1689 loss=4.05, nll_loss=2.419, ppl=5.35, wps=463565, ups=1.07, wpb=434268, bsz=16275.4, num_updates=29700, lr=0.000366988, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=29504 epoch 018: 1034 / 1689 loss=4.05, nll_loss=2.419, ppl=5.35, wps=463565, ups=1.07, wpb=434268, bsz=16275.4, num_updates=29700, lr=0.000366988, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=29504 epoch 018: 1034 / 1689 loss=4.05, nll_loss=2.419, ppl=5.35, wps=463565, ups=1.07, wpb=434268, bsz=16275.4, num_updates=29700, lr=0.000366988, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=29504 epoch 018: 1034 / 1689 loss=4.05, nll_loss=2.419, ppl=5.35, wps=463565, ups=1.07, wpb=434268, bsz=16275.4, num_updates=29700, lr=0.000366988, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=29504 epoch 018: 1034 / 1689 loss=4.05, nll_loss=2.419, ppl=5.35, wps=463565, ups=1.07, wpb=434268, bsz=16275.4, num_updates=29700, lr=0.000366988, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=29504 epoch 018: 1034 / 1689 loss=4.05, nll_loss=2.419, ppl=5.35, wps=463565, ups=1.07, wpb=434268, bsz=16275.4, num_updates=29700, lr=0.000366988, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=29504 epoch 018: 1134 / 1689 loss=4.055, nll_loss=2.425, ppl=5.37, wps=463143, ups=1.06, wpb=435180, bsz=16451, num_updates=29800, lr=0.000366372, gnorm=0.21, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=29598 epoch 018: 1134 / 1689 loss=4.055, nll_loss=2.425, ppl=5.37, wps=463143, ups=1.06, wpb=435180, bsz=16451, num_updates=29800, lr=0.000366372, gnorm=0.21, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=29598 epoch 018: 1134 / 1689 loss=4.055, nll_loss=2.425, ppl=5.37, wps=463143, ups=1.06, wpb=435180, bsz=16451, num_updates=29800, lr=0.000366372, gnorm=0.21, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=29598 epoch 018: 1134 / 1689 loss=4.055, nll_loss=2.425, ppl=5.37, wps=463143, ups=1.06, wpb=435180, bsz=16451, num_updates=29800, lr=0.000366372, gnorm=0.21, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=29598 epoch 018: 1134 / 1689 loss=4.055, nll_loss=2.425, ppl=5.37, wps=463143, ups=1.06, wpb=435180, bsz=16451, num_updates=29800, lr=0.000366372, gnorm=0.21, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=29598 epoch 018: 1134 / 1689 loss=4.055, nll_loss=2.425, ppl=5.37, wps=463143, ups=1.06, wpb=435180, bsz=16451, num_updates=29800, lr=0.000366372, gnorm=0.21, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=29598 epoch 018: 1134 / 1689 loss=4.055, nll_loss=2.425, ppl=5.37, wps=463143, ups=1.06, wpb=435180, bsz=16451, num_updates=29800, lr=0.000366372, gnorm=0.21, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=29598 epoch 018: 1134 / 1689 loss=4.055, nll_loss=2.425, ppl=5.37, wps=463143, ups=1.06, wpb=435180, bsz=16451, num_updates=29800, lr=0.000366372, gnorm=0.21, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=29598 epoch 018: 1134 / 1689 loss=4.055, nll_loss=2.425, ppl=5.37, wps=463143, ups=1.06, wpb=435180, bsz=16451, num_updates=29800, lr=0.000366372, gnorm=0.21, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=29598 epoch 018: 1134 / 1689 loss=4.055, nll_loss=2.425, ppl=5.37, wps=463143, ups=1.06, wpb=435180, bsz=16451, num_updates=29800, lr=0.000366372, gnorm=0.21, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=29598 epoch 018: 1134 / 1689 loss=4.055, nll_loss=2.425, ppl=5.37, wps=463143, ups=1.06, wpb=435180, bsz=16451, num_updates=29800, lr=0.000366372, gnorm=0.21, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=29598 epoch 018: 1134 / 1689 loss=4.055, nll_loss=2.425, ppl=5.37, wps=463143, ups=1.06, wpb=435180, bsz=16451, num_updates=29800, lr=0.000366372, gnorm=0.21, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=29598 epoch 018: 1134 / 1689 loss=4.055, nll_loss=2.425, ppl=5.37, wps=463143, ups=1.06, wpb=435180, bsz=16451, num_updates=29800, lr=0.000366372, gnorm=0.21, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=29598 epoch 018: 1134 / 1689 loss=4.055, nll_loss=2.425, ppl=5.37, wps=463143, ups=1.06, wpb=435180, bsz=16451, num_updates=29800, lr=0.000366372, gnorm=0.21, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=29598 epoch 018: 1134 / 1689 loss=4.055, nll_loss=2.425, ppl=5.37, wps=463143, ups=1.06, wpb=435180, bsz=16451, num_updates=29800, lr=0.000366372, gnorm=0.21, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=29598 epoch 018: 1134 / 1689 loss=4.055, nll_loss=2.425, ppl=5.37, wps=463143, ups=1.06, wpb=435180, bsz=16451, num_updates=29800, lr=0.000366372, gnorm=0.21, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=29598 epoch 018: 1134 / 1689 loss=4.055, nll_loss=2.425, ppl=5.37, wps=463143, ups=1.06, wpb=435180, bsz=16451, num_updates=29800, lr=0.000366372, gnorm=0.21, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=29598 epoch 018: 1134 / 1689 loss=4.055, nll_loss=2.425, ppl=5.37, wps=463143, ups=1.06, wpb=435180, bsz=16451, num_updates=29800, lr=0.000366372, gnorm=0.21, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=29598 epoch 018: 1234 / 1689 loss=4.046, nll_loss=2.415, ppl=5.33, wps=458135, ups=1.06, wpb=432168, bsz=16638.2, num_updates=29900, lr=0.000365758, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=29692 epoch 018: 1234 / 1689 loss=4.046, nll_loss=2.415, ppl=5.33, wps=458135, ups=1.06, wpb=432168, bsz=16638.2, num_updates=29900, lr=0.000365758, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=29692 epoch 018: 1234 / 1689 loss=4.046, nll_loss=2.415, ppl=5.33, wps=458135, ups=1.06, wpb=432168, bsz=16638.2, num_updates=29900, lr=0.000365758, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=29692 epoch 018: 1234 / 1689 loss=4.046, nll_loss=2.415, ppl=5.33, wps=458135, ups=1.06, wpb=432168, bsz=16638.2, num_updates=29900, lr=0.000365758, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=29692 epoch 018: 1234 / 1689 loss=4.046, nll_loss=2.415, ppl=5.33, wps=458135, ups=1.06, wpb=432168, bsz=16638.2, num_updates=29900, lr=0.000365758, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=29692 epoch 018: 1234 / 1689 loss=4.046, nll_loss=2.415, ppl=5.33, wps=458135, ups=1.06, wpb=432168, bsz=16638.2, num_updates=29900, lr=0.000365758, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=29692 epoch 018: 1234 / 1689 loss=4.046, nll_loss=2.415, ppl=5.33, wps=458135, ups=1.06, wpb=432168, bsz=16638.2, num_updates=29900, lr=0.000365758, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=29692 epoch 018: 1234 / 1689 loss=4.046, nll_loss=2.415, ppl=5.33, wps=458135, ups=1.06, wpb=432168, bsz=16638.2, num_updates=29900, lr=0.000365758, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=29692 epoch 018: 1234 / 1689 loss=4.046, nll_loss=2.415, ppl=5.33, wps=458135, ups=1.06, wpb=432168, bsz=16638.2, num_updates=29900, lr=0.000365758, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=29692 epoch 018: 1234 / 1689 loss=4.046, nll_loss=2.415, ppl=5.33, wps=458135, ups=1.06, wpb=432168, bsz=16638.2, num_updates=29900, lr=0.000365758, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=29692 epoch 018: 1234 / 1689 loss=4.046, nll_loss=2.415, ppl=5.33, wps=458135, ups=1.06, wpb=432168, bsz=16638.2, num_updates=29900, lr=0.000365758, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=29692 epoch 018: 1234 / 1689 loss=4.046, nll_loss=2.415, ppl=5.33, wps=458135, ups=1.06, wpb=432168, bsz=16638.2, num_updates=29900, lr=0.000365758, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=29692 epoch 018: 1234 / 1689 loss=4.046, nll_loss=2.415, ppl=5.33, wps=458135, ups=1.06, wpb=432168, bsz=16638.2, num_updates=29900, lr=0.000365758, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=29692 epoch 018: 1234 / 1689 loss=4.046, nll_loss=2.415, ppl=5.33, wps=458135, ups=1.06, wpb=432168, bsz=16638.2, num_updates=29900, lr=0.000365758, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=29692 epoch 018: 1234 / 1689 loss=4.046, nll_loss=2.415, ppl=5.33, wps=458135, ups=1.06, wpb=432168, bsz=16638.2, num_updates=29900, lr=0.000365758, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=29692 epoch 018: 1234 / 1689 loss=4.046, nll_loss=2.415, ppl=5.33, wps=458135, ups=1.06, wpb=432168, bsz=16638.2, num_updates=29900, lr=0.000365758, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=29692 epoch 018: 1234 / 1689 loss=4.046, nll_loss=2.415, ppl=5.33, wps=458135, ups=1.06, wpb=432168, bsz=16638.2, num_updates=29900, lr=0.000365758, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=29692 epoch 018: 1234 / 1689 loss=4.046, nll_loss=2.415, ppl=5.33, wps=458135, ups=1.06, wpb=432168, bsz=16638.2, num_updates=29900, lr=0.000365758, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=29692 epoch 018: 1334 / 1689 loss=4.052, nll_loss=2.422, ppl=5.36, wps=462267, ups=1.06, wpb=435996, bsz=16319.5, num_updates=30000, lr=0.000365148, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=29786 epoch 018: 1334 / 1689 loss=4.052, nll_loss=2.422, ppl=5.36, wps=462267, ups=1.06, wpb=435996, bsz=16319.5, num_updates=30000, lr=0.000365148, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=29786 epoch 018: 1334 / 1689 loss=4.052, nll_loss=2.422, ppl=5.36, wps=462267, ups=1.06, wpb=435996, bsz=16319.5, num_updates=30000, lr=0.000365148, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=29786 epoch 018: 1334 / 1689 loss=4.052, nll_loss=2.422, ppl=5.36, wps=462267, ups=1.06, wpb=435996, bsz=16319.5, num_updates=30000, lr=0.000365148, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=29786 epoch 018: 1334 / 1689 loss=4.052, nll_loss=2.422, ppl=5.36, wps=462267, ups=1.06, wpb=435996, bsz=16319.5, num_updates=30000, lr=0.000365148, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=29786 epoch 018: 1334 / 1689 loss=4.052, nll_loss=2.422, ppl=5.36, wps=462267, ups=1.06, wpb=435996, bsz=16319.5, num_updates=30000, lr=0.000365148, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=29786 epoch 018: 1334 / 1689 loss=4.052, nll_loss=2.422, ppl=5.36, wps=462267, ups=1.06, wpb=435996, bsz=16319.5, num_updates=30000, lr=0.000365148, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=29786 epoch 018: 1334 / 1689 loss=4.052, nll_loss=2.422, ppl=5.36, wps=462267, ups=1.06, wpb=435996, bsz=16319.5, num_updates=30000, lr=0.000365148, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=29786 epoch 018: 1334 / 1689 loss=4.052, nll_loss=2.422, ppl=5.36, wps=462267, ups=1.06, wpb=435996, bsz=16319.5, num_updates=30000, lr=0.000365148, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=29786 epoch 018: 1334 / 1689 loss=4.052, nll_loss=2.422, ppl=5.36, wps=462267, ups=1.06, wpb=435996, bsz=16319.5, num_updates=30000, lr=0.000365148, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=29786 epoch 018: 1334 / 1689 loss=4.052, nll_loss=2.422, ppl=5.36, wps=462267, ups=1.06, wpb=435996, bsz=16319.5, num_updates=30000, lr=0.000365148, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=29786 epoch 018: 1334 / 1689 loss=4.052, nll_loss=2.422, ppl=5.36, wps=462267, ups=1.06, wpb=435996, bsz=16319.5, num_updates=30000, lr=0.000365148, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=29786 epoch 018: 1334 / 1689 loss=4.052, nll_loss=2.422, ppl=5.36, wps=462267, ups=1.06, wpb=435996, bsz=16319.5, num_updates=30000, lr=0.000365148, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=29786 epoch 018: 1334 / 1689 loss=4.052, nll_loss=2.422, ppl=5.36, wps=462267, ups=1.06, wpb=435996, bsz=16319.5, num_updates=30000, lr=0.000365148, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=29786 epoch 018: 1334 / 1689 loss=4.052, nll_loss=2.422, ppl=5.36, wps=462267, ups=1.06, wpb=435996, bsz=16319.5, num_updates=30000, lr=0.000365148, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=29786 epoch 018: 1334 / 1689 loss=4.052, nll_loss=2.422, ppl=5.36, wps=462267, ups=1.06, wpb=435996, bsz=16319.5, num_updates=30000, lr=0.000365148, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=29786 epoch 018: 1334 / 1689 loss=4.052, nll_loss=2.422, ppl=5.36, wps=462267, ups=1.06, wpb=435996, bsz=16319.5, num_updates=30000, lr=0.000365148, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=29786 epoch 018: 1334 / 1689 loss=4.052, nll_loss=2.422, ppl=5.36, wps=462267, ups=1.06, wpb=435996, bsz=16319.5, num_updates=30000, lr=0.000365148, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=19.2, wall=29786 begin validation on "valid" subset epoch 018 | valid on 'valid' subset | loss 4.259 | nll_loss 2.645 | ppl 6.25 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.259 epoch 018 | valid on 'valid' subset | loss 4.259 | nll_loss 2.645 | ppl 6.25 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.259 epoch 018 | valid on 'valid' subset | loss 4.259 | nll_loss 2.645 | ppl 6.25 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.259 epoch 018 | valid on 'valid' subset | loss 4.259 | nll_loss 2.645 | ppl 6.25 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.259 epoch 018 | valid on 'valid' subset | loss 4.259 | nll_loss 2.645 | ppl 6.25 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.259 epoch 018 | valid on 'valid' subset | loss 4.259 | nll_loss 2.645 | ppl 6.25 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.259 epoch 018 | valid on 'valid' subset | loss 4.259 | nll_loss 2.645 | ppl 6.25 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.259 epoch 018 | valid on 'valid' subset | loss 4.259 | nll_loss 2.645 | ppl 6.25 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.259 epoch 018 | valid on 'valid' subset | loss 4.259 | nll_loss 2.645 | ppl 6.25 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.259 epoch 018 | valid on 'valid' subset | loss 4.259 | nll_loss 2.645 | ppl 6.25 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.259 epoch 018 | valid on 'valid' subset | loss 4.259 | nll_loss 2.645 | ppl 6.25 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.259 epoch 018 | valid on 'valid' subset | loss 4.259 | nll_loss 2.645 | ppl 6.25 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.259 epoch 018 | valid on 'valid' subset | loss 4.259 | nll_loss 2.645 | ppl 6.25 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.259 epoch 018 | valid on 'valid' subset | loss 4.259 | nll_loss 2.645 | ppl 6.25 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.259 epoch 018 | valid on 'valid' subset | loss 4.259 | nll_loss 2.645 | ppl 6.25 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.259 epoch 018 | valid on 'valid' subset | loss 4.259 | nll_loss 2.645 | ppl 6.25 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.259 epoch 018 | valid on 'valid' subset | loss 4.259 | nll_loss 2.645 | ppl 6.25 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.259 epoch 018 | valid on 'valid' subset | loss 4.259 | nll_loss 2.645 | ppl 6.25 | wps 0 | wpb 42662 | bsz 2032 | num_updates 30000 | best_loss 4.259 epoch 018: 1435 / 1689 loss=4.055, nll_loss=2.424, ppl=5.37, wps=289232, ups=0.67, wpb=434357, bsz=16330.7, num_updates=30100, lr=0.000364541, gnorm=0.238, clip=0, loss_scale=1, train_wall=124, gb_free=18.9, wall=29937 epoch 018: 1435 / 1689 loss=4.055, nll_loss=2.424, ppl=5.37, wps=289232, ups=0.67, wpb=434357, bsz=16330.7, num_updates=30100, lr=0.000364541, gnorm=0.238, clip=0, loss_scale=1, train_wall=124, gb_free=18.9, wall=29937 epoch 018: 1435 / 1689 loss=4.055, nll_loss=2.424, ppl=5.37, wps=289232, ups=0.67, wpb=434357, bsz=16330.7, num_updates=30100, lr=0.000364541, gnorm=0.238, clip=0, loss_scale=1, train_wall=124, gb_free=18.9, wall=29937 epoch 018: 1435 / 1689 loss=4.055, nll_loss=2.424, ppl=5.37, wps=289232, ups=0.67, wpb=434357, bsz=16330.7, num_updates=30100, lr=0.000364541, gnorm=0.238, clip=0, loss_scale=1, train_wall=124, gb_free=18.9, wall=29937 epoch 018: 1435 / 1689 loss=4.055, nll_loss=2.424, ppl=5.37, wps=289232, ups=0.67, wpb=434357, bsz=16330.7, num_updates=30100, lr=0.000364541, gnorm=0.238, clip=0, loss_scale=1, train_wall=124, gb_free=18.9, wall=29937 epoch 018: 1435 / 1689 loss=4.055, nll_loss=2.424, ppl=5.37, wps=289232, ups=0.67, wpb=434357, bsz=16330.7, num_updates=30100, lr=0.000364541, gnorm=0.238, clip=0, loss_scale=1, train_wall=124, gb_free=18.9, wall=29937 epoch 018: 1435 / 1689 loss=4.055, nll_loss=2.424, ppl=5.37, wps=289232, ups=0.67, wpb=434357, bsz=16330.7, num_updates=30100, lr=0.000364541, gnorm=0.238, clip=0, loss_scale=1, train_wall=124, gb_free=18.9, wall=29937 epoch 018: 1435 / 1689 loss=4.055, nll_loss=2.424, ppl=5.37, wps=289232, ups=0.67, wpb=434357, bsz=16330.7, num_updates=30100, lr=0.000364541, gnorm=0.238, clip=0, loss_scale=1, train_wall=124, gb_free=18.9, wall=29937 epoch 018: 1435 / 1689 loss=4.055, nll_loss=2.424, ppl=5.37, wps=289232, ups=0.67, wpb=434357, bsz=16330.7, num_updates=30100, lr=0.000364541, gnorm=0.238, clip=0, loss_scale=1, train_wall=124, gb_free=18.9, wall=29937 epoch 018: 1435 / 1689 loss=4.055, nll_loss=2.424, ppl=5.37, wps=289232, ups=0.67, wpb=434357, bsz=16330.7, num_updates=30100, lr=0.000364541, gnorm=0.238, clip=0, loss_scale=1, train_wall=124, gb_free=18.9, wall=29937 epoch 018: 1435 / 1689 loss=4.055, nll_loss=2.424, ppl=5.37, wps=289232, ups=0.67, wpb=434357, bsz=16330.7, num_updates=30100, lr=0.000364541, gnorm=0.238, clip=0, loss_scale=1, train_wall=124, gb_free=18.9, wall=29937 epoch 018: 1435 / 1689 loss=4.055, nll_loss=2.424, ppl=5.37, wps=289232, ups=0.67, wpb=434357, bsz=16330.7, num_updates=30100, lr=0.000364541, gnorm=0.238, clip=0, loss_scale=1, train_wall=124, gb_free=18.9, wall=29937 epoch 018: 1435 / 1689 loss=4.055, nll_loss=2.424, ppl=5.37, wps=289232, ups=0.67, wpb=434357, bsz=16330.7, num_updates=30100, lr=0.000364541, gnorm=0.238, clip=0, loss_scale=1, train_wall=124, gb_free=18.9, wall=29937 epoch 018: 1435 / 1689 loss=4.055, nll_loss=2.424, ppl=5.37, wps=289232, ups=0.67, wpb=434357, bsz=16330.7, num_updates=30100, lr=0.000364541, gnorm=0.238, clip=0, loss_scale=1, train_wall=124, gb_free=18.9, wall=29937 epoch 018: 1435 / 1689 loss=4.055, nll_loss=2.424, ppl=5.37, wps=289232, ups=0.67, wpb=434357, bsz=16330.7, num_updates=30100, lr=0.000364541, gnorm=0.238, clip=0, loss_scale=1, train_wall=124, gb_free=18.9, wall=29937 epoch 018: 1435 / 1689 loss=4.055, nll_loss=2.424, ppl=5.37, wps=289232, ups=0.67, wpb=434357, bsz=16330.7, num_updates=30100, lr=0.000364541, gnorm=0.238, clip=0, loss_scale=1, train_wall=124, gb_free=18.9, wall=29937 epoch 018: 1435 / 1689 loss=4.055, nll_loss=2.424, ppl=5.37, wps=289232, ups=0.67, wpb=434357, bsz=16330.7, num_updates=30100, lr=0.000364541, gnorm=0.238, clip=0, loss_scale=1, train_wall=124, gb_free=18.9, wall=29937 epoch 018: 1435 / 1689 loss=4.055, nll_loss=2.424, ppl=5.37, wps=289232, ups=0.67, wpb=434357, bsz=16330.7, num_updates=30100, lr=0.000364541, gnorm=0.238, clip=0, loss_scale=1, train_wall=124, gb_free=18.9, wall=29937 epoch 018: 1535 / 1689 loss=4.042, nll_loss=2.41, ppl=5.32, wps=457245, ups=1.06, wpb=431254, bsz=16372.8, num_updates=30200, lr=0.000363937, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=30031 epoch 018: 1535 / 1689 loss=4.042, nll_loss=2.41, ppl=5.32, wps=457245, ups=1.06, wpb=431254, bsz=16372.8, num_updates=30200, lr=0.000363937, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=30031 epoch 018: 1535 / 1689 loss=4.042, nll_loss=2.41, ppl=5.32, wps=457245, ups=1.06, wpb=431254, bsz=16372.8, num_updates=30200, lr=0.000363937, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=30031 epoch 018: 1535 / 1689 loss=4.042, nll_loss=2.41, ppl=5.32, wps=457245, ups=1.06, wpb=431254, bsz=16372.8, num_updates=30200, lr=0.000363937, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=30031 epoch 018: 1535 / 1689 loss=4.042, nll_loss=2.41, ppl=5.32, wps=457245, ups=1.06, wpb=431254, bsz=16372.8, num_updates=30200, lr=0.000363937, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=30031 epoch 018: 1535 / 1689 loss=4.042, nll_loss=2.41, ppl=5.32, wps=457245, ups=1.06, wpb=431254, bsz=16372.8, num_updates=30200, lr=0.000363937, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=30031 epoch 018: 1535 / 1689 loss=4.042, nll_loss=2.41, ppl=5.32, wps=457245, ups=1.06, wpb=431254, bsz=16372.8, num_updates=30200, lr=0.000363937, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=30031 epoch 018: 1535 / 1689 loss=4.042, nll_loss=2.41, ppl=5.32, wps=457245, ups=1.06, wpb=431254, bsz=16372.8, num_updates=30200, lr=0.000363937, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=30031 epoch 018: 1535 / 1689 loss=4.042, nll_loss=2.41, ppl=5.32, wps=457245, ups=1.06, wpb=431254, bsz=16372.8, num_updates=30200, lr=0.000363937, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=30031 epoch 018: 1535 / 1689 loss=4.042, nll_loss=2.41, ppl=5.32, wps=457245, ups=1.06, wpb=431254, bsz=16372.8, num_updates=30200, lr=0.000363937, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=30031 epoch 018: 1535 / 1689 loss=4.042, nll_loss=2.41, ppl=5.32, wps=457245, ups=1.06, wpb=431254, bsz=16372.8, num_updates=30200, lr=0.000363937, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=30031 epoch 018: 1535 / 1689 loss=4.042, nll_loss=2.41, ppl=5.32, wps=457245, ups=1.06, wpb=431254, bsz=16372.8, num_updates=30200, lr=0.000363937, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=30031 epoch 018: 1535 / 1689 loss=4.042, nll_loss=2.41, ppl=5.32, wps=457245, ups=1.06, wpb=431254, bsz=16372.8, num_updates=30200, lr=0.000363937, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=30031 epoch 018: 1535 / 1689 loss=4.042, nll_loss=2.41, ppl=5.32, wps=457245, ups=1.06, wpb=431254, bsz=16372.8, num_updates=30200, lr=0.000363937, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=30031 epoch 018: 1535 / 1689 loss=4.042, nll_loss=2.41, ppl=5.32, wps=457245, ups=1.06, wpb=431254, bsz=16372.8, num_updates=30200, lr=0.000363937, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=30031 epoch 018: 1535 / 1689 loss=4.042, nll_loss=2.41, ppl=5.32, wps=457245, ups=1.06, wpb=431254, bsz=16372.8, num_updates=30200, lr=0.000363937, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=30031 epoch 018: 1535 / 1689 loss=4.042, nll_loss=2.41, ppl=5.32, wps=457245, ups=1.06, wpb=431254, bsz=16372.8, num_updates=30200, lr=0.000363937, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=30031 epoch 018: 1535 / 1689 loss=4.042, nll_loss=2.41, ppl=5.32, wps=457245, ups=1.06, wpb=431254, bsz=16372.8, num_updates=30200, lr=0.000363937, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=30031 epoch 018: 1635 / 1689 loss=4.045, nll_loss=2.414, ppl=5.33, wps=458641, ups=1.07, wpb=430350, bsz=16923.5, num_updates=30300, lr=0.000363336, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=30125 epoch 018: 1635 / 1689 loss=4.045, nll_loss=2.414, ppl=5.33, wps=458641, ups=1.07, wpb=430350, bsz=16923.5, num_updates=30300, lr=0.000363336, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=30125 epoch 018: 1635 / 1689 loss=4.045, nll_loss=2.414, ppl=5.33, wps=458641, ups=1.07, wpb=430350, bsz=16923.5, num_updates=30300, lr=0.000363336, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=30125 epoch 018: 1635 / 1689 loss=4.045, nll_loss=2.414, ppl=5.33, wps=458641, ups=1.07, wpb=430350, bsz=16923.5, num_updates=30300, lr=0.000363336, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=30125 epoch 018: 1635 / 1689 loss=4.045, nll_loss=2.414, ppl=5.33, wps=458641, ups=1.07, wpb=430350, bsz=16923.5, num_updates=30300, lr=0.000363336, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=30125 epoch 018: 1635 / 1689 loss=4.045, nll_loss=2.414, ppl=5.33, wps=458641, ups=1.07, wpb=430350, bsz=16923.5, num_updates=30300, lr=0.000363336, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=30125 epoch 018: 1635 / 1689 loss=4.045, nll_loss=2.414, ppl=5.33, wps=458641, ups=1.07, wpb=430350, bsz=16923.5, num_updates=30300, lr=0.000363336, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=30125 epoch 018: 1635 / 1689 loss=4.045, nll_loss=2.414, ppl=5.33, wps=458641, ups=1.07, wpb=430350, bsz=16923.5, num_updates=30300, lr=0.000363336, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=30125 epoch 018: 1635 / 1689 loss=4.045, nll_loss=2.414, ppl=5.33, wps=458641, ups=1.07, wpb=430350, bsz=16923.5, num_updates=30300, lr=0.000363336, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=30125 epoch 018: 1635 / 1689 loss=4.045, nll_loss=2.414, ppl=5.33, wps=458641, ups=1.07, wpb=430350, bsz=16923.5, num_updates=30300, lr=0.000363336, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=30125 epoch 018: 1635 / 1689 loss=4.045, nll_loss=2.414, ppl=5.33, wps=458641, ups=1.07, wpb=430350, bsz=16923.5, num_updates=30300, lr=0.000363336, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=30125 epoch 018: 1635 / 1689 loss=4.045, nll_loss=2.414, ppl=5.33, wps=458641, ups=1.07, wpb=430350, bsz=16923.5, num_updates=30300, lr=0.000363336, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=30125 epoch 018: 1635 / 1689 loss=4.045, nll_loss=2.414, ppl=5.33, wps=458641, ups=1.07, wpb=430350, bsz=16923.5, num_updates=30300, lr=0.000363336, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=30125 epoch 018: 1635 / 1689 loss=4.045, nll_loss=2.414, ppl=5.33, wps=458641, ups=1.07, wpb=430350, bsz=16923.5, num_updates=30300, lr=0.000363336, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=30125 epoch 018: 1635 / 1689 loss=4.045, nll_loss=2.414, ppl=5.33, wps=458641, ups=1.07, wpb=430350, bsz=16923.5, num_updates=30300, lr=0.000363336, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=30125 epoch 018: 1635 / 1689 loss=4.045, nll_loss=2.414, ppl=5.33, wps=458641, ups=1.07, wpb=430350, bsz=16923.5, num_updates=30300, lr=0.000363336, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=30125 epoch 018: 1635 / 1689 loss=4.045, nll_loss=2.414, ppl=5.33, wps=458641, ups=1.07, wpb=430350, bsz=16923.5, num_updates=30300, lr=0.000363336, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=30125 epoch 018: 1635 / 1689 loss=4.045, nll_loss=2.414, ppl=5.33, wps=458641, ups=1.07, wpb=430350, bsz=16923.5, num_updates=30300, lr=0.000363336, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=30125 end of epoch 18 (average epoch stats below) epoch 018 | loss 4.042 | nll_loss 2.41 | ppl 5.31 | wps 377534 | ups 0.87 | wpb 433543 | bsz 16506.2 | num_updates 30354 | lr 0.000363013 | gnorm 0.212 | clip 0 | loss_scale 1 | train_wall 1826 | gb_free 21.2 | wall 30175 epoch 018 | loss 4.042 | nll_loss 2.41 | ppl 5.31 | wps 377534 | ups 0.87 | wpb 433543 | bsz 16506.2 | num_updates 30354 | lr 0.000363013 | gnorm 0.212 | clip 0 | loss_scale 1 | train_wall 1826 | gb_free 21.2 | wall 30175 epoch 018 | loss 4.042 | nll_loss 2.41 | ppl 5.31 | wps 377534 | ups 0.87 | wpb 433543 | bsz 16506.2 | num_updates 30354 | lr 0.000363013 | gnorm 0.212 | clip 0 | loss_scale 1 | train_wall 1826 | gb_free 21.2 | wall 30175 epoch 018 | loss 4.042 | nll_loss 2.41 | ppl 5.31 | wps 377534 | ups 0.87 | wpb 433543 | bsz 16506.2 | num_updates 30354 | lr 0.000363013 | gnorm 0.212 | clip 0 | loss_scale 1 | train_wall 1826 | gb_free 21.2 | wall 30175 epoch 018 | loss 4.042 | nll_loss 2.41 | ppl 5.31 | wps 377534 | ups 0.87 | wpb 433543 | bsz 16506.2 | num_updates 30354 | lr 0.000363013 | gnorm 0.212 | clip 0 | loss_scale 1 | train_wall 1826 | gb_free 21.2 | wall 30175 epoch 018 | loss 4.042 | nll_loss 2.41 | ppl 5.31 | wps 377534 | ups 0.87 | wpb 433543 | bsz 16506.2 | num_updates 30354 | lr 0.000363013 | gnorm 0.212 | clip 0 | loss_scale 1 | train_wall 1826 | gb_free 21.2 | wall 30175 epoch 018 | loss 4.042 | nll_loss 2.41 | ppl 5.31 | wps 377534 | ups 0.87 | wpb 433543 | bsz 16506.2 | num_updates 30354 | lr 0.000363013 | gnorm 0.212 | clip 0 | loss_scale 1 | train_wall 1826 | gb_free 21.2 | wall 30175 epoch 018 | loss 4.042 | nll_loss 2.41 | ppl 5.31 | wps 377534 | ups 0.87 | wpb 433543 | bsz 16506.2 | num_updates 30354 | lr 0.000363013 | gnorm 0.212 | clip 0 | loss_scale 1 | train_wall 1826 | gb_free 21.2 | wall 30175 epoch 018 | loss 4.042 | nll_loss 2.41 | ppl 5.31 | wps 377534 | ups 0.87 | wpb 433543 | bsz 16506.2 | num_updates 30354 | lr 0.000363013 | gnorm 0.212 | clip 0 | loss_scale 1 | train_wall 1826 | gb_free 21.2 | wall 30175 epoch 018 | loss 4.042 | nll_loss 2.41 | ppl 5.31 | wps 377534 | ups 0.87 | wpb 433543 | bsz 16506.2 | num_updates 30354 | lr 0.000363013 | gnorm 0.212 | clip 0 | loss_scale 1 | train_wall 1826 | gb_free 21.2 | wall 30175 epoch 018 | loss 4.042 | nll_loss 2.41 | ppl 5.31 | wps 377534 | ups 0.87 | wpb 433543 | bsz 16506.2 | num_updates 30354 | lr 0.000363013 | gnorm 0.212 | clip 0 | loss_scale 1 | train_wall 1826 | gb_free 21.2 | wall 30175 epoch 018 | loss 4.042 | nll_loss 2.41 | ppl 5.31 | wps 377534 | ups 0.87 | wpb 433543 | bsz 16506.2 | num_updates 30354 | lr 0.000363013 | gnorm 0.212 | clip 0 | loss_scale 1 | train_wall 1826 | gb_free 21.2 | wall 30175 epoch 018 | loss 4.042 | nll_loss 2.41 | ppl 5.31 | wps 377534 | ups 0.87 | wpb 433543 | bsz 16506.2 | num_updates 30354 | lr 0.000363013 | gnorm 0.212 | clip 0 | loss_scale 1 | train_wall 1826 | gb_free 21.2 | wall 30175 epoch 018 | loss 4.042 | nll_loss 2.41 | ppl 5.31 | wps 377534 | ups 0.87 | wpb 433543 | bsz 16506.2 | num_updates 30354 | lr 0.000363013 | gnorm 0.212 | clip 0 | loss_scale 1 | train_wall 1826 | gb_free 21.2 | wall 30175 epoch 018 | loss 4.042 | nll_loss 2.41 | ppl 5.31 | wps 377534 | ups 0.87 | wpb 433543 | bsz 16506.2 | num_updates 30354 | lr 0.000363013 | gnorm 0.212 | clip 0 | loss_scale 1 | train_wall 1826 | gb_free 21.2 | wall 30175 epoch 018 | loss 4.042 | nll_loss 2.41 | ppl 5.31 | wps 377534 | ups 0.87 | wpb 433543 | bsz 16506.2 | num_updates 30354 | lr 0.000363013 | gnorm 0.212 | clip 0 | loss_scale 1 | train_wall 1826 | gb_free 21.2 | wall 30175 epoch 018 | loss 4.042 | nll_loss 2.41 | ppl 5.31 | wps 377534 | ups 0.87 | wpb 433543 | bsz 16506.2 | num_updates 30354 | lr 0.000363013 | gnorm 0.212 | clip 0 | loss_scale 1 | train_wall 1826 | gb_free 21.2 | wall 30175 epoch 018 | loss 4.042 | nll_loss 2.41 | ppl 5.31 | wps 377534 | ups 0.87 | wpb 433543 | bsz 16506.2 | num_updates 30354 | lr 0.000363013 | gnorm 0.212 | clip 0 | loss_scale 1 | train_wall 1826 | gb_free 21.2 | wall 30175 Start iterating over samples epoch 019: 46 / 1689 loss=4.04, nll_loss=2.407, ppl=5.31, wps=457002, ups=1.06, wpb=431081, bsz=16596.2, num_updates=30400, lr=0.000362738, gnorm=0.205, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=30219 epoch 019: 46 / 1689 loss=4.04, nll_loss=2.407, ppl=5.31, wps=457002, ups=1.06, wpb=431081, bsz=16596.2, num_updates=30400, lr=0.000362738, gnorm=0.205, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=30219 epoch 019: 46 / 1689 loss=4.04, nll_loss=2.407, ppl=5.31, wps=457002, ups=1.06, wpb=431081, bsz=16596.2, num_updates=30400, lr=0.000362738, gnorm=0.205, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=30219 epoch 019: 46 / 1689 loss=4.04, nll_loss=2.407, ppl=5.31, wps=457002, ups=1.06, wpb=431081, bsz=16596.2, num_updates=30400, lr=0.000362738, gnorm=0.205, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=30219 epoch 019: 46 / 1689 loss=4.04, nll_loss=2.407, ppl=5.31, wps=457002, ups=1.06, wpb=431081, bsz=16596.2, num_updates=30400, lr=0.000362738, gnorm=0.205, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=30219 epoch 019: 46 / 1689 loss=4.04, nll_loss=2.407, ppl=5.31, wps=457002, ups=1.06, wpb=431081, bsz=16596.2, num_updates=30400, lr=0.000362738, gnorm=0.205, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=30219 epoch 019: 46 / 1689 loss=4.04, nll_loss=2.407, ppl=5.31, wps=457002, ups=1.06, wpb=431081, bsz=16596.2, num_updates=30400, lr=0.000362738, gnorm=0.205, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=30219 epoch 019: 46 / 1689 loss=4.04, nll_loss=2.407, ppl=5.31, wps=457002, ups=1.06, wpb=431081, bsz=16596.2, num_updates=30400, lr=0.000362738, gnorm=0.205, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=30219 epoch 019: 46 / 1689 loss=4.04, nll_loss=2.407, ppl=5.31, wps=457002, ups=1.06, wpb=431081, bsz=16596.2, num_updates=30400, lr=0.000362738, gnorm=0.205, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=30219 epoch 019: 46 / 1689 loss=4.04, nll_loss=2.407, ppl=5.31, wps=457002, ups=1.06, wpb=431081, bsz=16596.2, num_updates=30400, lr=0.000362738, gnorm=0.205, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=30219 epoch 019: 46 / 1689 loss=4.04, nll_loss=2.407, ppl=5.31, wps=457002, ups=1.06, wpb=431081, bsz=16596.2, num_updates=30400, lr=0.000362738, gnorm=0.205, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=30219 epoch 019: 46 / 1689 loss=4.04, nll_loss=2.407, ppl=5.31, wps=457002, ups=1.06, wpb=431081, bsz=16596.2, num_updates=30400, lr=0.000362738, gnorm=0.205, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=30219 epoch 019: 46 / 1689 loss=4.04, nll_loss=2.407, ppl=5.31, wps=457002, ups=1.06, wpb=431081, bsz=16596.2, num_updates=30400, lr=0.000362738, gnorm=0.205, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=30219 epoch 019: 46 / 1689 loss=4.04, nll_loss=2.407, ppl=5.31, wps=457002, ups=1.06, wpb=431081, bsz=16596.2, num_updates=30400, lr=0.000362738, gnorm=0.205, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=30219 epoch 019: 46 / 1689 loss=4.04, nll_loss=2.407, ppl=5.31, wps=457002, ups=1.06, wpb=431081, bsz=16596.2, num_updates=30400, lr=0.000362738, gnorm=0.205, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=30219 epoch 019: 46 / 1689 loss=4.04, nll_loss=2.407, ppl=5.31, wps=457002, ups=1.06, wpb=431081, bsz=16596.2, num_updates=30400, lr=0.000362738, gnorm=0.205, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=30219 epoch 019: 46 / 1689 loss=4.04, nll_loss=2.407, ppl=5.31, wps=457002, ups=1.06, wpb=431081, bsz=16596.2, num_updates=30400, lr=0.000362738, gnorm=0.205, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=30219 epoch 019: 46 / 1689 loss=4.04, nll_loss=2.407, ppl=5.31, wps=457002, ups=1.06, wpb=431081, bsz=16596.2, num_updates=30400, lr=0.000362738, gnorm=0.205, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=30219 epoch 019: 46 / 1689 loss=4.04, nll_loss=2.407, ppl=5.31, wps=457002, ups=1.06, wpb=431081, bsz=16596.2, num_updates=30400, lr=0.000362738, gnorm=0.205, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=30219 epoch 019: 146 / 1689 loss=3.996, nll_loss=2.358, ppl=5.13, wps=455394, ups=1.05, wpb=432478, bsz=17014.6, num_updates=30500, lr=0.000362143, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=30314 epoch 019: 146 / 1689 loss=3.996, nll_loss=2.358, ppl=5.13, wps=455394, ups=1.05, wpb=432478, bsz=17014.6, num_updates=30500, lr=0.000362143, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=30314 epoch 019: 146 / 1689 loss=3.996, nll_loss=2.358, ppl=5.13, wps=455394, ups=1.05, wpb=432478, bsz=17014.6, num_updates=30500, lr=0.000362143, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=30314 epoch 019: 146 / 1689 loss=3.996, nll_loss=2.358, ppl=5.13, wps=455394, ups=1.05, wpb=432478, bsz=17014.6, num_updates=30500, lr=0.000362143, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=30314 epoch 019: 146 / 1689 loss=3.996, nll_loss=2.358, ppl=5.13, wps=455394, ups=1.05, wpb=432478, bsz=17014.6, num_updates=30500, lr=0.000362143, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=30314 epoch 019: 146 / 1689 loss=3.996, nll_loss=2.358, ppl=5.13, wps=455394, ups=1.05, wpb=432478, bsz=17014.6, num_updates=30500, lr=0.000362143, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=30314 epoch 019: 146 / 1689 loss=3.996, nll_loss=2.358, ppl=5.13, wps=455394, ups=1.05, wpb=432478, bsz=17014.6, num_updates=30500, lr=0.000362143, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=30314 epoch 019: 146 / 1689 loss=3.996, nll_loss=2.358, ppl=5.13, wps=455394, ups=1.05, wpb=432478, bsz=17014.6, num_updates=30500, lr=0.000362143, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=30314 epoch 019: 146 / 1689 loss=3.996, nll_loss=2.358, ppl=5.13, wps=455394, ups=1.05, wpb=432478, bsz=17014.6, num_updates=30500, lr=0.000362143, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=30314 epoch 019: 146 / 1689 loss=3.996, nll_loss=2.358, ppl=5.13, wps=455394, ups=1.05, wpb=432478, bsz=17014.6, num_updates=30500, lr=0.000362143, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=30314 epoch 019: 146 / 1689 loss=3.996, nll_loss=2.358, ppl=5.13, wps=455394, ups=1.05, wpb=432478, bsz=17014.6, num_updates=30500, lr=0.000362143, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=30314 epoch 019: 146 / 1689 loss=3.996, nll_loss=2.358, ppl=5.13, wps=455394, ups=1.05, wpb=432478, bsz=17014.6, num_updates=30500, lr=0.000362143, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=30314 epoch 019: 146 / 1689 loss=3.996, nll_loss=2.358, ppl=5.13, wps=455394, ups=1.05, wpb=432478, bsz=17014.6, num_updates=30500, lr=0.000362143, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=30314 epoch 019: 146 / 1689 loss=3.996, nll_loss=2.358, ppl=5.13, wps=455394, ups=1.05, wpb=432478, bsz=17014.6, num_updates=30500, lr=0.000362143, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=30314 epoch 019: 146 / 1689 loss=3.996, nll_loss=2.358, ppl=5.13, wps=455394, ups=1.05, wpb=432478, bsz=17014.6, num_updates=30500, lr=0.000362143, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=30314 epoch 019: 146 / 1689 loss=3.996, nll_loss=2.358, ppl=5.13, wps=455394, ups=1.05, wpb=432478, bsz=17014.6, num_updates=30500, lr=0.000362143, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=30314 epoch 019: 146 / 1689 loss=3.996, nll_loss=2.358, ppl=5.13, wps=455394, ups=1.05, wpb=432478, bsz=17014.6, num_updates=30500, lr=0.000362143, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=30314 epoch 019: 146 / 1689 loss=3.996, nll_loss=2.358, ppl=5.13, wps=455394, ups=1.05, wpb=432478, bsz=17014.6, num_updates=30500, lr=0.000362143, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=30314 epoch 019: 146 / 1689 loss=3.996, nll_loss=2.358, ppl=5.13, wps=455394, ups=1.05, wpb=432478, bsz=17014.6, num_updates=30500, lr=0.000362143, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=30314 epoch 019: 247 / 1689 loss=4.01, nll_loss=2.373, ppl=5.18, wps=455647, ups=1.06, wpb=431826, bsz=16422.7, num_updates=30600, lr=0.000361551, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=30409 epoch 019: 247 / 1689 loss=4.01, nll_loss=2.373, ppl=5.18, wps=455647, ups=1.06, wpb=431826, bsz=16422.7, num_updates=30600, lr=0.000361551, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=30409 epoch 019: 247 / 1689 loss=4.01, nll_loss=2.373, ppl=5.18, wps=455647, ups=1.06, wpb=431826, bsz=16422.7, num_updates=30600, lr=0.000361551, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=30409 epoch 019: 247 / 1689 loss=4.01, nll_loss=2.373, ppl=5.18, wps=455647, ups=1.06, wpb=431826, bsz=16422.7, num_updates=30600, lr=0.000361551, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=30409 epoch 019: 247 / 1689 loss=4.01, nll_loss=2.373, ppl=5.18, wps=455647, ups=1.06, wpb=431826, bsz=16422.7, num_updates=30600, lr=0.000361551, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=30409 epoch 019: 247 / 1689 loss=4.01, nll_loss=2.373, ppl=5.18, wps=455647, ups=1.06, wpb=431826, bsz=16422.7, num_updates=30600, lr=0.000361551, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=30409 epoch 019: 247 / 1689 loss=4.01, nll_loss=2.373, ppl=5.18, wps=455647, ups=1.06, wpb=431826, bsz=16422.7, num_updates=30600, lr=0.000361551, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=30409 epoch 019: 247 / 1689 loss=4.01, nll_loss=2.373, ppl=5.18, wps=455647, ups=1.06, wpb=431826, bsz=16422.7, num_updates=30600, lr=0.000361551, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=30409 epoch 019: 247 / 1689 loss=4.01, nll_loss=2.373, ppl=5.18, wps=455647, ups=1.06, wpb=431826, bsz=16422.7, num_updates=30600, lr=0.000361551, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=30409 epoch 019: 247 / 1689 loss=4.01, nll_loss=2.373, ppl=5.18, wps=455647, ups=1.06, wpb=431826, bsz=16422.7, num_updates=30600, lr=0.000361551, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=30409 epoch 019: 247 / 1689 loss=4.01, nll_loss=2.373, ppl=5.18, wps=455647, ups=1.06, wpb=431826, bsz=16422.7, num_updates=30600, lr=0.000361551, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=30409 epoch 019: 247 / 1689 loss=4.01, nll_loss=2.373, ppl=5.18, wps=455647, ups=1.06, wpb=431826, bsz=16422.7, num_updates=30600, lr=0.000361551, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=30409 epoch 019: 247 / 1689 loss=4.01, nll_loss=2.373, ppl=5.18, wps=455647, ups=1.06, wpb=431826, bsz=16422.7, num_updates=30600, lr=0.000361551, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=30409 epoch 019: 247 / 1689 loss=4.01, nll_loss=2.373, ppl=5.18, wps=455647, ups=1.06, wpb=431826, bsz=16422.7, num_updates=30600, lr=0.000361551, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=30409 epoch 019: 247 / 1689 loss=4.01, nll_loss=2.373, ppl=5.18, wps=455647, ups=1.06, wpb=431826, bsz=16422.7, num_updates=30600, lr=0.000361551, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=30409 epoch 019: 247 / 1689 loss=4.01, nll_loss=2.373, ppl=5.18, wps=455647, ups=1.06, wpb=431826, bsz=16422.7, num_updates=30600, lr=0.000361551, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=30409 epoch 019: 247 / 1689 loss=4.01, nll_loss=2.373, ppl=5.18, wps=455647, ups=1.06, wpb=431826, bsz=16422.7, num_updates=30600, lr=0.000361551, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=30409 epoch 019: 247 / 1689 loss=4.01, nll_loss=2.373, ppl=5.18, wps=455647, ups=1.06, wpb=431826, bsz=16422.7, num_updates=30600, lr=0.000361551, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=30409 epoch 019: 247 / 1689 loss=4.01, nll_loss=2.373, ppl=5.18, wps=455647, ups=1.06, wpb=431826, bsz=16422.7, num_updates=30600, lr=0.000361551, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=30409 epoch 019: 347 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=457375, ups=1.06, wpb=433285, bsz=16636, num_updates=30700, lr=0.000360961, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=30504 epoch 019: 347 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=457375, ups=1.06, wpb=433285, bsz=16636, num_updates=30700, lr=0.000360961, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=30504 epoch 019: 347 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=457375, ups=1.06, wpb=433285, bsz=16636, num_updates=30700, lr=0.000360961, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=30504 epoch 019: 347 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=457375, ups=1.06, wpb=433285, bsz=16636, num_updates=30700, lr=0.000360961, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=30504 epoch 019: 347 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=457375, ups=1.06, wpb=433285, bsz=16636, num_updates=30700, lr=0.000360961, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=30504 epoch 019: 347 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=457375, ups=1.06, wpb=433285, bsz=16636, num_updates=30700, lr=0.000360961, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=30504 epoch 019: 347 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=457375, ups=1.06, wpb=433285, bsz=16636, num_updates=30700, lr=0.000360961, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=30504 epoch 019: 347 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=457375, ups=1.06, wpb=433285, bsz=16636, num_updates=30700, lr=0.000360961, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=30504 epoch 019: 347 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=457375, ups=1.06, wpb=433285, bsz=16636, num_updates=30700, lr=0.000360961, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=30504 epoch 019: 347 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=457375, ups=1.06, wpb=433285, bsz=16636, num_updates=30700, lr=0.000360961, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=30504 epoch 019: 347 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=457375, ups=1.06, wpb=433285, bsz=16636, num_updates=30700, lr=0.000360961, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=30504 epoch 019: 347 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=457375, ups=1.06, wpb=433285, bsz=16636, num_updates=30700, lr=0.000360961, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=30504 epoch 019: 347 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=457375, ups=1.06, wpb=433285, bsz=16636, num_updates=30700, lr=0.000360961, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=30504 epoch 019: 347 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=457375, ups=1.06, wpb=433285, bsz=16636, num_updates=30700, lr=0.000360961, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=30504 epoch 019: 347 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=457375, ups=1.06, wpb=433285, bsz=16636, num_updates=30700, lr=0.000360961, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=30504 epoch 019: 347 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=457375, ups=1.06, wpb=433285, bsz=16636, num_updates=30700, lr=0.000360961, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=30504 epoch 019: 347 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=457375, ups=1.06, wpb=433285, bsz=16636, num_updates=30700, lr=0.000360961, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=30504 epoch 019: 347 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=457375, ups=1.06, wpb=433285, bsz=16636, num_updates=30700, lr=0.000360961, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=30504 epoch 019: 347 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=457375, ups=1.06, wpb=433285, bsz=16636, num_updates=30700, lr=0.000360961, gnorm=0.223, clip=0, loss_scale=1, train_wall=93, gb_free=20.1, wall=30504 epoch 019: 447 / 1689 loss=4.019, nll_loss=2.383, ppl=5.22, wps=458208, ups=1.06, wpb=434189, bsz=16232.3, num_updates=30800, lr=0.000360375, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=30598 epoch 019: 447 / 1689 loss=4.019, nll_loss=2.383, ppl=5.22, wps=458208, ups=1.06, wpb=434189, bsz=16232.3, num_updates=30800, lr=0.000360375, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=30598 epoch 019: 447 / 1689 loss=4.019, nll_loss=2.383, ppl=5.22, wps=458208, ups=1.06, wpb=434189, bsz=16232.3, num_updates=30800, lr=0.000360375, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=30598 epoch 019: 447 / 1689 loss=4.019, nll_loss=2.383, ppl=5.22, wps=458208, ups=1.06, wpb=434189, bsz=16232.3, num_updates=30800, lr=0.000360375, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=30598 epoch 019: 447 / 1689 loss=4.019, nll_loss=2.383, ppl=5.22, wps=458208, ups=1.06, wpb=434189, bsz=16232.3, num_updates=30800, lr=0.000360375, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=30598 epoch 019: 447 / 1689 loss=4.019, nll_loss=2.383, ppl=5.22, wps=458208, ups=1.06, wpb=434189, bsz=16232.3, num_updates=30800, lr=0.000360375, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=30598 epoch 019: 447 / 1689 loss=4.019, nll_loss=2.383, ppl=5.22, wps=458208, ups=1.06, wpb=434189, bsz=16232.3, num_updates=30800, lr=0.000360375, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=30598 epoch 019: 447 / 1689 loss=4.019, nll_loss=2.383, ppl=5.22, wps=458208, ups=1.06, wpb=434189, bsz=16232.3, num_updates=30800, lr=0.000360375, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=30598 epoch 019: 447 / 1689 loss=4.019, nll_loss=2.383, ppl=5.22, wps=458208, ups=1.06, wpb=434189, bsz=16232.3, num_updates=30800, lr=0.000360375, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=30598 epoch 019: 447 / 1689 loss=4.019, nll_loss=2.383, ppl=5.22, wps=458208, ups=1.06, wpb=434189, bsz=16232.3, num_updates=30800, lr=0.000360375, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=30598 epoch 019: 447 / 1689 loss=4.019, nll_loss=2.383, ppl=5.22, wps=458208, ups=1.06, wpb=434189, bsz=16232.3, num_updates=30800, lr=0.000360375, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=30598 epoch 019: 447 / 1689 loss=4.019, nll_loss=2.383, ppl=5.22, wps=458208, ups=1.06, wpb=434189, bsz=16232.3, num_updates=30800, lr=0.000360375, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=30598 epoch 019: 447 / 1689 loss=4.019, nll_loss=2.383, ppl=5.22, wps=458208, ups=1.06, wpb=434189, bsz=16232.3, num_updates=30800, lr=0.000360375, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=30598 epoch 019: 447 / 1689 loss=4.019, nll_loss=2.383, ppl=5.22, wps=458208, ups=1.06, wpb=434189, bsz=16232.3, num_updates=30800, lr=0.000360375, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=30598 epoch 019: 447 / 1689 loss=4.019, nll_loss=2.383, ppl=5.22, wps=458208, ups=1.06, wpb=434189, bsz=16232.3, num_updates=30800, lr=0.000360375, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=30598 epoch 019: 447 / 1689 loss=4.019, nll_loss=2.383, ppl=5.22, wps=458208, ups=1.06, wpb=434189, bsz=16232.3, num_updates=30800, lr=0.000360375, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=30598 epoch 019: 447 / 1689 loss=4.019, nll_loss=2.383, ppl=5.22, wps=458208, ups=1.06, wpb=434189, bsz=16232.3, num_updates=30800, lr=0.000360375, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=30598 epoch 019: 447 / 1689 loss=4.019, nll_loss=2.383, ppl=5.22, wps=458208, ups=1.06, wpb=434189, bsz=16232.3, num_updates=30800, lr=0.000360375, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=30598 epoch 019: 447 / 1689 loss=4.019, nll_loss=2.383, ppl=5.22, wps=458208, ups=1.06, wpb=434189, bsz=16232.3, num_updates=30800, lr=0.000360375, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=20.4, wall=30598 epoch 019: 547 / 1689 loss=4.045, nll_loss=2.413, ppl=5.32, wps=460415, ups=1.06, wpb=434238, bsz=16543.5, num_updates=30900, lr=0.000359791, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=21, wall=30693 epoch 019: 547 / 1689 loss=4.045, nll_loss=2.413, ppl=5.32, wps=460415, ups=1.06, wpb=434238, bsz=16543.5, num_updates=30900, lr=0.000359791, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=21, wall=30693 epoch 019: 547 / 1689 loss=4.045, nll_loss=2.413, ppl=5.32, wps=460415, ups=1.06, wpb=434238, bsz=16543.5, num_updates=30900, lr=0.000359791, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=21, wall=30693 epoch 019: 547 / 1689 loss=4.045, nll_loss=2.413, ppl=5.32, wps=460415, ups=1.06, wpb=434238, bsz=16543.5, num_updates=30900, lr=0.000359791, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=21, wall=30693 epoch 019: 547 / 1689 loss=4.045, nll_loss=2.413, ppl=5.32, wps=460415, ups=1.06, wpb=434238, bsz=16543.5, num_updates=30900, lr=0.000359791, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=21, wall=30693 epoch 019: 547 / 1689 loss=4.045, nll_loss=2.413, ppl=5.32, wps=460415, ups=1.06, wpb=434238, bsz=16543.5, num_updates=30900, lr=0.000359791, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=21, wall=30693 epoch 019: 547 / 1689 loss=4.045, nll_loss=2.413, ppl=5.32, wps=460415, ups=1.06, wpb=434238, bsz=16543.5, num_updates=30900, lr=0.000359791, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=21, wall=30693 epoch 019: 547 / 1689 loss=4.045, nll_loss=2.413, ppl=5.32, wps=460415, ups=1.06, wpb=434238, bsz=16543.5, num_updates=30900, lr=0.000359791, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=21, wall=30693 epoch 019: 547 / 1689 loss=4.045, nll_loss=2.413, ppl=5.32, wps=460415, ups=1.06, wpb=434238, bsz=16543.5, num_updates=30900, lr=0.000359791, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=21, wall=30693 epoch 019: 547 / 1689 loss=4.045, nll_loss=2.413, ppl=5.32, wps=460415, ups=1.06, wpb=434238, bsz=16543.5, num_updates=30900, lr=0.000359791, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=21, wall=30693 epoch 019: 547 / 1689 loss=4.045, nll_loss=2.413, ppl=5.32, wps=460415, ups=1.06, wpb=434238, bsz=16543.5, num_updates=30900, lr=0.000359791, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=21, wall=30693 epoch 019: 547 / 1689 loss=4.045, nll_loss=2.413, ppl=5.32, wps=460415, ups=1.06, wpb=434238, bsz=16543.5, num_updates=30900, lr=0.000359791, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=21, wall=30693 epoch 019: 547 / 1689 loss=4.045, nll_loss=2.413, ppl=5.32, wps=460415, ups=1.06, wpb=434238, bsz=16543.5, num_updates=30900, lr=0.000359791, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=21, wall=30693 epoch 019: 547 / 1689 loss=4.045, nll_loss=2.413, ppl=5.32, wps=460415, ups=1.06, wpb=434238, bsz=16543.5, num_updates=30900, lr=0.000359791, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=21, wall=30693 epoch 019: 547 / 1689 loss=4.045, nll_loss=2.413, ppl=5.32, wps=460415, ups=1.06, wpb=434238, bsz=16543.5, num_updates=30900, lr=0.000359791, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=21, wall=30693 epoch 019: 547 / 1689 loss=4.045, nll_loss=2.413, ppl=5.32, wps=460415, ups=1.06, wpb=434238, bsz=16543.5, num_updates=30900, lr=0.000359791, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=21, wall=30693 epoch 019: 547 / 1689 loss=4.045, nll_loss=2.413, ppl=5.32, wps=460415, ups=1.06, wpb=434238, bsz=16543.5, num_updates=30900, lr=0.000359791, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=21, wall=30693 epoch 019: 547 / 1689 loss=4.045, nll_loss=2.413, ppl=5.32, wps=460415, ups=1.06, wpb=434238, bsz=16543.5, num_updates=30900, lr=0.000359791, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=21, wall=30693 epoch 019: 547 / 1689 loss=4.045, nll_loss=2.413, ppl=5.32, wps=460415, ups=1.06, wpb=434238, bsz=16543.5, num_updates=30900, lr=0.000359791, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=21, wall=30693 epoch 019: 647 / 1689 loss=4.034, nll_loss=2.4, ppl=5.28, wps=455523, ups=1.05, wpb=433274, bsz=16896.8, num_updates=31000, lr=0.000359211, gnorm=0.206, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=30788 epoch 019: 647 / 1689 loss=4.034, nll_loss=2.4, ppl=5.28, wps=455523, ups=1.05, wpb=433274, bsz=16896.8, num_updates=31000, lr=0.000359211, gnorm=0.206, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=30788 epoch 019: 647 / 1689 loss=4.034, nll_loss=2.4, ppl=5.28, wps=455523, ups=1.05, wpb=433274, bsz=16896.8, num_updates=31000, lr=0.000359211, gnorm=0.206, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=30788 epoch 019: 647 / 1689 loss=4.034, nll_loss=2.4, ppl=5.28, wps=455523, ups=1.05, wpb=433274, bsz=16896.8, num_updates=31000, lr=0.000359211, gnorm=0.206, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=30788 epoch 019: 647 / 1689 loss=4.034, nll_loss=2.4, ppl=5.28, wps=455523, ups=1.05, wpb=433274, bsz=16896.8, num_updates=31000, lr=0.000359211, gnorm=0.206, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=30788 epoch 019: 647 / 1689 loss=4.034, nll_loss=2.4, ppl=5.28, wps=455523, ups=1.05, wpb=433274, bsz=16896.8, num_updates=31000, lr=0.000359211, gnorm=0.206, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=30788 epoch 019: 647 / 1689 loss=4.034, nll_loss=2.4, ppl=5.28, wps=455523, ups=1.05, wpb=433274, bsz=16896.8, num_updates=31000, lr=0.000359211, gnorm=0.206, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=30788 epoch 019: 647 / 1689 loss=4.034, nll_loss=2.4, ppl=5.28, wps=455523, ups=1.05, wpb=433274, bsz=16896.8, num_updates=31000, lr=0.000359211, gnorm=0.206, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=30788 epoch 019: 647 / 1689 loss=4.034, nll_loss=2.4, ppl=5.28, wps=455523, ups=1.05, wpb=433274, bsz=16896.8, num_updates=31000, lr=0.000359211, gnorm=0.206, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=30788 epoch 019: 647 / 1689 loss=4.034, nll_loss=2.4, ppl=5.28, wps=455523, ups=1.05, wpb=433274, bsz=16896.8, num_updates=31000, lr=0.000359211, gnorm=0.206, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=30788 epoch 019: 647 / 1689 loss=4.034, nll_loss=2.4, ppl=5.28, wps=455523, ups=1.05, wpb=433274, bsz=16896.8, num_updates=31000, lr=0.000359211, gnorm=0.206, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=30788 epoch 019: 647 / 1689 loss=4.034, nll_loss=2.4, ppl=5.28, wps=455523, ups=1.05, wpb=433274, bsz=16896.8, num_updates=31000, lr=0.000359211, gnorm=0.206, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=30788 epoch 019: 647 / 1689 loss=4.034, nll_loss=2.4, ppl=5.28, wps=455523, ups=1.05, wpb=433274, bsz=16896.8, num_updates=31000, lr=0.000359211, gnorm=0.206, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=30788 epoch 019: 647 / 1689 loss=4.034, nll_loss=2.4, ppl=5.28, wps=455523, ups=1.05, wpb=433274, bsz=16896.8, num_updates=31000, lr=0.000359211, gnorm=0.206, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=30788 epoch 019: 647 / 1689 loss=4.034, nll_loss=2.4, ppl=5.28, wps=455523, ups=1.05, wpb=433274, bsz=16896.8, num_updates=31000, lr=0.000359211, gnorm=0.206, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=30788 epoch 019: 647 / 1689 loss=4.034, nll_loss=2.4, ppl=5.28, wps=455523, ups=1.05, wpb=433274, bsz=16896.8, num_updates=31000, lr=0.000359211, gnorm=0.206, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=30788 epoch 019: 647 / 1689 loss=4.034, nll_loss=2.4, ppl=5.28, wps=455523, ups=1.05, wpb=433274, bsz=16896.8, num_updates=31000, lr=0.000359211, gnorm=0.206, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=30788 epoch 019: 647 / 1689 loss=4.034, nll_loss=2.4, ppl=5.28, wps=455523, ups=1.05, wpb=433274, bsz=16896.8, num_updates=31000, lr=0.000359211, gnorm=0.206, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=30788 epoch 019: 647 / 1689 loss=4.034, nll_loss=2.4, ppl=5.28, wps=455523, ups=1.05, wpb=433274, bsz=16896.8, num_updates=31000, lr=0.000359211, gnorm=0.206, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=30788 begin validation on "valid" subset epoch 019 | valid on 'valid' subset | loss 4.272 | nll_loss 2.648 | ppl 6.27 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.259 epoch 019 | valid on 'valid' subset | loss 4.272 | nll_loss 2.648 | ppl 6.27 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.259 epoch 019 | valid on 'valid' subset | loss 4.272 | nll_loss 2.648 | ppl 6.27 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.259 epoch 019 | valid on 'valid' subset | loss 4.272 | nll_loss 2.648 | ppl 6.27 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.259 epoch 019 | valid on 'valid' subset | loss 4.272 | nll_loss 2.648 | ppl 6.27 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.259 epoch 019 | valid on 'valid' subset | loss 4.272 | nll_loss 2.648 | ppl 6.27 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.259 epoch 019 | valid on 'valid' subset | loss 4.272 | nll_loss 2.648 | ppl 6.27 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.259 epoch 019 | valid on 'valid' subset | loss 4.272 | nll_loss 2.648 | ppl 6.27 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.259 epoch 019 | valid on 'valid' subset | loss 4.272 | nll_loss 2.648 | ppl 6.27 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.259 epoch 019 | valid on 'valid' subset | loss 4.272 | nll_loss 2.648 | ppl 6.27 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.259 epoch 019 | valid on 'valid' subset | loss 4.272 | nll_loss 2.648 | ppl 6.27 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.259 epoch 019 | valid on 'valid' subset | loss 4.272 | nll_loss 2.648 | ppl 6.27 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.259 epoch 019 | valid on 'valid' subset | loss 4.272 | nll_loss 2.648 | ppl 6.27 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.259 epoch 019 | valid on 'valid' subset | loss 4.272 | nll_loss 2.648 | ppl 6.27 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.259 epoch 019 | valid on 'valid' subset | loss 4.272 | nll_loss 2.648 | ppl 6.27 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.259 epoch 019 | valid on 'valid' subset | loss 4.272 | nll_loss 2.648 | ppl 6.27 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.259 epoch 019 | valid on 'valid' subset | loss 4.272 | nll_loss 2.648 | ppl 6.27 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.259 epoch 019 | valid on 'valid' subset | loss 4.272 | nll_loss 2.648 | ppl 6.27 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.259 epoch 019 | valid on 'valid' subset | loss 4.272 | nll_loss 2.648 | ppl 6.27 | wps 0 | wpb 42662 | bsz 2032 | num_updates 31000 | best_loss 4.259 epoch 019: 747 / 1689 loss=4.032, nll_loss=2.399, ppl=5.27, wps=398672, ups=0.92, wpb=432329, bsz=16761.6, num_updates=31100, lr=0.000358633, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=30896 epoch 019: 747 / 1689 loss=4.032, nll_loss=2.399, ppl=5.27, wps=398672, ups=0.92, wpb=432329, bsz=16761.6, num_updates=31100, lr=0.000358633, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=30896 epoch 019: 747 / 1689 loss=4.032, nll_loss=2.399, ppl=5.27, wps=398672, ups=0.92, wpb=432329, bsz=16761.6, num_updates=31100, lr=0.000358633, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=30896 epoch 019: 747 / 1689 loss=4.032, nll_loss=2.399, ppl=5.27, wps=398672, ups=0.92, wpb=432329, bsz=16761.6, num_updates=31100, lr=0.000358633, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=30896 epoch 019: 747 / 1689 loss=4.032, nll_loss=2.399, ppl=5.27, wps=398672, ups=0.92, wpb=432329, bsz=16761.6, num_updates=31100, lr=0.000358633, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=30896 epoch 019: 747 / 1689 loss=4.032, nll_loss=2.399, ppl=5.27, wps=398672, ups=0.92, wpb=432329, bsz=16761.6, num_updates=31100, lr=0.000358633, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=30896 epoch 019: 747 / 1689 loss=4.032, nll_loss=2.399, ppl=5.27, wps=398672, ups=0.92, wpb=432329, bsz=16761.6, num_updates=31100, lr=0.000358633, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=30896 epoch 019: 747 / 1689 loss=4.032, nll_loss=2.399, ppl=5.27, wps=398672, ups=0.92, wpb=432329, bsz=16761.6, num_updates=31100, lr=0.000358633, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=30896 epoch 019: 747 / 1689 loss=4.032, nll_loss=2.399, ppl=5.27, wps=398672, ups=0.92, wpb=432329, bsz=16761.6, num_updates=31100, lr=0.000358633, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=30896 epoch 019: 747 / 1689 loss=4.032, nll_loss=2.399, ppl=5.27, wps=398672, ups=0.92, wpb=432329, bsz=16761.6, num_updates=31100, lr=0.000358633, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=30896 epoch 019: 747 / 1689 loss=4.032, nll_loss=2.399, ppl=5.27, wps=398672, ups=0.92, wpb=432329, bsz=16761.6, num_updates=31100, lr=0.000358633, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=30896 epoch 019: 747 / 1689 loss=4.032, nll_loss=2.399, ppl=5.27, wps=398672, ups=0.92, wpb=432329, bsz=16761.6, num_updates=31100, lr=0.000358633, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=30896 epoch 019: 747 / 1689 loss=4.032, nll_loss=2.399, ppl=5.27, wps=398672, ups=0.92, wpb=432329, bsz=16761.6, num_updates=31100, lr=0.000358633, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=30896 epoch 019: 747 / 1689 loss=4.032, nll_loss=2.399, ppl=5.27, wps=398672, ups=0.92, wpb=432329, bsz=16761.6, num_updates=31100, lr=0.000358633, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=30896 epoch 019: 747 / 1689 loss=4.032, nll_loss=2.399, ppl=5.27, wps=398672, ups=0.92, wpb=432329, bsz=16761.6, num_updates=31100, lr=0.000358633, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=30896 epoch 019: 747 / 1689 loss=4.032, nll_loss=2.399, ppl=5.27, wps=398672, ups=0.92, wpb=432329, bsz=16761.6, num_updates=31100, lr=0.000358633, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=30896 epoch 019: 747 / 1689 loss=4.032, nll_loss=2.399, ppl=5.27, wps=398672, ups=0.92, wpb=432329, bsz=16761.6, num_updates=31100, lr=0.000358633, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=30896 epoch 019: 747 / 1689 loss=4.032, nll_loss=2.399, ppl=5.27, wps=398672, ups=0.92, wpb=432329, bsz=16761.6, num_updates=31100, lr=0.000358633, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=30896 epoch 019: 747 / 1689 loss=4.032, nll_loss=2.399, ppl=5.27, wps=398672, ups=0.92, wpb=432329, bsz=16761.6, num_updates=31100, lr=0.000358633, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=19, wall=30896 epoch 019: 848 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=459286, ups=1.06, wpb=435327, bsz=16358.4, num_updates=31200, lr=0.000358057, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=30991 epoch 019: 848 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=459286, ups=1.06, wpb=435327, bsz=16358.4, num_updates=31200, lr=0.000358057, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=30991 epoch 019: 848 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=459286, ups=1.06, wpb=435327, bsz=16358.4, num_updates=31200, lr=0.000358057, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=30991 epoch 019: 848 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=459286, ups=1.06, wpb=435327, bsz=16358.4, num_updates=31200, lr=0.000358057, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=30991 epoch 019: 848 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=459286, ups=1.06, wpb=435327, bsz=16358.4, num_updates=31200, lr=0.000358057, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=30991 epoch 019: 848 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=459286, ups=1.06, wpb=435327, bsz=16358.4, num_updates=31200, lr=0.000358057, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=30991 epoch 019: 848 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=459286, ups=1.06, wpb=435327, bsz=16358.4, num_updates=31200, lr=0.000358057, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=30991 epoch 019: 848 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=459286, ups=1.06, wpb=435327, bsz=16358.4, num_updates=31200, lr=0.000358057, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=30991 epoch 019: 848 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=459286, ups=1.06, wpb=435327, bsz=16358.4, num_updates=31200, lr=0.000358057, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=30991 epoch 019: 848 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=459286, ups=1.06, wpb=435327, bsz=16358.4, num_updates=31200, lr=0.000358057, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=30991 epoch 019: 848 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=459286, ups=1.06, wpb=435327, bsz=16358.4, num_updates=31200, lr=0.000358057, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=30991 epoch 019: 848 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=459286, ups=1.06, wpb=435327, bsz=16358.4, num_updates=31200, lr=0.000358057, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=30991 epoch 019: 848 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=459286, ups=1.06, wpb=435327, bsz=16358.4, num_updates=31200, lr=0.000358057, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=30991 epoch 019: 848 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=459286, ups=1.06, wpb=435327, bsz=16358.4, num_updates=31200, lr=0.000358057, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=30991 epoch 019: 848 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=459286, ups=1.06, wpb=435327, bsz=16358.4, num_updates=31200, lr=0.000358057, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=30991 epoch 019: 848 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=459286, ups=1.06, wpb=435327, bsz=16358.4, num_updates=31200, lr=0.000358057, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=30991 epoch 019: 848 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=459286, ups=1.06, wpb=435327, bsz=16358.4, num_updates=31200, lr=0.000358057, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=30991 epoch 019: 848 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=459286, ups=1.06, wpb=435327, bsz=16358.4, num_updates=31200, lr=0.000358057, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=30991 epoch 019: 848 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=459286, ups=1.06, wpb=435327, bsz=16358.4, num_updates=31200, lr=0.000358057, gnorm=0.2, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=30991 epoch 019: 948 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=466115, ups=1.07, wpb=434143, bsz=16322.1, num_updates=31300, lr=0.000357485, gnorm=0.203, clip=0, loss_scale=1, train_wall=91, gb_free=19.5, wall=31084 epoch 019: 948 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=466115, ups=1.07, wpb=434143, bsz=16322.1, num_updates=31300, lr=0.000357485, gnorm=0.203, clip=0, loss_scale=1, train_wall=91, gb_free=19.5, wall=31084 epoch 019: 948 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=466115, ups=1.07, wpb=434143, bsz=16322.1, num_updates=31300, lr=0.000357485, gnorm=0.203, clip=0, loss_scale=1, train_wall=91, gb_free=19.5, wall=31084 epoch 019: 948 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=466115, ups=1.07, wpb=434143, bsz=16322.1, num_updates=31300, lr=0.000357485, gnorm=0.203, clip=0, loss_scale=1, train_wall=91, gb_free=19.5, wall=31084 epoch 019: 948 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=466115, ups=1.07, wpb=434143, bsz=16322.1, num_updates=31300, lr=0.000357485, gnorm=0.203, clip=0, loss_scale=1, train_wall=91, gb_free=19.5, wall=31084 epoch 019: 948 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=466115, ups=1.07, wpb=434143, bsz=16322.1, num_updates=31300, lr=0.000357485, gnorm=0.203, clip=0, loss_scale=1, train_wall=91, gb_free=19.5, wall=31084 epoch 019: 948 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=466115, ups=1.07, wpb=434143, bsz=16322.1, num_updates=31300, lr=0.000357485, gnorm=0.203, clip=0, loss_scale=1, train_wall=91, gb_free=19.5, wall=31084 epoch 019: 948 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=466115, ups=1.07, wpb=434143, bsz=16322.1, num_updates=31300, lr=0.000357485, gnorm=0.203, clip=0, loss_scale=1, train_wall=91, gb_free=19.5, wall=31084 epoch 019: 948 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=466115, ups=1.07, wpb=434143, bsz=16322.1, num_updates=31300, lr=0.000357485, gnorm=0.203, clip=0, loss_scale=1, train_wall=91, gb_free=19.5, wall=31084 epoch 019: 948 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=466115, ups=1.07, wpb=434143, bsz=16322.1, num_updates=31300, lr=0.000357485, gnorm=0.203, clip=0, loss_scale=1, train_wall=91, gb_free=19.5, wall=31084 epoch 019: 948 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=466115, ups=1.07, wpb=434143, bsz=16322.1, num_updates=31300, lr=0.000357485, gnorm=0.203, clip=0, loss_scale=1, train_wall=91, gb_free=19.5, wall=31084 epoch 019: 948 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=466115, ups=1.07, wpb=434143, bsz=16322.1, num_updates=31300, lr=0.000357485, gnorm=0.203, clip=0, loss_scale=1, train_wall=91, gb_free=19.5, wall=31084 epoch 019: 948 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=466115, ups=1.07, wpb=434143, bsz=16322.1, num_updates=31300, lr=0.000357485, gnorm=0.203, clip=0, loss_scale=1, train_wall=91, gb_free=19.5, wall=31084 epoch 019: 948 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=466115, ups=1.07, wpb=434143, bsz=16322.1, num_updates=31300, lr=0.000357485, gnorm=0.203, clip=0, loss_scale=1, train_wall=91, gb_free=19.5, wall=31084 epoch 019: 948 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=466115, ups=1.07, wpb=434143, bsz=16322.1, num_updates=31300, lr=0.000357485, gnorm=0.203, clip=0, loss_scale=1, train_wall=91, gb_free=19.5, wall=31084 epoch 019: 948 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=466115, ups=1.07, wpb=434143, bsz=16322.1, num_updates=31300, lr=0.000357485, gnorm=0.203, clip=0, loss_scale=1, train_wall=91, gb_free=19.5, wall=31084 epoch 019: 948 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=466115, ups=1.07, wpb=434143, bsz=16322.1, num_updates=31300, lr=0.000357485, gnorm=0.203, clip=0, loss_scale=1, train_wall=91, gb_free=19.5, wall=31084 epoch 019: 948 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=466115, ups=1.07, wpb=434143, bsz=16322.1, num_updates=31300, lr=0.000357485, gnorm=0.203, clip=0, loss_scale=1, train_wall=91, gb_free=19.5, wall=31084 epoch 019: 948 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=466115, ups=1.07, wpb=434143, bsz=16322.1, num_updates=31300, lr=0.000357485, gnorm=0.203, clip=0, loss_scale=1, train_wall=91, gb_free=19.5, wall=31084 epoch 019: 1048 / 1689 loss=4.032, nll_loss=2.399, ppl=5.27, wps=455339, ups=1.05, wpb=432368, bsz=16275.8, num_updates=31400, lr=0.000356915, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=31179 epoch 019: 1048 / 1689 loss=4.032, nll_loss=2.399, ppl=5.27, wps=455339, ups=1.05, wpb=432368, bsz=16275.8, num_updates=31400, lr=0.000356915, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=31179 epoch 019: 1048 / 1689 loss=4.032, nll_loss=2.399, ppl=5.27, wps=455339, ups=1.05, wpb=432368, bsz=16275.8, num_updates=31400, lr=0.000356915, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=31179 epoch 019: 1048 / 1689 loss=4.032, nll_loss=2.399, ppl=5.27, wps=455339, ups=1.05, wpb=432368, bsz=16275.8, num_updates=31400, lr=0.000356915, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=31179 epoch 019: 1048 / 1689 loss=4.032, nll_loss=2.399, ppl=5.27, wps=455339, ups=1.05, wpb=432368, bsz=16275.8, num_updates=31400, lr=0.000356915, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=31179 epoch 019: 1048 / 1689 loss=4.032, nll_loss=2.399, ppl=5.27, wps=455339, ups=1.05, wpb=432368, bsz=16275.8, num_updates=31400, lr=0.000356915, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=31179 epoch 019: 1048 / 1689 loss=4.032, nll_loss=2.399, ppl=5.27, wps=455339, ups=1.05, wpb=432368, bsz=16275.8, num_updates=31400, lr=0.000356915, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=31179 epoch 019: 1048 / 1689 loss=4.032, nll_loss=2.399, ppl=5.27, wps=455339, ups=1.05, wpb=432368, bsz=16275.8, num_updates=31400, lr=0.000356915, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=31179 epoch 019: 1048 / 1689 loss=4.032, nll_loss=2.399, ppl=5.27, wps=455339, ups=1.05, wpb=432368, bsz=16275.8, num_updates=31400, lr=0.000356915, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=31179 epoch 019: 1048 / 1689 loss=4.032, nll_loss=2.399, ppl=5.27, wps=455339, ups=1.05, wpb=432368, bsz=16275.8, num_updates=31400, lr=0.000356915, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=31179 epoch 019: 1048 / 1689 loss=4.032, nll_loss=2.399, ppl=5.27, wps=455339, ups=1.05, wpb=432368, bsz=16275.8, num_updates=31400, lr=0.000356915, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=31179 epoch 019: 1048 / 1689 loss=4.032, nll_loss=2.399, ppl=5.27, wps=455339, ups=1.05, wpb=432368, bsz=16275.8, num_updates=31400, lr=0.000356915, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=31179 epoch 019: 1048 / 1689 loss=4.032, nll_loss=2.399, ppl=5.27, wps=455339, ups=1.05, wpb=432368, bsz=16275.8, num_updates=31400, lr=0.000356915, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=31179 epoch 019: 1048 / 1689 loss=4.032, nll_loss=2.399, ppl=5.27, wps=455339, ups=1.05, wpb=432368, bsz=16275.8, num_updates=31400, lr=0.000356915, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=31179 epoch 019: 1048 / 1689 loss=4.032, nll_loss=2.399, ppl=5.27, wps=455339, ups=1.05, wpb=432368, bsz=16275.8, num_updates=31400, lr=0.000356915, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=31179 epoch 019: 1048 / 1689 loss=4.032, nll_loss=2.399, ppl=5.27, wps=455339, ups=1.05, wpb=432368, bsz=16275.8, num_updates=31400, lr=0.000356915, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=31179 epoch 019: 1048 / 1689 loss=4.032, nll_loss=2.399, ppl=5.27, wps=455339, ups=1.05, wpb=432368, bsz=16275.8, num_updates=31400, lr=0.000356915, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=31179 epoch 019: 1048 / 1689 loss=4.032, nll_loss=2.399, ppl=5.27, wps=455339, ups=1.05, wpb=432368, bsz=16275.8, num_updates=31400, lr=0.000356915, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=31179 epoch 019: 1048 / 1689 loss=4.032, nll_loss=2.399, ppl=5.27, wps=455339, ups=1.05, wpb=432368, bsz=16275.8, num_updates=31400, lr=0.000356915, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=31179 epoch 019: 1148 / 1689 loss=4.043, nll_loss=2.412, ppl=5.32, wps=462094, ups=1.06, wpb=435566, bsz=16368.5, num_updates=31500, lr=0.000356348, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=16.2, wall=31273 epoch 019: 1148 / 1689 loss=4.043, nll_loss=2.412, ppl=5.32, wps=462094, ups=1.06, wpb=435566, bsz=16368.5, num_updates=31500, lr=0.000356348, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=16.2, wall=31273 epoch 019: 1148 / 1689 loss=4.043, nll_loss=2.412, ppl=5.32, wps=462094, ups=1.06, wpb=435566, bsz=16368.5, num_updates=31500, lr=0.000356348, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=16.2, wall=31273 epoch 019: 1148 / 1689 loss=4.043, nll_loss=2.412, ppl=5.32, wps=462094, ups=1.06, wpb=435566, bsz=16368.5, num_updates=31500, lr=0.000356348, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=16.2, wall=31273 epoch 019: 1148 / 1689 loss=4.043, nll_loss=2.412, ppl=5.32, wps=462094, ups=1.06, wpb=435566, bsz=16368.5, num_updates=31500, lr=0.000356348, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=16.2, wall=31273 epoch 019: 1148 / 1689 loss=4.043, nll_loss=2.412, ppl=5.32, wps=462094, ups=1.06, wpb=435566, bsz=16368.5, num_updates=31500, lr=0.000356348, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=16.2, wall=31273 epoch 019: 1148 / 1689 loss=4.043, nll_loss=2.412, ppl=5.32, wps=462094, ups=1.06, wpb=435566, bsz=16368.5, num_updates=31500, lr=0.000356348, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=16.2, wall=31273 epoch 019: 1148 / 1689 loss=4.043, nll_loss=2.412, ppl=5.32, wps=462094, ups=1.06, wpb=435566, bsz=16368.5, num_updates=31500, lr=0.000356348, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=16.2, wall=31273 epoch 019: 1148 / 1689 loss=4.043, nll_loss=2.412, ppl=5.32, wps=462094, ups=1.06, wpb=435566, bsz=16368.5, num_updates=31500, lr=0.000356348, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=16.2, wall=31273 epoch 019: 1148 / 1689 loss=4.043, nll_loss=2.412, ppl=5.32, wps=462094, ups=1.06, wpb=435566, bsz=16368.5, num_updates=31500, lr=0.000356348, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=16.2, wall=31273 epoch 019: 1148 / 1689 loss=4.043, nll_loss=2.412, ppl=5.32, wps=462094, ups=1.06, wpb=435566, bsz=16368.5, num_updates=31500, lr=0.000356348, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=16.2, wall=31273 epoch 019: 1148 / 1689 loss=4.043, nll_loss=2.412, ppl=5.32, wps=462094, ups=1.06, wpb=435566, bsz=16368.5, num_updates=31500, lr=0.000356348, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=16.2, wall=31273 epoch 019: 1148 / 1689 loss=4.043, nll_loss=2.412, ppl=5.32, wps=462094, ups=1.06, wpb=435566, bsz=16368.5, num_updates=31500, lr=0.000356348, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=16.2, wall=31273 epoch 019: 1148 / 1689 loss=4.043, nll_loss=2.412, ppl=5.32, wps=462094, ups=1.06, wpb=435566, bsz=16368.5, num_updates=31500, lr=0.000356348, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=16.2, wall=31273 epoch 019: 1148 / 1689 loss=4.043, nll_loss=2.412, ppl=5.32, wps=462094, ups=1.06, wpb=435566, bsz=16368.5, num_updates=31500, lr=0.000356348, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=16.2, wall=31273 epoch 019: 1148 / 1689 loss=4.043, nll_loss=2.412, ppl=5.32, wps=462094, ups=1.06, wpb=435566, bsz=16368.5, num_updates=31500, lr=0.000356348, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=16.2, wall=31273 epoch 019: 1148 / 1689 loss=4.043, nll_loss=2.412, ppl=5.32, wps=462094, ups=1.06, wpb=435566, bsz=16368.5, num_updates=31500, lr=0.000356348, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=16.2, wall=31273 epoch 019: 1148 / 1689 loss=4.043, nll_loss=2.412, ppl=5.32, wps=462094, ups=1.06, wpb=435566, bsz=16368.5, num_updates=31500, lr=0.000356348, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=16.2, wall=31273 epoch 019: 1148 / 1689 loss=4.043, nll_loss=2.412, ppl=5.32, wps=462094, ups=1.06, wpb=435566, bsz=16368.5, num_updates=31500, lr=0.000356348, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=16.2, wall=31273 epoch 019: 1248 / 1689 loss=4.049, nll_loss=2.418, ppl=5.35, wps=459770, ups=1.06, wpb=434390, bsz=16747, num_updates=31600, lr=0.000355784, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=31368 epoch 019: 1248 / 1689 loss=4.049, nll_loss=2.418, ppl=5.35, wps=459770, ups=1.06, wpb=434390, bsz=16747, num_updates=31600, lr=0.000355784, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=31368 epoch 019: 1248 / 1689 loss=4.049, nll_loss=2.418, ppl=5.35, wps=459770, ups=1.06, wpb=434390, bsz=16747, num_updates=31600, lr=0.000355784, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=31368 epoch 019: 1248 / 1689 loss=4.049, nll_loss=2.418, ppl=5.35, wps=459770, ups=1.06, wpb=434390, bsz=16747, num_updates=31600, lr=0.000355784, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=31368 epoch 019: 1248 / 1689 loss=4.049, nll_loss=2.418, ppl=5.35, wps=459770, ups=1.06, wpb=434390, bsz=16747, num_updates=31600, lr=0.000355784, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=31368 epoch 019: 1248 / 1689 loss=4.049, nll_loss=2.418, ppl=5.35, wps=459770, ups=1.06, wpb=434390, bsz=16747, num_updates=31600, lr=0.000355784, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=31368 epoch 019: 1248 / 1689 loss=4.049, nll_loss=2.418, ppl=5.35, wps=459770, ups=1.06, wpb=434390, bsz=16747, num_updates=31600, lr=0.000355784, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=31368 epoch 019: 1248 / 1689 loss=4.049, nll_loss=2.418, ppl=5.35, wps=459770, ups=1.06, wpb=434390, bsz=16747, num_updates=31600, lr=0.000355784, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=31368 epoch 019: 1248 / 1689 loss=4.049, nll_loss=2.418, ppl=5.35, wps=459770, ups=1.06, wpb=434390, bsz=16747, num_updates=31600, lr=0.000355784, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=31368 epoch 019: 1248 / 1689 loss=4.049, nll_loss=2.418, ppl=5.35, wps=459770, ups=1.06, wpb=434390, bsz=16747, num_updates=31600, lr=0.000355784, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=31368 epoch 019: 1248 / 1689 loss=4.049, nll_loss=2.418, ppl=5.35, wps=459770, ups=1.06, wpb=434390, bsz=16747, num_updates=31600, lr=0.000355784, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=31368 epoch 019: 1248 / 1689 loss=4.049, nll_loss=2.418, ppl=5.35, wps=459770, ups=1.06, wpb=434390, bsz=16747, num_updates=31600, lr=0.000355784, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=31368 epoch 019: 1248 / 1689 loss=4.049, nll_loss=2.418, ppl=5.35, wps=459770, ups=1.06, wpb=434390, bsz=16747, num_updates=31600, lr=0.000355784, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=31368 epoch 019: 1248 / 1689 loss=4.049, nll_loss=2.418, ppl=5.35, wps=459770, ups=1.06, wpb=434390, bsz=16747, num_updates=31600, lr=0.000355784, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=31368 epoch 019: 1248 / 1689 loss=4.049, nll_loss=2.418, ppl=5.35, wps=459770, ups=1.06, wpb=434390, bsz=16747, num_updates=31600, lr=0.000355784, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=31368 epoch 019: 1248 / 1689 loss=4.049, nll_loss=2.418, ppl=5.35, wps=459770, ups=1.06, wpb=434390, bsz=16747, num_updates=31600, lr=0.000355784, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=31368 epoch 019: 1248 / 1689 loss=4.049, nll_loss=2.418, ppl=5.35, wps=459770, ups=1.06, wpb=434390, bsz=16747, num_updates=31600, lr=0.000355784, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=31368 epoch 019: 1248 / 1689 loss=4.049, nll_loss=2.418, ppl=5.35, wps=459770, ups=1.06, wpb=434390, bsz=16747, num_updates=31600, lr=0.000355784, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=31368 epoch 019: 1248 / 1689 loss=4.049, nll_loss=2.418, ppl=5.35, wps=459770, ups=1.06, wpb=434390, bsz=16747, num_updates=31600, lr=0.000355784, gnorm=0.221, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=31368 epoch 019: 1349 / 1689 loss=4.051, nll_loss=2.421, ppl=5.36, wps=454990, ups=1.04, wpb=435688, bsz=16695, num_updates=31700, lr=0.000355222, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=31464 epoch 019: 1349 / 1689 loss=4.051, nll_loss=2.421, ppl=5.36, wps=454990, ups=1.04, wpb=435688, bsz=16695, num_updates=31700, lr=0.000355222, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=31464 epoch 019: 1349 / 1689 loss=4.051, nll_loss=2.421, ppl=5.36, wps=454990, ups=1.04, wpb=435688, bsz=16695, num_updates=31700, lr=0.000355222, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=31464 epoch 019: 1349 / 1689 loss=4.051, nll_loss=2.421, ppl=5.36, wps=454990, ups=1.04, wpb=435688, bsz=16695, num_updates=31700, lr=0.000355222, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=31464 epoch 019: 1349 / 1689 loss=4.051, nll_loss=2.421, ppl=5.36, wps=454990, ups=1.04, wpb=435688, bsz=16695, num_updates=31700, lr=0.000355222, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=31464 epoch 019: 1349 / 1689 loss=4.051, nll_loss=2.421, ppl=5.36, wps=454990, ups=1.04, wpb=435688, bsz=16695, num_updates=31700, lr=0.000355222, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=31464 epoch 019: 1349 / 1689 loss=4.051, nll_loss=2.421, ppl=5.36, wps=454990, ups=1.04, wpb=435688, bsz=16695, num_updates=31700, lr=0.000355222, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=31464 epoch 019: 1349 / 1689 loss=4.051, nll_loss=2.421, ppl=5.36, wps=454990, ups=1.04, wpb=435688, bsz=16695, num_updates=31700, lr=0.000355222, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=31464 epoch 019: 1349 / 1689 loss=4.051, nll_loss=2.421, ppl=5.36, wps=454990, ups=1.04, wpb=435688, bsz=16695, num_updates=31700, lr=0.000355222, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=31464 epoch 019: 1349 / 1689 loss=4.051, nll_loss=2.421, ppl=5.36, wps=454990, ups=1.04, wpb=435688, bsz=16695, num_updates=31700, lr=0.000355222, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=31464 epoch 019: 1349 / 1689 loss=4.051, nll_loss=2.421, ppl=5.36, wps=454990, ups=1.04, wpb=435688, bsz=16695, num_updates=31700, lr=0.000355222, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=31464 epoch 019: 1349 / 1689 loss=4.051, nll_loss=2.421, ppl=5.36, wps=454990, ups=1.04, wpb=435688, bsz=16695, num_updates=31700, lr=0.000355222, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=31464 epoch 019: 1349 / 1689 loss=4.051, nll_loss=2.421, ppl=5.36, wps=454990, ups=1.04, wpb=435688, bsz=16695, num_updates=31700, lr=0.000355222, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=31464 epoch 019: 1349 / 1689 loss=4.051, nll_loss=2.421, ppl=5.36, wps=454990, ups=1.04, wpb=435688, bsz=16695, num_updates=31700, lr=0.000355222, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=31464 epoch 019: 1349 / 1689 loss=4.051, nll_loss=2.421, ppl=5.36, wps=454990, ups=1.04, wpb=435688, bsz=16695, num_updates=31700, lr=0.000355222, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=31464 epoch 019: 1349 / 1689 loss=4.051, nll_loss=2.421, ppl=5.36, wps=454990, ups=1.04, wpb=435688, bsz=16695, num_updates=31700, lr=0.000355222, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=31464 epoch 019: 1349 / 1689 loss=4.051, nll_loss=2.421, ppl=5.36, wps=454990, ups=1.04, wpb=435688, bsz=16695, num_updates=31700, lr=0.000355222, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=31464 epoch 019: 1349 / 1689 loss=4.051, nll_loss=2.421, ppl=5.36, wps=454990, ups=1.04, wpb=435688, bsz=16695, num_updates=31700, lr=0.000355222, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=31464 epoch 019: 1349 / 1689 loss=4.051, nll_loss=2.421, ppl=5.36, wps=454990, ups=1.04, wpb=435688, bsz=16695, num_updates=31700, lr=0.000355222, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=31464 epoch 019: 1449 / 1689 loss=4.035, nll_loss=2.403, ppl=5.29, wps=458589, ups=1.06, wpb=431656, bsz=16160.4, num_updates=31800, lr=0.000354663, gnorm=0.202, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=31558 epoch 019: 1449 / 1689 loss=4.035, nll_loss=2.403, ppl=5.29, wps=458589, ups=1.06, wpb=431656, bsz=16160.4, num_updates=31800, lr=0.000354663, gnorm=0.202, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=31558 epoch 019: 1449 / 1689 loss=4.035, nll_loss=2.403, ppl=5.29, wps=458589, ups=1.06, wpb=431656, bsz=16160.4, num_updates=31800, lr=0.000354663, gnorm=0.202, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=31558 epoch 019: 1449 / 1689 loss=4.035, nll_loss=2.403, ppl=5.29, wps=458589, ups=1.06, wpb=431656, bsz=16160.4, num_updates=31800, lr=0.000354663, gnorm=0.202, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=31558 epoch 019: 1449 / 1689 loss=4.035, nll_loss=2.403, ppl=5.29, wps=458589, ups=1.06, wpb=431656, bsz=16160.4, num_updates=31800, lr=0.000354663, gnorm=0.202, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=31558 epoch 019: 1449 / 1689 loss=4.035, nll_loss=2.403, ppl=5.29, wps=458589, ups=1.06, wpb=431656, bsz=16160.4, num_updates=31800, lr=0.000354663, gnorm=0.202, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=31558 epoch 019: 1449 / 1689 loss=4.035, nll_loss=2.403, ppl=5.29, wps=458589, ups=1.06, wpb=431656, bsz=16160.4, num_updates=31800, lr=0.000354663, gnorm=0.202, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=31558 epoch 019: 1449 / 1689 loss=4.035, nll_loss=2.403, ppl=5.29, wps=458589, ups=1.06, wpb=431656, bsz=16160.4, num_updates=31800, lr=0.000354663, gnorm=0.202, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=31558 epoch 019: 1449 / 1689 loss=4.035, nll_loss=2.403, ppl=5.29, wps=458589, ups=1.06, wpb=431656, bsz=16160.4, num_updates=31800, lr=0.000354663, gnorm=0.202, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=31558 epoch 019: 1449 / 1689 loss=4.035, nll_loss=2.403, ppl=5.29, wps=458589, ups=1.06, wpb=431656, bsz=16160.4, num_updates=31800, lr=0.000354663, gnorm=0.202, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=31558 epoch 019: 1449 / 1689 loss=4.035, nll_loss=2.403, ppl=5.29, wps=458589, ups=1.06, wpb=431656, bsz=16160.4, num_updates=31800, lr=0.000354663, gnorm=0.202, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=31558 epoch 019: 1449 / 1689 loss=4.035, nll_loss=2.403, ppl=5.29, wps=458589, ups=1.06, wpb=431656, bsz=16160.4, num_updates=31800, lr=0.000354663, gnorm=0.202, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=31558 epoch 019: 1449 / 1689 loss=4.035, nll_loss=2.403, ppl=5.29, wps=458589, ups=1.06, wpb=431656, bsz=16160.4, num_updates=31800, lr=0.000354663, gnorm=0.202, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=31558 epoch 019: 1449 / 1689 loss=4.035, nll_loss=2.403, ppl=5.29, wps=458589, ups=1.06, wpb=431656, bsz=16160.4, num_updates=31800, lr=0.000354663, gnorm=0.202, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=31558 epoch 019: 1449 / 1689 loss=4.035, nll_loss=2.403, ppl=5.29, wps=458589, ups=1.06, wpb=431656, bsz=16160.4, num_updates=31800, lr=0.000354663, gnorm=0.202, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=31558 epoch 019: 1449 / 1689 loss=4.035, nll_loss=2.403, ppl=5.29, wps=458589, ups=1.06, wpb=431656, bsz=16160.4, num_updates=31800, lr=0.000354663, gnorm=0.202, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=31558 epoch 019: 1449 / 1689 loss=4.035, nll_loss=2.403, ppl=5.29, wps=458589, ups=1.06, wpb=431656, bsz=16160.4, num_updates=31800, lr=0.000354663, gnorm=0.202, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=31558 epoch 019: 1449 / 1689 loss=4.035, nll_loss=2.403, ppl=5.29, wps=458589, ups=1.06, wpb=431656, bsz=16160.4, num_updates=31800, lr=0.000354663, gnorm=0.202, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=31558 epoch 019: 1449 / 1689 loss=4.035, nll_loss=2.403, ppl=5.29, wps=458589, ups=1.06, wpb=431656, bsz=16160.4, num_updates=31800, lr=0.000354663, gnorm=0.202, clip=0, loss_scale=1, train_wall=92, gb_free=19.7, wall=31558 epoch 019: 1549 / 1689 loss=4.047, nll_loss=2.416, ppl=5.34, wps=459102, ups=1.06, wpb=434468, bsz=16272, num_updates=31900, lr=0.000354107, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=31652 epoch 019: 1549 / 1689 loss=4.047, nll_loss=2.416, ppl=5.34, wps=459102, ups=1.06, wpb=434468, bsz=16272, num_updates=31900, lr=0.000354107, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=31652 epoch 019: 1549 / 1689 loss=4.047, nll_loss=2.416, ppl=5.34, wps=459102, ups=1.06, wpb=434468, bsz=16272, num_updates=31900, lr=0.000354107, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=31652 epoch 019: 1549 / 1689 loss=4.047, nll_loss=2.416, ppl=5.34, wps=459102, ups=1.06, wpb=434468, bsz=16272, num_updates=31900, lr=0.000354107, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=31652 epoch 019: 1549 / 1689 loss=4.047, nll_loss=2.416, ppl=5.34, wps=459102, ups=1.06, wpb=434468, bsz=16272, num_updates=31900, lr=0.000354107, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=31652 epoch 019: 1549 / 1689 loss=4.047, nll_loss=2.416, ppl=5.34, wps=459102, ups=1.06, wpb=434468, bsz=16272, num_updates=31900, lr=0.000354107, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=31652 epoch 019: 1549 / 1689 loss=4.047, nll_loss=2.416, ppl=5.34, wps=459102, ups=1.06, wpb=434468, bsz=16272, num_updates=31900, lr=0.000354107, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=31652 epoch 019: 1549 / 1689 loss=4.047, nll_loss=2.416, ppl=5.34, wps=459102, ups=1.06, wpb=434468, bsz=16272, num_updates=31900, lr=0.000354107, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=31652 epoch 019: 1549 / 1689 loss=4.047, nll_loss=2.416, ppl=5.34, wps=459102, ups=1.06, wpb=434468, bsz=16272, num_updates=31900, lr=0.000354107, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=31652 epoch 019: 1549 / 1689 loss=4.047, nll_loss=2.416, ppl=5.34, wps=459102, ups=1.06, wpb=434468, bsz=16272, num_updates=31900, lr=0.000354107, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=31652 epoch 019: 1549 / 1689 loss=4.047, nll_loss=2.416, ppl=5.34, wps=459102, ups=1.06, wpb=434468, bsz=16272, num_updates=31900, lr=0.000354107, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=31652 epoch 019: 1549 / 1689 loss=4.047, nll_loss=2.416, ppl=5.34, wps=459102, ups=1.06, wpb=434468, bsz=16272, num_updates=31900, lr=0.000354107, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=31652 epoch 019: 1549 / 1689 loss=4.047, nll_loss=2.416, ppl=5.34, wps=459102, ups=1.06, wpb=434468, bsz=16272, num_updates=31900, lr=0.000354107, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=31652 epoch 019: 1549 / 1689 loss=4.047, nll_loss=2.416, ppl=5.34, wps=459102, ups=1.06, wpb=434468, bsz=16272, num_updates=31900, lr=0.000354107, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=31652 epoch 019: 1549 / 1689 loss=4.047, nll_loss=2.416, ppl=5.34, wps=459102, ups=1.06, wpb=434468, bsz=16272, num_updates=31900, lr=0.000354107, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=31652 epoch 019: 1549 / 1689 loss=4.047, nll_loss=2.416, ppl=5.34, wps=459102, ups=1.06, wpb=434468, bsz=16272, num_updates=31900, lr=0.000354107, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=31652 epoch 019: 1549 / 1689 loss=4.047, nll_loss=2.416, ppl=5.34, wps=459102, ups=1.06, wpb=434468, bsz=16272, num_updates=31900, lr=0.000354107, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=31652 epoch 019: 1549 / 1689 loss=4.047, nll_loss=2.416, ppl=5.34, wps=459102, ups=1.06, wpb=434468, bsz=16272, num_updates=31900, lr=0.000354107, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=31652 epoch 019: 1549 / 1689 loss=4.047, nll_loss=2.416, ppl=5.34, wps=459102, ups=1.06, wpb=434468, bsz=16272, num_updates=31900, lr=0.000354107, gnorm=0.197, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=31652 epoch 019: 1649 / 1689 loss=4.05, nll_loss=2.42, ppl=5.35, wps=455557, ups=1.05, wpb=433418, bsz=16624.7, num_updates=32000, lr=0.000353553, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=31747 epoch 019: 1649 / 1689 loss=4.05, nll_loss=2.42, ppl=5.35, wps=455557, ups=1.05, wpb=433418, bsz=16624.7, num_updates=32000, lr=0.000353553, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=31747 epoch 019: 1649 / 1689 loss=4.05, nll_loss=2.42, ppl=5.35, wps=455557, ups=1.05, wpb=433418, bsz=16624.7, num_updates=32000, lr=0.000353553, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=31747 epoch 019: 1649 / 1689 loss=4.05, nll_loss=2.42, ppl=5.35, wps=455557, ups=1.05, wpb=433418, bsz=16624.7, num_updates=32000, lr=0.000353553, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=31747 epoch 019: 1649 / 1689 loss=4.05, nll_loss=2.42, ppl=5.35, wps=455557, ups=1.05, wpb=433418, bsz=16624.7, num_updates=32000, lr=0.000353553, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=31747 epoch 019: 1649 / 1689 loss=4.05, nll_loss=2.42, ppl=5.35, wps=455557, ups=1.05, wpb=433418, bsz=16624.7, num_updates=32000, lr=0.000353553, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=31747 epoch 019: 1649 / 1689 loss=4.05, nll_loss=2.42, ppl=5.35, wps=455557, ups=1.05, wpb=433418, bsz=16624.7, num_updates=32000, lr=0.000353553, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=31747 epoch 019: 1649 / 1689 loss=4.05, nll_loss=2.42, ppl=5.35, wps=455557, ups=1.05, wpb=433418, bsz=16624.7, num_updates=32000, lr=0.000353553, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=31747 epoch 019: 1649 / 1689 loss=4.05, nll_loss=2.42, ppl=5.35, wps=455557, ups=1.05, wpb=433418, bsz=16624.7, num_updates=32000, lr=0.000353553, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=31747 epoch 019: 1649 / 1689 loss=4.05, nll_loss=2.42, ppl=5.35, wps=455557, ups=1.05, wpb=433418, bsz=16624.7, num_updates=32000, lr=0.000353553, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=31747 epoch 019: 1649 / 1689 loss=4.05, nll_loss=2.42, ppl=5.35, wps=455557, ups=1.05, wpb=433418, bsz=16624.7, num_updates=32000, lr=0.000353553, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=31747 epoch 019: 1649 / 1689 loss=4.05, nll_loss=2.42, ppl=5.35, wps=455557, ups=1.05, wpb=433418, bsz=16624.7, num_updates=32000, lr=0.000353553, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=31747 epoch 019: 1649 / 1689 loss=4.05, nll_loss=2.42, ppl=5.35, wps=455557, ups=1.05, wpb=433418, bsz=16624.7, num_updates=32000, lr=0.000353553, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=31747 epoch 019: 1649 / 1689 loss=4.05, nll_loss=2.42, ppl=5.35, wps=455557, ups=1.05, wpb=433418, bsz=16624.7, num_updates=32000, lr=0.000353553, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=31747 epoch 019: 1649 / 1689 loss=4.05, nll_loss=2.42, ppl=5.35, wps=455557, ups=1.05, wpb=433418, bsz=16624.7, num_updates=32000, lr=0.000353553, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=31747 epoch 019: 1649 / 1689 loss=4.05, nll_loss=2.42, ppl=5.35, wps=455557, ups=1.05, wpb=433418, bsz=16624.7, num_updates=32000, lr=0.000353553, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=31747 epoch 019: 1649 / 1689 loss=4.05, nll_loss=2.42, ppl=5.35, wps=455557, ups=1.05, wpb=433418, bsz=16624.7, num_updates=32000, lr=0.000353553, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=31747 epoch 019: 1649 / 1689 loss=4.05, nll_loss=2.42, ppl=5.35, wps=455557, ups=1.05, wpb=433418, bsz=16624.7, num_updates=32000, lr=0.000353553, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=31747 epoch 019: 1649 / 1689 loss=4.05, nll_loss=2.42, ppl=5.35, wps=455557, ups=1.05, wpb=433418, bsz=16624.7, num_updates=32000, lr=0.000353553, gnorm=0.201, clip=0, loss_scale=1, train_wall=93, gb_free=19.4, wall=31747 begin validation on "valid" subset epoch 019 | valid on 'valid' subset | loss 4.249 | nll_loss 2.628 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.249 epoch 019 | valid on 'valid' subset | loss 4.249 | nll_loss 2.628 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.249 epoch 019 | valid on 'valid' subset | loss 4.249 | nll_loss 2.628 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.249 epoch 019 | valid on 'valid' subset | loss 4.249 | nll_loss 2.628 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.249 epoch 019 | valid on 'valid' subset | loss 4.249 | nll_loss 2.628 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.249 epoch 019 | valid on 'valid' subset | loss 4.249 | nll_loss 2.628 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.249 epoch 019 | valid on 'valid' subset | loss 4.249 | nll_loss 2.628 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.249 epoch 019 | valid on 'valid' subset | loss 4.249 | nll_loss 2.628 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.249 epoch 019 | valid on 'valid' subset | loss 4.249 | nll_loss 2.628 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.249 epoch 019 | valid on 'valid' subset | loss 4.249 | nll_loss 2.628 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.249 epoch 019 | valid on 'valid' subset | loss 4.249 | nll_loss 2.628 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.249 epoch 019 | valid on 'valid' subset | loss 4.249 | nll_loss 2.628 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.249 epoch 019 | valid on 'valid' subset | loss 4.249 | nll_loss 2.628 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.249 epoch 019 | valid on 'valid' subset | loss 4.249 | nll_loss 2.628 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.249 epoch 019 | valid on 'valid' subset | loss 4.249 | nll_loss 2.628 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.249 epoch 019 | valid on 'valid' subset | loss 4.249 | nll_loss 2.628 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.249 epoch 019 | valid on 'valid' subset | loss 4.249 | nll_loss 2.628 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.249 epoch 019 | valid on 'valid' subset | loss 4.249 | nll_loss 2.628 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.249 epoch 019 | valid on 'valid' subset | loss 4.249 | nll_loss 2.628 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 32000 | best_loss 4.249 end of epoch 19 (average epoch stats below) epoch 019 | loss 4.032 | nll_loss 2.399 | ppl 5.27 | wps 441203 | ups 1.02 | wpb 433496 | bsz 16502.8 | num_updates 32040 | lr 0.000353333 | gnorm 0.208 | clip 0 | loss_scale 1 | train_wall 1588 | gb_free 20.4 | wall 31831 epoch 019 | loss 4.032 | nll_loss 2.399 | ppl 5.27 | wps 441203 | ups 1.02 | wpb 433496 | bsz 16502.8 | num_updates 32040 | lr 0.000353333 | gnorm 0.208 | clip 0 | loss_scale 1 | train_wall 1588 | gb_free 20.4 | wall 31831 epoch 019 | loss 4.032 | nll_loss 2.399 | ppl 5.27 | wps 441203 | ups 1.02 | wpb 433496 | bsz 16502.8 | num_updates 32040 | lr 0.000353333 | gnorm 0.208 | clip 0 | loss_scale 1 | train_wall 1588 | gb_free 20.4 | wall 31831 epoch 019 | loss 4.032 | nll_loss 2.399 | ppl 5.27 | wps 441203 | ups 1.02 | wpb 433496 | bsz 16502.8 | num_updates 32040 | lr 0.000353333 | gnorm 0.208 | clip 0 | loss_scale 1 | train_wall 1588 | gb_free 20.4 | wall 31831 epoch 019 | loss 4.032 | nll_loss 2.399 | ppl 5.27 | wps 441203 | ups 1.02 | wpb 433496 | bsz 16502.8 | num_updates 32040 | lr 0.000353333 | gnorm 0.208 | clip 0 | loss_scale 1 | train_wall 1588 | gb_free 20.4 | wall 31831 epoch 019 | loss 4.032 | nll_loss 2.399 | ppl 5.27 | wps 441203 | ups 1.02 | wpb 433496 | bsz 16502.8 | num_updates 32040 | lr 0.000353333 | gnorm 0.208 | clip 0 | loss_scale 1 | train_wall 1588 | gb_free 20.4 | wall 31831 epoch 019 | loss 4.032 | nll_loss 2.399 | ppl 5.27 | wps 441203 | ups 1.02 | wpb 433496 | bsz 16502.8 | num_updates 32040 | lr 0.000353333 | gnorm 0.208 | clip 0 | loss_scale 1 | train_wall 1588 | gb_free 20.4 | wall 31831 epoch 019 | loss 4.032 | nll_loss 2.399 | ppl 5.27 | wps 441203 | ups 1.02 | wpb 433496 | bsz 16502.8 | num_updates 32040 | lr 0.000353333 | gnorm 0.208 | clip 0 | loss_scale 1 | train_wall 1588 | gb_free 20.4 | wall 31831 epoch 019 | loss 4.032 | nll_loss 2.399 | ppl 5.27 | wps 441203 | ups 1.02 | wpb 433496 | bsz 16502.8 | num_updates 32040 | lr 0.000353333 | gnorm 0.208 | clip 0 | loss_scale 1 | train_wall 1588 | gb_free 20.4 | wall 31831 epoch 019 | loss 4.032 | nll_loss 2.399 | ppl 5.27 | wps 441203 | ups 1.02 | wpb 433496 | bsz 16502.8 | num_updates 32040 | lr 0.000353333 | gnorm 0.208 | clip 0 | loss_scale 1 | train_wall 1588 | gb_free 20.4 | wall 31831 epoch 019 | loss 4.032 | nll_loss 2.399 | ppl 5.27 | wps 441203 | ups 1.02 | wpb 433496 | bsz 16502.8 | num_updates 32040 | lr 0.000353333 | gnorm 0.208 | clip 0 | loss_scale 1 | train_wall 1588 | gb_free 20.4 | wall 31831 epoch 019 | loss 4.032 | nll_loss 2.399 | ppl 5.27 | wps 441203 | ups 1.02 | wpb 433496 | bsz 16502.8 | num_updates 32040 | lr 0.000353333 | gnorm 0.208 | clip 0 | loss_scale 1 | train_wall 1588 | gb_free 20.4 | wall 31831 epoch 019 | loss 4.032 | nll_loss 2.399 | ppl 5.27 | wps 441203 | ups 1.02 | wpb 433496 | bsz 16502.8 | num_updates 32040 | lr 0.000353333 | gnorm 0.208 | clip 0 | loss_scale 1 | train_wall 1588 | gb_free 20.4 | wall 31831 epoch 019 | loss 4.032 | nll_loss 2.399 | ppl 5.27 | wps 441203 | ups 1.02 | wpb 433496 | bsz 16502.8 | num_updates 32040 | lr 0.000353333 | gnorm 0.208 | clip 0 | loss_scale 1 | train_wall 1588 | gb_free 20.4 | wall 31831 epoch 019 | loss 4.032 | nll_loss 2.399 | ppl 5.27 | wps 441203 | ups 1.02 | wpb 433496 | bsz 16502.8 | num_updates 32040 | lr 0.000353333 | gnorm 0.208 | clip 0 | loss_scale 1 | train_wall 1588 | gb_free 20.4 | wall 31831 epoch 019 | loss 4.032 | nll_loss 2.399 | ppl 5.27 | wps 441203 | ups 1.02 | wpb 433496 | bsz 16502.8 | num_updates 32040 | lr 0.000353333 | gnorm 0.208 | clip 0 | loss_scale 1 | train_wall 1588 | gb_free 20.4 | wall 31831 epoch 019 | loss 4.032 | nll_loss 2.399 | ppl 5.27 | wps 441203 | ups 1.02 | wpb 433496 | bsz 16502.8 | num_updates 32040 | lr 0.000353333 | gnorm 0.208 | clip 0 | loss_scale 1 | train_wall 1588 | gb_free 20.4 | wall 31831 epoch 019 | loss 4.032 | nll_loss 2.399 | ppl 5.27 | wps 441203 | ups 1.02 | wpb 433496 | bsz 16502.8 | num_updates 32040 | lr 0.000353333 | gnorm 0.208 | clip 0 | loss_scale 1 | train_wall 1588 | gb_free 20.4 | wall 31831 epoch 019 | loss 4.032 | nll_loss 2.399 | ppl 5.27 | wps 441203 | ups 1.02 | wpb 433496 | bsz 16502.8 | num_updates 32040 | lr 0.000353333 | gnorm 0.208 | clip 0 | loss_scale 1 | train_wall 1588 | gb_free 20.4 | wall 31831 Start iterating over samples epoch 020: 60 / 1689 loss=4.014, nll_loss=2.378, ppl=5.2, wps=304711, ups=0.71, wpb=429386, bsz=16218.6, num_updates=32100, lr=0.000353002, gnorm=0.218, clip=0, loss_scale=1, train_wall=115, gb_free=18.9, wall=31888 epoch 020: 60 / 1689 loss=4.014, nll_loss=2.378, ppl=5.2, wps=304711, ups=0.71, wpb=429386, bsz=16218.6, num_updates=32100, lr=0.000353002, gnorm=0.218, clip=0, loss_scale=1, train_wall=115, gb_free=18.9, wall=31888 epoch 020: 60 / 1689 loss=4.014, nll_loss=2.378, ppl=5.2, wps=304711, ups=0.71, wpb=429386, bsz=16218.6, num_updates=32100, lr=0.000353002, gnorm=0.218, clip=0, loss_scale=1, train_wall=115, gb_free=18.9, wall=31888 epoch 020: 60 / 1689 loss=4.014, nll_loss=2.378, ppl=5.2, wps=304711, ups=0.71, wpb=429386, bsz=16218.6, num_updates=32100, lr=0.000353002, gnorm=0.218, clip=0, loss_scale=1, train_wall=115, gb_free=18.9, wall=31888 epoch 020: 60 / 1689 loss=4.014, nll_loss=2.378, ppl=5.2, wps=304711, ups=0.71, wpb=429386, bsz=16218.6, num_updates=32100, lr=0.000353002, gnorm=0.218, clip=0, loss_scale=1, train_wall=115, gb_free=18.9, wall=31888 epoch 020: 60 / 1689 loss=4.014, nll_loss=2.378, ppl=5.2, wps=304711, ups=0.71, wpb=429386, bsz=16218.6, num_updates=32100, lr=0.000353002, gnorm=0.218, clip=0, loss_scale=1, train_wall=115, gb_free=18.9, wall=31888 epoch 020: 60 / 1689 loss=4.014, nll_loss=2.378, ppl=5.2, wps=304711, ups=0.71, wpb=429386, bsz=16218.6, num_updates=32100, lr=0.000353002, gnorm=0.218, clip=0, loss_scale=1, train_wall=115, gb_free=18.9, wall=31888 epoch 020: 60 / 1689 loss=4.014, nll_loss=2.378, ppl=5.2, wps=304711, ups=0.71, wpb=429386, bsz=16218.6, num_updates=32100, lr=0.000353002, gnorm=0.218, clip=0, loss_scale=1, train_wall=115, gb_free=18.9, wall=31888 epoch 020: 60 / 1689 loss=4.014, nll_loss=2.378, ppl=5.2, wps=304711, ups=0.71, wpb=429386, bsz=16218.6, num_updates=32100, lr=0.000353002, gnorm=0.218, clip=0, loss_scale=1, train_wall=115, gb_free=18.9, wall=31888 epoch 020: 60 / 1689 loss=4.014, nll_loss=2.378, ppl=5.2, wps=304711, ups=0.71, wpb=429386, bsz=16218.6, num_updates=32100, lr=0.000353002, gnorm=0.218, clip=0, loss_scale=1, train_wall=115, gb_free=18.9, wall=31888 epoch 020: 60 / 1689 loss=4.014, nll_loss=2.378, ppl=5.2, wps=304711, ups=0.71, wpb=429386, bsz=16218.6, num_updates=32100, lr=0.000353002, gnorm=0.218, clip=0, loss_scale=1, train_wall=115, gb_free=18.9, wall=31888 epoch 020: 60 / 1689 loss=4.014, nll_loss=2.378, ppl=5.2, wps=304711, ups=0.71, wpb=429386, bsz=16218.6, num_updates=32100, lr=0.000353002, gnorm=0.218, clip=0, loss_scale=1, train_wall=115, gb_free=18.9, wall=31888 epoch 020: 60 / 1689 loss=4.014, nll_loss=2.378, ppl=5.2, wps=304711, ups=0.71, wpb=429386, bsz=16218.6, num_updates=32100, lr=0.000353002, gnorm=0.218, clip=0, loss_scale=1, train_wall=115, gb_free=18.9, wall=31888 epoch 020: 60 / 1689 loss=4.014, nll_loss=2.378, ppl=5.2, wps=304711, ups=0.71, wpb=429386, bsz=16218.6, num_updates=32100, lr=0.000353002, gnorm=0.218, clip=0, loss_scale=1, train_wall=115, gb_free=18.9, wall=31888 epoch 020: 60 / 1689 loss=4.014, nll_loss=2.378, ppl=5.2, wps=304711, ups=0.71, wpb=429386, bsz=16218.6, num_updates=32100, lr=0.000353002, gnorm=0.218, clip=0, loss_scale=1, train_wall=115, gb_free=18.9, wall=31888 epoch 020: 60 / 1689 loss=4.014, nll_loss=2.378, ppl=5.2, wps=304711, ups=0.71, wpb=429386, bsz=16218.6, num_updates=32100, lr=0.000353002, gnorm=0.218, clip=0, loss_scale=1, train_wall=115, gb_free=18.9, wall=31888 epoch 020: 60 / 1689 loss=4.014, nll_loss=2.378, ppl=5.2, wps=304711, ups=0.71, wpb=429386, bsz=16218.6, num_updates=32100, lr=0.000353002, gnorm=0.218, clip=0, loss_scale=1, train_wall=115, gb_free=18.9, wall=31888 epoch 020: 60 / 1689 loss=4.014, nll_loss=2.378, ppl=5.2, wps=304711, ups=0.71, wpb=429386, bsz=16218.6, num_updates=32100, lr=0.000353002, gnorm=0.218, clip=0, loss_scale=1, train_wall=115, gb_free=18.9, wall=31888 epoch 020: 60 / 1689 loss=4.014, nll_loss=2.378, ppl=5.2, wps=304711, ups=0.71, wpb=429386, bsz=16218.6, num_updates=32100, lr=0.000353002, gnorm=0.218, clip=0, loss_scale=1, train_wall=115, gb_free=18.9, wall=31888 epoch 020: 60 / 1689 loss=4.014, nll_loss=2.378, ppl=5.2, wps=304711, ups=0.71, wpb=429386, bsz=16218.6, num_updates=32100, lr=0.000353002, gnorm=0.218, clip=0, loss_scale=1, train_wall=115, gb_free=18.9, wall=31888 epoch 020: 161 / 1689 loss=4.005, nll_loss=2.367, ppl=5.16, wps=457750, ups=1.06, wpb=433526, bsz=16275.6, num_updates=32200, lr=0.000352454, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=31983 epoch 020: 161 / 1689 loss=4.005, nll_loss=2.367, ppl=5.16, wps=457750, ups=1.06, wpb=433526, bsz=16275.6, num_updates=32200, lr=0.000352454, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=31983 epoch 020: 161 / 1689 loss=4.005, nll_loss=2.367, ppl=5.16, wps=457750, ups=1.06, wpb=433526, bsz=16275.6, num_updates=32200, lr=0.000352454, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=31983 epoch 020: 161 / 1689 loss=4.005, nll_loss=2.367, ppl=5.16, wps=457750, ups=1.06, wpb=433526, bsz=16275.6, num_updates=32200, lr=0.000352454, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=31983 epoch 020: 161 / 1689 loss=4.005, nll_loss=2.367, ppl=5.16, wps=457750, ups=1.06, wpb=433526, bsz=16275.6, num_updates=32200, lr=0.000352454, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=31983 epoch 020: 161 / 1689 loss=4.005, nll_loss=2.367, ppl=5.16, wps=457750, ups=1.06, wpb=433526, bsz=16275.6, num_updates=32200, lr=0.000352454, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=31983 epoch 020: 161 / 1689 loss=4.005, nll_loss=2.367, ppl=5.16, wps=457750, ups=1.06, wpb=433526, bsz=16275.6, num_updates=32200, lr=0.000352454, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=31983 epoch 020: 161 / 1689 loss=4.005, nll_loss=2.367, ppl=5.16, wps=457750, ups=1.06, wpb=433526, bsz=16275.6, num_updates=32200, lr=0.000352454, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=31983 epoch 020: 161 / 1689 loss=4.005, nll_loss=2.367, ppl=5.16, wps=457750, ups=1.06, wpb=433526, bsz=16275.6, num_updates=32200, lr=0.000352454, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=31983 epoch 020: 161 / 1689 loss=4.005, nll_loss=2.367, ppl=5.16, wps=457750, ups=1.06, wpb=433526, bsz=16275.6, num_updates=32200, lr=0.000352454, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=31983 epoch 020: 161 / 1689 loss=4.005, nll_loss=2.367, ppl=5.16, wps=457750, ups=1.06, wpb=433526, bsz=16275.6, num_updates=32200, lr=0.000352454, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=31983 epoch 020: 161 / 1689 loss=4.005, nll_loss=2.367, ppl=5.16, wps=457750, ups=1.06, wpb=433526, bsz=16275.6, num_updates=32200, lr=0.000352454, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=31983 epoch 020: 161 / 1689 loss=4.005, nll_loss=2.367, ppl=5.16, wps=457750, ups=1.06, wpb=433526, bsz=16275.6, num_updates=32200, lr=0.000352454, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=31983 epoch 020: 161 / 1689 loss=4.005, nll_loss=2.367, ppl=5.16, wps=457750, ups=1.06, wpb=433526, bsz=16275.6, num_updates=32200, lr=0.000352454, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=31983 epoch 020: 161 / 1689 loss=4.005, nll_loss=2.367, ppl=5.16, wps=457750, ups=1.06, wpb=433526, bsz=16275.6, num_updates=32200, lr=0.000352454, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=31983 epoch 020: 161 / 1689 loss=4.005, nll_loss=2.367, ppl=5.16, wps=457750, ups=1.06, wpb=433526, bsz=16275.6, num_updates=32200, lr=0.000352454, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=31983 epoch 020: 161 / 1689 loss=4.005, nll_loss=2.367, ppl=5.16, wps=457750, ups=1.06, wpb=433526, bsz=16275.6, num_updates=32200, lr=0.000352454, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=31983 epoch 020: 161 / 1689 loss=4.005, nll_loss=2.367, ppl=5.16, wps=457750, ups=1.06, wpb=433526, bsz=16275.6, num_updates=32200, lr=0.000352454, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=31983 epoch 020: 161 / 1689 loss=4.005, nll_loss=2.367, ppl=5.16, wps=457750, ups=1.06, wpb=433526, bsz=16275.6, num_updates=32200, lr=0.000352454, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=31983 epoch 020: 161 / 1689 loss=4.005, nll_loss=2.367, ppl=5.16, wps=457750, ups=1.06, wpb=433526, bsz=16275.6, num_updates=32200, lr=0.000352454, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=31983 epoch 020: 261 / 1689 loss=4.009, nll_loss=2.372, ppl=5.18, wps=458110, ups=1.06, wpb=433133, bsz=16373.1, num_updates=32300, lr=0.000351908, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=32078 epoch 020: 261 / 1689 loss=4.009, nll_loss=2.372, ppl=5.18, wps=458110, ups=1.06, wpb=433133, bsz=16373.1, num_updates=32300, lr=0.000351908, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=32078 epoch 020: 261 / 1689 loss=4.009, nll_loss=2.372, ppl=5.18, wps=458110, ups=1.06, wpb=433133, bsz=16373.1, num_updates=32300, lr=0.000351908, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=32078 epoch 020: 261 / 1689 loss=4.009, nll_loss=2.372, ppl=5.18, wps=458110, ups=1.06, wpb=433133, bsz=16373.1, num_updates=32300, lr=0.000351908, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=32078 epoch 020: 261 / 1689 loss=4.009, nll_loss=2.372, ppl=5.18, wps=458110, ups=1.06, wpb=433133, bsz=16373.1, num_updates=32300, lr=0.000351908, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=32078 epoch 020: 261 / 1689 loss=4.009, nll_loss=2.372, ppl=5.18, wps=458110, ups=1.06, wpb=433133, bsz=16373.1, num_updates=32300, lr=0.000351908, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=32078 epoch 020: 261 / 1689 loss=4.009, nll_loss=2.372, ppl=5.18, wps=458110, ups=1.06, wpb=433133, bsz=16373.1, num_updates=32300, lr=0.000351908, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=32078 epoch 020: 261 / 1689 loss=4.009, nll_loss=2.372, ppl=5.18, wps=458110, ups=1.06, wpb=433133, bsz=16373.1, num_updates=32300, lr=0.000351908, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=32078 epoch 020: 261 / 1689 loss=4.009, nll_loss=2.372, ppl=5.18, wps=458110, ups=1.06, wpb=433133, bsz=16373.1, num_updates=32300, lr=0.000351908, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=32078 epoch 020: 261 / 1689 loss=4.009, nll_loss=2.372, ppl=5.18, wps=458110, ups=1.06, wpb=433133, bsz=16373.1, num_updates=32300, lr=0.000351908, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=32078 epoch 020: 261 / 1689 loss=4.009, nll_loss=2.372, ppl=5.18, wps=458110, ups=1.06, wpb=433133, bsz=16373.1, num_updates=32300, lr=0.000351908, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=32078 epoch 020: 261 / 1689 loss=4.009, nll_loss=2.372, ppl=5.18, wps=458110, ups=1.06, wpb=433133, bsz=16373.1, num_updates=32300, lr=0.000351908, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=32078 epoch 020: 261 / 1689 loss=4.009, nll_loss=2.372, ppl=5.18, wps=458110, ups=1.06, wpb=433133, bsz=16373.1, num_updates=32300, lr=0.000351908, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=32078 epoch 020: 261 / 1689 loss=4.009, nll_loss=2.372, ppl=5.18, wps=458110, ups=1.06, wpb=433133, bsz=16373.1, num_updates=32300, lr=0.000351908, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=32078 epoch 020: 261 / 1689 loss=4.009, nll_loss=2.372, ppl=5.18, wps=458110, ups=1.06, wpb=433133, bsz=16373.1, num_updates=32300, lr=0.000351908, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=32078 epoch 020: 261 / 1689 loss=4.009, nll_loss=2.372, ppl=5.18, wps=458110, ups=1.06, wpb=433133, bsz=16373.1, num_updates=32300, lr=0.000351908, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=32078 epoch 020: 261 / 1689 loss=4.009, nll_loss=2.372, ppl=5.18, wps=458110, ups=1.06, wpb=433133, bsz=16373.1, num_updates=32300, lr=0.000351908, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=32078 epoch 020: 261 / 1689 loss=4.009, nll_loss=2.372, ppl=5.18, wps=458110, ups=1.06, wpb=433133, bsz=16373.1, num_updates=32300, lr=0.000351908, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=32078 epoch 020: 261 / 1689 loss=4.009, nll_loss=2.372, ppl=5.18, wps=458110, ups=1.06, wpb=433133, bsz=16373.1, num_updates=32300, lr=0.000351908, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=32078 epoch 020: 261 / 1689 loss=4.009, nll_loss=2.372, ppl=5.18, wps=458110, ups=1.06, wpb=433133, bsz=16373.1, num_updates=32300, lr=0.000351908, gnorm=0.208, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=32078 epoch 020: 361 / 1689 loss=4.003, nll_loss=2.366, ppl=5.15, wps=453242, ups=1.05, wpb=433225, bsz=16640.8, num_updates=32400, lr=0.000351364, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.4, wall=32173 epoch 020: 361 / 1689 loss=4.003, nll_loss=2.366, ppl=5.15, wps=453242, ups=1.05, wpb=433225, bsz=16640.8, num_updates=32400, lr=0.000351364, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.4, wall=32173 epoch 020: 361 / 1689 loss=4.003, nll_loss=2.366, ppl=5.15, wps=453242, ups=1.05, wpb=433225, bsz=16640.8, num_updates=32400, lr=0.000351364, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.4, wall=32173 epoch 020: 361 / 1689 loss=4.003, nll_loss=2.366, ppl=5.15, wps=453242, ups=1.05, wpb=433225, bsz=16640.8, num_updates=32400, lr=0.000351364, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.4, wall=32173 epoch 020: 361 / 1689 loss=4.003, nll_loss=2.366, ppl=5.15, wps=453242, ups=1.05, wpb=433225, bsz=16640.8, num_updates=32400, lr=0.000351364, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.4, wall=32173 epoch 020: 361 / 1689 loss=4.003, nll_loss=2.366, ppl=5.15, wps=453242, ups=1.05, wpb=433225, bsz=16640.8, num_updates=32400, lr=0.000351364, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.4, wall=32173 epoch 020: 361 / 1689 loss=4.003, nll_loss=2.366, ppl=5.15, wps=453242, ups=1.05, wpb=433225, bsz=16640.8, num_updates=32400, lr=0.000351364, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.4, wall=32173 epoch 020: 361 / 1689 loss=4.003, nll_loss=2.366, ppl=5.15, wps=453242, ups=1.05, wpb=433225, bsz=16640.8, num_updates=32400, lr=0.000351364, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.4, wall=32173 epoch 020: 361 / 1689 loss=4.003, nll_loss=2.366, ppl=5.15, wps=453242, ups=1.05, wpb=433225, bsz=16640.8, num_updates=32400, lr=0.000351364, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.4, wall=32173 epoch 020: 361 / 1689 loss=4.003, nll_loss=2.366, ppl=5.15, wps=453242, ups=1.05, wpb=433225, bsz=16640.8, num_updates=32400, lr=0.000351364, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.4, wall=32173 epoch 020: 361 / 1689 loss=4.003, nll_loss=2.366, ppl=5.15, wps=453242, ups=1.05, wpb=433225, bsz=16640.8, num_updates=32400, lr=0.000351364, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.4, wall=32173 epoch 020: 361 / 1689 loss=4.003, nll_loss=2.366, ppl=5.15, wps=453242, ups=1.05, wpb=433225, bsz=16640.8, num_updates=32400, lr=0.000351364, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.4, wall=32173 epoch 020: 361 / 1689 loss=4.003, nll_loss=2.366, ppl=5.15, wps=453242, ups=1.05, wpb=433225, bsz=16640.8, num_updates=32400, lr=0.000351364, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.4, wall=32173 epoch 020: 361 / 1689 loss=4.003, nll_loss=2.366, ppl=5.15, wps=453242, ups=1.05, wpb=433225, bsz=16640.8, num_updates=32400, lr=0.000351364, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.4, wall=32173 epoch 020: 361 / 1689 loss=4.003, nll_loss=2.366, ppl=5.15, wps=453242, ups=1.05, wpb=433225, bsz=16640.8, num_updates=32400, lr=0.000351364, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.4, wall=32173 epoch 020: 361 / 1689 loss=4.003, nll_loss=2.366, ppl=5.15, wps=453242, ups=1.05, wpb=433225, bsz=16640.8, num_updates=32400, lr=0.000351364, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.4, wall=32173 epoch 020: 361 / 1689 loss=4.003, nll_loss=2.366, ppl=5.15, wps=453242, ups=1.05, wpb=433225, bsz=16640.8, num_updates=32400, lr=0.000351364, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.4, wall=32173 epoch 020: 361 / 1689 loss=4.003, nll_loss=2.366, ppl=5.15, wps=453242, ups=1.05, wpb=433225, bsz=16640.8, num_updates=32400, lr=0.000351364, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.4, wall=32173 epoch 020: 361 / 1689 loss=4.003, nll_loss=2.366, ppl=5.15, wps=453242, ups=1.05, wpb=433225, bsz=16640.8, num_updates=32400, lr=0.000351364, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.4, wall=32173 epoch 020: 361 / 1689 loss=4.003, nll_loss=2.366, ppl=5.15, wps=453242, ups=1.05, wpb=433225, bsz=16640.8, num_updates=32400, lr=0.000351364, gnorm=0.214, clip=0, loss_scale=1, train_wall=94, gb_free=19.4, wall=32173 epoch 020: 461 / 1689 loss=4.015, nll_loss=2.38, ppl=5.2, wps=454940, ups=1.05, wpb=433680, bsz=16816.1, num_updates=32500, lr=0.000350823, gnorm=0.205, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=32269 epoch 020: 461 / 1689 loss=4.015, nll_loss=2.38, ppl=5.2, wps=454940, ups=1.05, wpb=433680, bsz=16816.1, num_updates=32500, lr=0.000350823, gnorm=0.205, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=32269 epoch 020: 461 / 1689 loss=4.015, nll_loss=2.38, ppl=5.2, wps=454940, ups=1.05, wpb=433680, bsz=16816.1, num_updates=32500, lr=0.000350823, gnorm=0.205, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=32269 epoch 020: 461 / 1689 loss=4.015, nll_loss=2.38, ppl=5.2, wps=454940, ups=1.05, wpb=433680, bsz=16816.1, num_updates=32500, lr=0.000350823, gnorm=0.205, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=32269 epoch 020: 461 / 1689 loss=4.015, nll_loss=2.38, ppl=5.2, wps=454940, ups=1.05, wpb=433680, bsz=16816.1, num_updates=32500, lr=0.000350823, gnorm=0.205, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=32269 epoch 020: 461 / 1689 loss=4.015, nll_loss=2.38, ppl=5.2, wps=454940, ups=1.05, wpb=433680, bsz=16816.1, num_updates=32500, lr=0.000350823, gnorm=0.205, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=32269 epoch 020: 461 / 1689 loss=4.015, nll_loss=2.38, ppl=5.2, wps=454940, ups=1.05, wpb=433680, bsz=16816.1, num_updates=32500, lr=0.000350823, gnorm=0.205, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=32269 epoch 020: 461 / 1689 loss=4.015, nll_loss=2.38, ppl=5.2, wps=454940, ups=1.05, wpb=433680, bsz=16816.1, num_updates=32500, lr=0.000350823, gnorm=0.205, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=32269 epoch 020: 461 / 1689 loss=4.015, nll_loss=2.38, ppl=5.2, wps=454940, ups=1.05, wpb=433680, bsz=16816.1, num_updates=32500, lr=0.000350823, gnorm=0.205, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=32269 epoch 020: 461 / 1689 loss=4.015, nll_loss=2.38, ppl=5.2, wps=454940, ups=1.05, wpb=433680, bsz=16816.1, num_updates=32500, lr=0.000350823, gnorm=0.205, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=32269 epoch 020: 461 / 1689 loss=4.015, nll_loss=2.38, ppl=5.2, wps=454940, ups=1.05, wpb=433680, bsz=16816.1, num_updates=32500, lr=0.000350823, gnorm=0.205, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=32269 epoch 020: 461 / 1689 loss=4.015, nll_loss=2.38, ppl=5.2, wps=454940, ups=1.05, wpb=433680, bsz=16816.1, num_updates=32500, lr=0.000350823, gnorm=0.205, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=32269 epoch 020: 461 / 1689 loss=4.015, nll_loss=2.38, ppl=5.2, wps=454940, ups=1.05, wpb=433680, bsz=16816.1, num_updates=32500, lr=0.000350823, gnorm=0.205, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=32269 epoch 020: 461 / 1689 loss=4.015, nll_loss=2.38, ppl=5.2, wps=454940, ups=1.05, wpb=433680, bsz=16816.1, num_updates=32500, lr=0.000350823, gnorm=0.205, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=32269 epoch 020: 461 / 1689 loss=4.015, nll_loss=2.38, ppl=5.2, wps=454940, ups=1.05, wpb=433680, bsz=16816.1, num_updates=32500, lr=0.000350823, gnorm=0.205, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=32269 epoch 020: 461 / 1689 loss=4.015, nll_loss=2.38, ppl=5.2, wps=454940, ups=1.05, wpb=433680, bsz=16816.1, num_updates=32500, lr=0.000350823, gnorm=0.205, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=32269 epoch 020: 461 / 1689 loss=4.015, nll_loss=2.38, ppl=5.2, wps=454940, ups=1.05, wpb=433680, bsz=16816.1, num_updates=32500, lr=0.000350823, gnorm=0.205, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=32269 epoch 020: 461 / 1689 loss=4.015, nll_loss=2.38, ppl=5.2, wps=454940, ups=1.05, wpb=433680, bsz=16816.1, num_updates=32500, lr=0.000350823, gnorm=0.205, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=32269 epoch 020: 461 / 1689 loss=4.015, nll_loss=2.38, ppl=5.2, wps=454940, ups=1.05, wpb=433680, bsz=16816.1, num_updates=32500, lr=0.000350823, gnorm=0.205, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=32269 epoch 020: 461 / 1689 loss=4.015, nll_loss=2.38, ppl=5.2, wps=454940, ups=1.05, wpb=433680, bsz=16816.1, num_updates=32500, lr=0.000350823, gnorm=0.205, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=32269 epoch 020: 561 / 1689 loss=4.02, nll_loss=2.385, ppl=5.22, wps=456657, ups=1.05, wpb=435530, bsz=16773, num_updates=32600, lr=0.000350285, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=32364 epoch 020: 561 / 1689 loss=4.02, nll_loss=2.385, ppl=5.22, wps=456657, ups=1.05, wpb=435530, bsz=16773, num_updates=32600, lr=0.000350285, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=32364 epoch 020: 561 / 1689 loss=4.02, nll_loss=2.385, ppl=5.22, wps=456657, ups=1.05, wpb=435530, bsz=16773, num_updates=32600, lr=0.000350285, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=32364 epoch 020: 561 / 1689 loss=4.02, nll_loss=2.385, ppl=5.22, wps=456657, ups=1.05, wpb=435530, bsz=16773, num_updates=32600, lr=0.000350285, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=32364 epoch 020: 561 / 1689 loss=4.02, nll_loss=2.385, ppl=5.22, wps=456657, ups=1.05, wpb=435530, bsz=16773, num_updates=32600, lr=0.000350285, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=32364 epoch 020: 561 / 1689 loss=4.02, nll_loss=2.385, ppl=5.22, wps=456657, ups=1.05, wpb=435530, bsz=16773, num_updates=32600, lr=0.000350285, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=32364 epoch 020: 561 / 1689 loss=4.02, nll_loss=2.385, ppl=5.22, wps=456657, ups=1.05, wpb=435530, bsz=16773, num_updates=32600, lr=0.000350285, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=32364 epoch 020: 561 / 1689 loss=4.02, nll_loss=2.385, ppl=5.22, wps=456657, ups=1.05, wpb=435530, bsz=16773, num_updates=32600, lr=0.000350285, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=32364 epoch 020: 561 / 1689 loss=4.02, nll_loss=2.385, ppl=5.22, wps=456657, ups=1.05, wpb=435530, bsz=16773, num_updates=32600, lr=0.000350285, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=32364 epoch 020: 561 / 1689 loss=4.02, nll_loss=2.385, ppl=5.22, wps=456657, ups=1.05, wpb=435530, bsz=16773, num_updates=32600, lr=0.000350285, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=32364 epoch 020: 561 / 1689 loss=4.02, nll_loss=2.385, ppl=5.22, wps=456657, ups=1.05, wpb=435530, bsz=16773, num_updates=32600, lr=0.000350285, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=32364 epoch 020: 561 / 1689 loss=4.02, nll_loss=2.385, ppl=5.22, wps=456657, ups=1.05, wpb=435530, bsz=16773, num_updates=32600, lr=0.000350285, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=32364 epoch 020: 561 / 1689 loss=4.02, nll_loss=2.385, ppl=5.22, wps=456657, ups=1.05, wpb=435530, bsz=16773, num_updates=32600, lr=0.000350285, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=32364 epoch 020: 561 / 1689 loss=4.02, nll_loss=2.385, ppl=5.22, wps=456657, ups=1.05, wpb=435530, bsz=16773, num_updates=32600, lr=0.000350285, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=32364 epoch 020: 561 / 1689 loss=4.02, nll_loss=2.385, ppl=5.22, wps=456657, ups=1.05, wpb=435530, bsz=16773, num_updates=32600, lr=0.000350285, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=32364 epoch 020: 561 / 1689 loss=4.02, nll_loss=2.385, ppl=5.22, wps=456657, ups=1.05, wpb=435530, bsz=16773, num_updates=32600, lr=0.000350285, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=32364 epoch 020: 561 / 1689 loss=4.02, nll_loss=2.385, ppl=5.22, wps=456657, ups=1.05, wpb=435530, bsz=16773, num_updates=32600, lr=0.000350285, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=32364 epoch 020: 561 / 1689 loss=4.02, nll_loss=2.385, ppl=5.22, wps=456657, ups=1.05, wpb=435530, bsz=16773, num_updates=32600, lr=0.000350285, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=32364 epoch 020: 561 / 1689 loss=4.02, nll_loss=2.385, ppl=5.22, wps=456657, ups=1.05, wpb=435530, bsz=16773, num_updates=32600, lr=0.000350285, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=32364 epoch 020: 561 / 1689 loss=4.02, nll_loss=2.385, ppl=5.22, wps=456657, ups=1.05, wpb=435530, bsz=16773, num_updates=32600, lr=0.000350285, gnorm=0.21, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=32364 epoch 020: 661 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=461181, ups=1.06, wpb=434638, bsz=16425.8, num_updates=32700, lr=0.000349749, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=32458 epoch 020: 661 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=461181, ups=1.06, wpb=434638, bsz=16425.8, num_updates=32700, lr=0.000349749, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=32458 epoch 020: 661 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=461181, ups=1.06, wpb=434638, bsz=16425.8, num_updates=32700, lr=0.000349749, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=32458 epoch 020: 661 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=461181, ups=1.06, wpb=434638, bsz=16425.8, num_updates=32700, lr=0.000349749, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=32458 epoch 020: 661 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=461181, ups=1.06, wpb=434638, bsz=16425.8, num_updates=32700, lr=0.000349749, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=32458 epoch 020: 661 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=461181, ups=1.06, wpb=434638, bsz=16425.8, num_updates=32700, lr=0.000349749, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=32458 epoch 020: 661 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=461181, ups=1.06, wpb=434638, bsz=16425.8, num_updates=32700, lr=0.000349749, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=32458 epoch 020: 661 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=461181, ups=1.06, wpb=434638, bsz=16425.8, num_updates=32700, lr=0.000349749, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=32458 epoch 020: 661 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=461181, ups=1.06, wpb=434638, bsz=16425.8, num_updates=32700, lr=0.000349749, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=32458 epoch 020: 661 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=461181, ups=1.06, wpb=434638, bsz=16425.8, num_updates=32700, lr=0.000349749, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=32458 epoch 020: 661 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=461181, ups=1.06, wpb=434638, bsz=16425.8, num_updates=32700, lr=0.000349749, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=32458 epoch 020: 661 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=461181, ups=1.06, wpb=434638, bsz=16425.8, num_updates=32700, lr=0.000349749, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=32458 epoch 020: 661 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=461181, ups=1.06, wpb=434638, bsz=16425.8, num_updates=32700, lr=0.000349749, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=32458 epoch 020: 661 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=461181, ups=1.06, wpb=434638, bsz=16425.8, num_updates=32700, lr=0.000349749, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=32458 epoch 020: 661 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=461181, ups=1.06, wpb=434638, bsz=16425.8, num_updates=32700, lr=0.000349749, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=32458 epoch 020: 661 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=461181, ups=1.06, wpb=434638, bsz=16425.8, num_updates=32700, lr=0.000349749, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=32458 epoch 020: 661 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=461181, ups=1.06, wpb=434638, bsz=16425.8, num_updates=32700, lr=0.000349749, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=32458 epoch 020: 661 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=461181, ups=1.06, wpb=434638, bsz=16425.8, num_updates=32700, lr=0.000349749, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=32458 epoch 020: 661 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=461181, ups=1.06, wpb=434638, bsz=16425.8, num_updates=32700, lr=0.000349749, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=32458 epoch 020: 661 / 1689 loss=4.025, nll_loss=2.39, ppl=5.24, wps=461181, ups=1.06, wpb=434638, bsz=16425.8, num_updates=32700, lr=0.000349749, gnorm=0.229, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=32458 epoch 020: 762 / 1689 loss=4.029, nll_loss=2.396, ppl=5.26, wps=454403, ups=1.05, wpb=433268, bsz=16928.3, num_updates=32800, lr=0.000349215, gnorm=0.194, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=32554 epoch 020: 762 / 1689 loss=4.029, nll_loss=2.396, ppl=5.26, wps=454403, ups=1.05, wpb=433268, bsz=16928.3, num_updates=32800, lr=0.000349215, gnorm=0.194, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=32554 epoch 020: 762 / 1689 loss=4.029, nll_loss=2.396, ppl=5.26, wps=454403, ups=1.05, wpb=433268, bsz=16928.3, num_updates=32800, lr=0.000349215, gnorm=0.194, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=32554 epoch 020: 762 / 1689 loss=4.029, nll_loss=2.396, ppl=5.26, wps=454403, ups=1.05, wpb=433268, bsz=16928.3, num_updates=32800, lr=0.000349215, gnorm=0.194, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=32554 epoch 020: 762 / 1689 loss=4.029, nll_loss=2.396, ppl=5.26, wps=454403, ups=1.05, wpb=433268, bsz=16928.3, num_updates=32800, lr=0.000349215, gnorm=0.194, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=32554 epoch 020: 762 / 1689 loss=4.029, nll_loss=2.396, ppl=5.26, wps=454403, ups=1.05, wpb=433268, bsz=16928.3, num_updates=32800, lr=0.000349215, gnorm=0.194, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=32554 epoch 020: 762 / 1689 loss=4.029, nll_loss=2.396, ppl=5.26, wps=454403, ups=1.05, wpb=433268, bsz=16928.3, num_updates=32800, lr=0.000349215, gnorm=0.194, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=32554 epoch 020: 762 / 1689 loss=4.029, nll_loss=2.396, ppl=5.26, wps=454403, ups=1.05, wpb=433268, bsz=16928.3, num_updates=32800, lr=0.000349215, gnorm=0.194, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=32554 epoch 020: 762 / 1689 loss=4.029, nll_loss=2.396, ppl=5.26, wps=454403, ups=1.05, wpb=433268, bsz=16928.3, num_updates=32800, lr=0.000349215, gnorm=0.194, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=32554 epoch 020: 762 / 1689 loss=4.029, nll_loss=2.396, ppl=5.26, wps=454403, ups=1.05, wpb=433268, bsz=16928.3, num_updates=32800, lr=0.000349215, gnorm=0.194, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=32554 epoch 020: 762 / 1689 loss=4.029, nll_loss=2.396, ppl=5.26, wps=454403, ups=1.05, wpb=433268, bsz=16928.3, num_updates=32800, lr=0.000349215, gnorm=0.194, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=32554 epoch 020: 762 / 1689 loss=4.029, nll_loss=2.396, ppl=5.26, wps=454403, ups=1.05, wpb=433268, bsz=16928.3, num_updates=32800, lr=0.000349215, gnorm=0.194, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=32554 epoch 020: 762 / 1689 loss=4.029, nll_loss=2.396, ppl=5.26, wps=454403, ups=1.05, wpb=433268, bsz=16928.3, num_updates=32800, lr=0.000349215, gnorm=0.194, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=32554 epoch 020: 762 / 1689 loss=4.029, nll_loss=2.396, ppl=5.26, wps=454403, ups=1.05, wpb=433268, bsz=16928.3, num_updates=32800, lr=0.000349215, gnorm=0.194, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=32554 epoch 020: 762 / 1689 loss=4.029, nll_loss=2.396, ppl=5.26, wps=454403, ups=1.05, wpb=433268, bsz=16928.3, num_updates=32800, lr=0.000349215, gnorm=0.194, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=32554 epoch 020: 762 / 1689 loss=4.029, nll_loss=2.396, ppl=5.26, wps=454403, ups=1.05, wpb=433268, bsz=16928.3, num_updates=32800, lr=0.000349215, gnorm=0.194, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=32554 epoch 020: 762 / 1689 loss=4.029, nll_loss=2.396, ppl=5.26, wps=454403, ups=1.05, wpb=433268, bsz=16928.3, num_updates=32800, lr=0.000349215, gnorm=0.194, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=32554 epoch 020: 762 / 1689 loss=4.029, nll_loss=2.396, ppl=5.26, wps=454403, ups=1.05, wpb=433268, bsz=16928.3, num_updates=32800, lr=0.000349215, gnorm=0.194, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=32554 epoch 020: 762 / 1689 loss=4.029, nll_loss=2.396, ppl=5.26, wps=454403, ups=1.05, wpb=433268, bsz=16928.3, num_updates=32800, lr=0.000349215, gnorm=0.194, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=32554 epoch 020: 762 / 1689 loss=4.029, nll_loss=2.396, ppl=5.26, wps=454403, ups=1.05, wpb=433268, bsz=16928.3, num_updates=32800, lr=0.000349215, gnorm=0.194, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=32554 epoch 020: 863 / 1689 loss=4.02, nll_loss=2.385, ppl=5.22, wps=453619, ups=1.05, wpb=433247, bsz=16447, num_updates=32900, lr=0.000348684, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=95, gb_free=19.1, wall=32649 epoch 020: 863 / 1689 loss=4.02, nll_loss=2.385, ppl=5.22, wps=453619, ups=1.05, wpb=433247, bsz=16447, num_updates=32900, lr=0.000348684, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=95, gb_free=19.1, wall=32649 epoch 020: 863 / 1689 loss=4.02, nll_loss=2.385, ppl=5.22, wps=453619, ups=1.05, wpb=433247, bsz=16447, num_updates=32900, lr=0.000348684, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=95, gb_free=19.1, wall=32649 epoch 020: 863 / 1689 loss=4.02, nll_loss=2.385, ppl=5.22, wps=453619, ups=1.05, wpb=433247, bsz=16447, num_updates=32900, lr=0.000348684, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=95, gb_free=19.1, wall=32649 epoch 020: 863 / 1689 loss=4.02, nll_loss=2.385, ppl=5.22, wps=453619, ups=1.05, wpb=433247, bsz=16447, num_updates=32900, lr=0.000348684, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=95, gb_free=19.1, wall=32649 epoch 020: 863 / 1689 loss=4.02, nll_loss=2.385, ppl=5.22, wps=453619, ups=1.05, wpb=433247, bsz=16447, num_updates=32900, lr=0.000348684, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=95, gb_free=19.1, wall=32649 epoch 020: 863 / 1689 loss=4.02, nll_loss=2.385, ppl=5.22, wps=453619, ups=1.05, wpb=433247, bsz=16447, num_updates=32900, lr=0.000348684, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=95, gb_free=19.1, wall=32649 epoch 020: 863 / 1689 loss=4.02, nll_loss=2.385, ppl=5.22, wps=453619, ups=1.05, wpb=433247, bsz=16447, num_updates=32900, lr=0.000348684, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=95, gb_free=19.1, wall=32649 epoch 020: 863 / 1689 loss=4.02, nll_loss=2.385, ppl=5.22, wps=453619, ups=1.05, wpb=433247, bsz=16447, num_updates=32900, lr=0.000348684, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=95, gb_free=19.1, wall=32649 epoch 020: 863 / 1689 loss=4.02, nll_loss=2.385, ppl=5.22, wps=453619, ups=1.05, wpb=433247, bsz=16447, num_updates=32900, lr=0.000348684, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=95, gb_free=19.1, wall=32649 epoch 020: 863 / 1689 loss=4.02, nll_loss=2.385, ppl=5.22, wps=453619, ups=1.05, wpb=433247, bsz=16447, num_updates=32900, lr=0.000348684, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=95, gb_free=19.1, wall=32649 epoch 020: 863 / 1689 loss=4.02, nll_loss=2.385, ppl=5.22, wps=453619, ups=1.05, wpb=433247, bsz=16447, num_updates=32900, lr=0.000348684, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=95, gb_free=19.1, wall=32649 epoch 020: 863 / 1689 loss=4.02, nll_loss=2.385, ppl=5.22, wps=453619, ups=1.05, wpb=433247, bsz=16447, num_updates=32900, lr=0.000348684, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=95, gb_free=19.1, wall=32649 epoch 020: 863 / 1689 loss=4.02, nll_loss=2.385, ppl=5.22, wps=453619, ups=1.05, wpb=433247, bsz=16447, num_updates=32900, lr=0.000348684, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=95, gb_free=19.1, wall=32649 epoch 020: 863 / 1689 loss=4.02, nll_loss=2.385, ppl=5.22, wps=453619, ups=1.05, wpb=433247, bsz=16447, num_updates=32900, lr=0.000348684, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=95, gb_free=19.1, wall=32649 epoch 020: 863 / 1689 loss=4.02, nll_loss=2.385, ppl=5.22, wps=453619, ups=1.05, wpb=433247, bsz=16447, num_updates=32900, lr=0.000348684, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=95, gb_free=19.1, wall=32649 epoch 020: 863 / 1689 loss=4.02, nll_loss=2.385, ppl=5.22, wps=453619, ups=1.05, wpb=433247, bsz=16447, num_updates=32900, lr=0.000348684, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=95, gb_free=19.1, wall=32649 epoch 020: 863 / 1689 loss=4.02, nll_loss=2.385, ppl=5.22, wps=453619, ups=1.05, wpb=433247, bsz=16447, num_updates=32900, lr=0.000348684, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=95, gb_free=19.1, wall=32649 epoch 020: 863 / 1689 loss=4.02, nll_loss=2.385, ppl=5.22, wps=453619, ups=1.05, wpb=433247, bsz=16447, num_updates=32900, lr=0.000348684, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=95, gb_free=19.1, wall=32649 epoch 020: 863 / 1689 loss=4.02, nll_loss=2.385, ppl=5.22, wps=453619, ups=1.05, wpb=433247, bsz=16447, num_updates=32900, lr=0.000348684, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=95, gb_free=19.1, wall=32649 epoch 020: 963 / 1689 loss=4.018, nll_loss=2.383, ppl=5.21, wps=461323, ups=1.07, wpb=431716, bsz=16479.9, num_updates=33000, lr=0.000348155, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=32743 epoch 020: 963 / 1689 loss=4.018, nll_loss=2.383, ppl=5.21, wps=461323, ups=1.07, wpb=431716, bsz=16479.9, num_updates=33000, lr=0.000348155, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=32743 epoch 020: 963 / 1689 loss=4.018, nll_loss=2.383, ppl=5.21, wps=461323, ups=1.07, wpb=431716, bsz=16479.9, num_updates=33000, lr=0.000348155, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=32743 epoch 020: 963 / 1689 loss=4.018, nll_loss=2.383, ppl=5.21, wps=461323, ups=1.07, wpb=431716, bsz=16479.9, num_updates=33000, lr=0.000348155, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=32743 epoch 020: 963 / 1689 loss=4.018, nll_loss=2.383, ppl=5.21, wps=461323, ups=1.07, wpb=431716, bsz=16479.9, num_updates=33000, lr=0.000348155, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=32743 epoch 020: 963 / 1689 loss=4.018, nll_loss=2.383, ppl=5.21, wps=461323, ups=1.07, wpb=431716, bsz=16479.9, num_updates=33000, lr=0.000348155, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=32743 epoch 020: 963 / 1689 loss=4.018, nll_loss=2.383, ppl=5.21, wps=461323, ups=1.07, wpb=431716, bsz=16479.9, num_updates=33000, lr=0.000348155, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=32743 epoch 020: 963 / 1689 loss=4.018, nll_loss=2.383, ppl=5.21, wps=461323, ups=1.07, wpb=431716, bsz=16479.9, num_updates=33000, lr=0.000348155, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=32743 epoch 020: 963 / 1689 loss=4.018, nll_loss=2.383, ppl=5.21, wps=461323, ups=1.07, wpb=431716, bsz=16479.9, num_updates=33000, lr=0.000348155, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=32743 epoch 020: 963 / 1689 loss=4.018, nll_loss=2.383, ppl=5.21, wps=461323, ups=1.07, wpb=431716, bsz=16479.9, num_updates=33000, lr=0.000348155, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=32743 epoch 020: 963 / 1689 loss=4.018, nll_loss=2.383, ppl=5.21, wps=461323, ups=1.07, wpb=431716, bsz=16479.9, num_updates=33000, lr=0.000348155, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=32743 epoch 020: 963 / 1689 loss=4.018, nll_loss=2.383, ppl=5.21, wps=461323, ups=1.07, wpb=431716, bsz=16479.9, num_updates=33000, lr=0.000348155, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=32743 epoch 020: 963 / 1689 loss=4.018, nll_loss=2.383, ppl=5.21, wps=461323, ups=1.07, wpb=431716, bsz=16479.9, num_updates=33000, lr=0.000348155, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=32743 epoch 020: 963 / 1689 loss=4.018, nll_loss=2.383, ppl=5.21, wps=461323, ups=1.07, wpb=431716, bsz=16479.9, num_updates=33000, lr=0.000348155, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=32743 epoch 020: 963 / 1689 loss=4.018, nll_loss=2.383, ppl=5.21, wps=461323, ups=1.07, wpb=431716, bsz=16479.9, num_updates=33000, lr=0.000348155, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=32743 epoch 020: 963 / 1689 loss=4.018, nll_loss=2.383, ppl=5.21, wps=461323, ups=1.07, wpb=431716, bsz=16479.9, num_updates=33000, lr=0.000348155, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=32743 epoch 020: 963 / 1689 loss=4.018, nll_loss=2.383, ppl=5.21, wps=461323, ups=1.07, wpb=431716, bsz=16479.9, num_updates=33000, lr=0.000348155, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=32743 epoch 020: 963 / 1689 loss=4.018, nll_loss=2.383, ppl=5.21, wps=461323, ups=1.07, wpb=431716, bsz=16479.9, num_updates=33000, lr=0.000348155, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=32743 epoch 020: 963 / 1689 loss=4.018, nll_loss=2.383, ppl=5.21, wps=461323, ups=1.07, wpb=431716, bsz=16479.9, num_updates=33000, lr=0.000348155, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=32743 epoch 020: 963 / 1689 loss=4.018, nll_loss=2.383, ppl=5.21, wps=461323, ups=1.07, wpb=431716, bsz=16479.9, num_updates=33000, lr=0.000348155, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=32743 begin validation on "valid" subset epoch 020 | valid on 'valid' subset | loss 4.255 | nll_loss 2.632 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.249 epoch 020 | valid on 'valid' subset | loss 4.255 | nll_loss 2.632 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.249 epoch 020 | valid on 'valid' subset | loss 4.255 | nll_loss 2.632 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.249 epoch 020 | valid on 'valid' subset | loss 4.255 | nll_loss 2.632 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.249 epoch 020 | valid on 'valid' subset | loss 4.255 | nll_loss 2.632 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.249 epoch 020 | valid on 'valid' subset | loss 4.255 | nll_loss 2.632 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.249 epoch 020 | valid on 'valid' subset | loss 4.255 | nll_loss 2.632 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.249 epoch 020 | valid on 'valid' subset | loss 4.255 | nll_loss 2.632 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.249 epoch 020 | valid on 'valid' subset | loss 4.255 | nll_loss 2.632 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.249 epoch 020 | valid on 'valid' subset | loss 4.255 | nll_loss 2.632 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.249 epoch 020 | valid on 'valid' subset | loss 4.255 | nll_loss 2.632 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.249 epoch 020 | valid on 'valid' subset | loss 4.255 | nll_loss 2.632 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.249 epoch 020 | valid on 'valid' subset | loss 4.255 | nll_loss 2.632 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.249 epoch 020 | valid on 'valid' subset | loss 4.255 | nll_loss 2.632 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.249 epoch 020 | valid on 'valid' subset | loss 4.255 | nll_loss 2.632 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.249 epoch 020 | valid on 'valid' subset | loss 4.255 | nll_loss 2.632 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.249 epoch 020 | valid on 'valid' subset | loss 4.255 | nll_loss 2.632 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.249 epoch 020 | valid on 'valid' subset | loss 4.255 | nll_loss 2.632 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.249 epoch 020 | valid on 'valid' subset | loss 4.255 | nll_loss 2.632 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.249 epoch 020 | valid on 'valid' subset | loss 4.255 | nll_loss 2.632 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 33000 | best_loss 4.249 epoch 020: 1063 / 1689 loss=4.031, nll_loss=2.398, ppl=5.27, wps=408476, ups=0.94, wpb=433264, bsz=16273.4, num_updates=33100, lr=0.000347629, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=32849 epoch 020: 1063 / 1689 loss=4.031, nll_loss=2.398, ppl=5.27, wps=408476, ups=0.94, wpb=433264, bsz=16273.4, num_updates=33100, lr=0.000347629, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=32849 epoch 020: 1063 / 1689 loss=4.031, nll_loss=2.398, ppl=5.27, wps=408476, ups=0.94, wpb=433264, bsz=16273.4, num_updates=33100, lr=0.000347629, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=32849 epoch 020: 1063 / 1689 loss=4.031, nll_loss=2.398, ppl=5.27, wps=408476, ups=0.94, wpb=433264, bsz=16273.4, num_updates=33100, lr=0.000347629, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=32849 epoch 020: 1063 / 1689 loss=4.031, nll_loss=2.398, ppl=5.27, wps=408476, ups=0.94, wpb=433264, bsz=16273.4, num_updates=33100, lr=0.000347629, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=32849 epoch 020: 1063 / 1689 loss=4.031, nll_loss=2.398, ppl=5.27, wps=408476, ups=0.94, wpb=433264, bsz=16273.4, num_updates=33100, lr=0.000347629, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=32849 epoch 020: 1063 / 1689 loss=4.031, nll_loss=2.398, ppl=5.27, wps=408476, ups=0.94, wpb=433264, bsz=16273.4, num_updates=33100, lr=0.000347629, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=32849 epoch 020: 1063 / 1689 loss=4.031, nll_loss=2.398, ppl=5.27, wps=408476, ups=0.94, wpb=433264, bsz=16273.4, num_updates=33100, lr=0.000347629, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=32849 epoch 020: 1063 / 1689 loss=4.031, nll_loss=2.398, ppl=5.27, wps=408476, ups=0.94, wpb=433264, bsz=16273.4, num_updates=33100, lr=0.000347629, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=32849 epoch 020: 1063 / 1689 loss=4.031, nll_loss=2.398, ppl=5.27, wps=408476, ups=0.94, wpb=433264, bsz=16273.4, num_updates=33100, lr=0.000347629, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=32849 epoch 020: 1063 / 1689 loss=4.031, nll_loss=2.398, ppl=5.27, wps=408476, ups=0.94, wpb=433264, bsz=16273.4, num_updates=33100, lr=0.000347629, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=32849 epoch 020: 1063 / 1689 loss=4.031, nll_loss=2.398, ppl=5.27, wps=408476, ups=0.94, wpb=433264, bsz=16273.4, num_updates=33100, lr=0.000347629, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=32849 epoch 020: 1063 / 1689 loss=4.031, nll_loss=2.398, ppl=5.27, wps=408476, ups=0.94, wpb=433264, bsz=16273.4, num_updates=33100, lr=0.000347629, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=32849 epoch 020: 1063 / 1689 loss=4.031, nll_loss=2.398, ppl=5.27, wps=408476, ups=0.94, wpb=433264, bsz=16273.4, num_updates=33100, lr=0.000347629, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=32849 epoch 020: 1063 / 1689 loss=4.031, nll_loss=2.398, ppl=5.27, wps=408476, ups=0.94, wpb=433264, bsz=16273.4, num_updates=33100, lr=0.000347629, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=32849 epoch 020: 1063 / 1689 loss=4.031, nll_loss=2.398, ppl=5.27, wps=408476, ups=0.94, wpb=433264, bsz=16273.4, num_updates=33100, lr=0.000347629, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=32849 epoch 020: 1063 / 1689 loss=4.031, nll_loss=2.398, ppl=5.27, wps=408476, ups=0.94, wpb=433264, bsz=16273.4, num_updates=33100, lr=0.000347629, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=32849 epoch 020: 1063 / 1689 loss=4.031, nll_loss=2.398, ppl=5.27, wps=408476, ups=0.94, wpb=433264, bsz=16273.4, num_updates=33100, lr=0.000347629, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=32849 epoch 020: 1063 / 1689 loss=4.031, nll_loss=2.398, ppl=5.27, wps=408476, ups=0.94, wpb=433264, bsz=16273.4, num_updates=33100, lr=0.000347629, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=32849 epoch 020: 1063 / 1689 loss=4.031, nll_loss=2.398, ppl=5.27, wps=408476, ups=0.94, wpb=433264, bsz=16273.4, num_updates=33100, lr=0.000347629, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=32849 epoch 020: 1163 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=461312, ups=1.06, wpb=435655, bsz=16538.6, num_updates=33200, lr=0.000347105, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=32943 epoch 020: 1163 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=461312, ups=1.06, wpb=435655, bsz=16538.6, num_updates=33200, lr=0.000347105, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=32943 epoch 020: 1163 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=461312, ups=1.06, wpb=435655, bsz=16538.6, num_updates=33200, lr=0.000347105, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=32943 epoch 020: 1163 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=461312, ups=1.06, wpb=435655, bsz=16538.6, num_updates=33200, lr=0.000347105, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=32943 epoch 020: 1163 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=461312, ups=1.06, wpb=435655, bsz=16538.6, num_updates=33200, lr=0.000347105, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=32943 epoch 020: 1163 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=461312, ups=1.06, wpb=435655, bsz=16538.6, num_updates=33200, lr=0.000347105, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=32943 epoch 020: 1163 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=461312, ups=1.06, wpb=435655, bsz=16538.6, num_updates=33200, lr=0.000347105, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=32943 epoch 020: 1163 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=461312, ups=1.06, wpb=435655, bsz=16538.6, num_updates=33200, lr=0.000347105, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=32943 epoch 020: 1163 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=461312, ups=1.06, wpb=435655, bsz=16538.6, num_updates=33200, lr=0.000347105, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=32943 epoch 020: 1163 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=461312, ups=1.06, wpb=435655, bsz=16538.6, num_updates=33200, lr=0.000347105, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=32943 epoch 020: 1163 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=461312, ups=1.06, wpb=435655, bsz=16538.6, num_updates=33200, lr=0.000347105, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=32943 epoch 020: 1163 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=461312, ups=1.06, wpb=435655, bsz=16538.6, num_updates=33200, lr=0.000347105, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=32943 epoch 020: 1163 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=461312, ups=1.06, wpb=435655, bsz=16538.6, num_updates=33200, lr=0.000347105, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=32943 epoch 020: 1163 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=461312, ups=1.06, wpb=435655, bsz=16538.6, num_updates=33200, lr=0.000347105, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=32943 epoch 020: 1163 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=461312, ups=1.06, wpb=435655, bsz=16538.6, num_updates=33200, lr=0.000347105, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=32943 epoch 020: 1163 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=461312, ups=1.06, wpb=435655, bsz=16538.6, num_updates=33200, lr=0.000347105, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=32943 epoch 020: 1163 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=461312, ups=1.06, wpb=435655, bsz=16538.6, num_updates=33200, lr=0.000347105, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=32943 epoch 020: 1163 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=461312, ups=1.06, wpb=435655, bsz=16538.6, num_updates=33200, lr=0.000347105, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=32943 epoch 020: 1163 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=461312, ups=1.06, wpb=435655, bsz=16538.6, num_updates=33200, lr=0.000347105, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=32943 epoch 020: 1163 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=461312, ups=1.06, wpb=435655, bsz=16538.6, num_updates=33200, lr=0.000347105, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=32943 epoch 020: 1263 / 1689 loss=4.034, nll_loss=2.401, ppl=5.28, wps=462542, ups=1.06, wpb=435096, bsz=16563.5, num_updates=33300, lr=0.000346583, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=33037 epoch 020: 1263 / 1689 loss=4.034, nll_loss=2.401, ppl=5.28, wps=462542, ups=1.06, wpb=435096, bsz=16563.5, num_updates=33300, lr=0.000346583, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=33037 epoch 020: 1263 / 1689 loss=4.034, nll_loss=2.401, ppl=5.28, wps=462542, ups=1.06, wpb=435096, bsz=16563.5, num_updates=33300, lr=0.000346583, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=33037 epoch 020: 1263 / 1689 loss=4.034, nll_loss=2.401, ppl=5.28, wps=462542, ups=1.06, wpb=435096, bsz=16563.5, num_updates=33300, lr=0.000346583, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=33037 epoch 020: 1263 / 1689 loss=4.034, nll_loss=2.401, ppl=5.28, wps=462542, ups=1.06, wpb=435096, bsz=16563.5, num_updates=33300, lr=0.000346583, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=33037 epoch 020: 1263 / 1689 loss=4.034, nll_loss=2.401, ppl=5.28, wps=462542, ups=1.06, wpb=435096, bsz=16563.5, num_updates=33300, lr=0.000346583, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=33037 epoch 020: 1263 / 1689 loss=4.034, nll_loss=2.401, ppl=5.28, wps=462542, ups=1.06, wpb=435096, bsz=16563.5, num_updates=33300, lr=0.000346583, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=33037 epoch 020: 1263 / 1689 loss=4.034, nll_loss=2.401, ppl=5.28, wps=462542, ups=1.06, wpb=435096, bsz=16563.5, num_updates=33300, lr=0.000346583, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=33037 epoch 020: 1263 / 1689 loss=4.034, nll_loss=2.401, ppl=5.28, wps=462542, ups=1.06, wpb=435096, bsz=16563.5, num_updates=33300, lr=0.000346583, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=33037 epoch 020: 1263 / 1689 loss=4.034, nll_loss=2.401, ppl=5.28, wps=462542, ups=1.06, wpb=435096, bsz=16563.5, num_updates=33300, lr=0.000346583, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=33037 epoch 020: 1263 / 1689 loss=4.034, nll_loss=2.401, ppl=5.28, wps=462542, ups=1.06, wpb=435096, bsz=16563.5, num_updates=33300, lr=0.000346583, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=33037 epoch 020: 1263 / 1689 loss=4.034, nll_loss=2.401, ppl=5.28, wps=462542, ups=1.06, wpb=435096, bsz=16563.5, num_updates=33300, lr=0.000346583, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=33037 epoch 020: 1263 / 1689 loss=4.034, nll_loss=2.401, ppl=5.28, wps=462542, ups=1.06, wpb=435096, bsz=16563.5, num_updates=33300, lr=0.000346583, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=33037 epoch 020: 1263 / 1689 loss=4.034, nll_loss=2.401, ppl=5.28, wps=462542, ups=1.06, wpb=435096, bsz=16563.5, num_updates=33300, lr=0.000346583, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=33037 epoch 020: 1263 / 1689 loss=4.034, nll_loss=2.401, ppl=5.28, wps=462542, ups=1.06, wpb=435096, bsz=16563.5, num_updates=33300, lr=0.000346583, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=33037 epoch 020: 1263 / 1689 loss=4.034, nll_loss=2.401, ppl=5.28, wps=462542, ups=1.06, wpb=435096, bsz=16563.5, num_updates=33300, lr=0.000346583, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=33037 epoch 020: 1263 / 1689 loss=4.034, nll_loss=2.401, ppl=5.28, wps=462542, ups=1.06, wpb=435096, bsz=16563.5, num_updates=33300, lr=0.000346583, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=33037 epoch 020: 1263 / 1689 loss=4.034, nll_loss=2.401, ppl=5.28, wps=462542, ups=1.06, wpb=435096, bsz=16563.5, num_updates=33300, lr=0.000346583, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=33037 epoch 020: 1263 / 1689 loss=4.034, nll_loss=2.401, ppl=5.28, wps=462542, ups=1.06, wpb=435096, bsz=16563.5, num_updates=33300, lr=0.000346583, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=33037 epoch 020: 1263 / 1689 loss=4.034, nll_loss=2.401, ppl=5.28, wps=462542, ups=1.06, wpb=435096, bsz=16563.5, num_updates=33300, lr=0.000346583, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=33037 epoch 020: 1363 / 1689 loss=4.034, nll_loss=2.401, ppl=5.28, wps=465338, ups=1.08, wpb=432693, bsz=16370.2, num_updates=33400, lr=0.000346064, gnorm=0.203, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=33130 epoch 020: 1363 / 1689 loss=4.034, nll_loss=2.401, ppl=5.28, wps=465338, ups=1.08, wpb=432693, bsz=16370.2, num_updates=33400, lr=0.000346064, gnorm=0.203, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=33130 epoch 020: 1363 / 1689 loss=4.034, nll_loss=2.401, ppl=5.28, wps=465338, ups=1.08, wpb=432693, bsz=16370.2, num_updates=33400, lr=0.000346064, gnorm=0.203, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=33130 epoch 020: 1363 / 1689 loss=4.034, nll_loss=2.401, ppl=5.28, wps=465338, ups=1.08, wpb=432693, bsz=16370.2, num_updates=33400, lr=0.000346064, gnorm=0.203, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=33130 epoch 020: 1363 / 1689 loss=4.034, nll_loss=2.401, ppl=5.28, wps=465338, ups=1.08, wpb=432693, bsz=16370.2, num_updates=33400, lr=0.000346064, gnorm=0.203, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=33130 epoch 020: 1363 / 1689 loss=4.034, nll_loss=2.401, ppl=5.28, wps=465338, ups=1.08, wpb=432693, bsz=16370.2, num_updates=33400, lr=0.000346064, gnorm=0.203, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=33130 epoch 020: 1363 / 1689 loss=4.034, nll_loss=2.401, ppl=5.28, wps=465338, ups=1.08, wpb=432693, bsz=16370.2, num_updates=33400, lr=0.000346064, gnorm=0.203, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=33130 epoch 020: 1363 / 1689 loss=4.034, nll_loss=2.401, ppl=5.28, wps=465338, ups=1.08, wpb=432693, bsz=16370.2, num_updates=33400, lr=0.000346064, gnorm=0.203, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=33130 epoch 020: 1363 / 1689 loss=4.034, nll_loss=2.401, ppl=5.28, wps=465338, ups=1.08, wpb=432693, bsz=16370.2, num_updates=33400, lr=0.000346064, gnorm=0.203, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=33130 epoch 020: 1363 / 1689 loss=4.034, nll_loss=2.401, ppl=5.28, wps=465338, ups=1.08, wpb=432693, bsz=16370.2, num_updates=33400, lr=0.000346064, gnorm=0.203, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=33130 epoch 020: 1363 / 1689 loss=4.034, nll_loss=2.401, ppl=5.28, wps=465338, ups=1.08, wpb=432693, bsz=16370.2, num_updates=33400, lr=0.000346064, gnorm=0.203, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=33130 epoch 020: 1363 / 1689 loss=4.034, nll_loss=2.401, ppl=5.28, wps=465338, ups=1.08, wpb=432693, bsz=16370.2, num_updates=33400, lr=0.000346064, gnorm=0.203, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=33130 epoch 020: 1363 / 1689 loss=4.034, nll_loss=2.401, ppl=5.28, wps=465338, ups=1.08, wpb=432693, bsz=16370.2, num_updates=33400, lr=0.000346064, gnorm=0.203, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=33130 epoch 020: 1363 / 1689 loss=4.034, nll_loss=2.401, ppl=5.28, wps=465338, ups=1.08, wpb=432693, bsz=16370.2, num_updates=33400, lr=0.000346064, gnorm=0.203, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=33130 epoch 020: 1363 / 1689 loss=4.034, nll_loss=2.401, ppl=5.28, wps=465338, ups=1.08, wpb=432693, bsz=16370.2, num_updates=33400, lr=0.000346064, gnorm=0.203, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=33130 epoch 020: 1363 / 1689 loss=4.034, nll_loss=2.401, ppl=5.28, wps=465338, ups=1.08, wpb=432693, bsz=16370.2, num_updates=33400, lr=0.000346064, gnorm=0.203, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=33130 epoch 020: 1363 / 1689 loss=4.034, nll_loss=2.401, ppl=5.28, wps=465338, ups=1.08, wpb=432693, bsz=16370.2, num_updates=33400, lr=0.000346064, gnorm=0.203, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=33130 epoch 020: 1363 / 1689 loss=4.034, nll_loss=2.401, ppl=5.28, wps=465338, ups=1.08, wpb=432693, bsz=16370.2, num_updates=33400, lr=0.000346064, gnorm=0.203, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=33130 epoch 020: 1363 / 1689 loss=4.034, nll_loss=2.401, ppl=5.28, wps=465338, ups=1.08, wpb=432693, bsz=16370.2, num_updates=33400, lr=0.000346064, gnorm=0.203, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=33130 epoch 020: 1363 / 1689 loss=4.034, nll_loss=2.401, ppl=5.28, wps=465338, ups=1.08, wpb=432693, bsz=16370.2, num_updates=33400, lr=0.000346064, gnorm=0.203, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=33130 epoch 020: 1463 / 1689 loss=4.036, nll_loss=2.404, ppl=5.29, wps=457802, ups=1.06, wpb=432798, bsz=16465.8, num_updates=33500, lr=0.000345547, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=33225 epoch 020: 1463 / 1689 loss=4.036, nll_loss=2.404, ppl=5.29, wps=457802, ups=1.06, wpb=432798, bsz=16465.8, num_updates=33500, lr=0.000345547, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=33225 epoch 020: 1463 / 1689 loss=4.036, nll_loss=2.404, ppl=5.29, wps=457802, ups=1.06, wpb=432798, bsz=16465.8, num_updates=33500, lr=0.000345547, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=33225 epoch 020: 1463 / 1689 loss=4.036, nll_loss=2.404, ppl=5.29, wps=457802, ups=1.06, wpb=432798, bsz=16465.8, num_updates=33500, lr=0.000345547, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=33225 epoch 020: 1463 / 1689 loss=4.036, nll_loss=2.404, ppl=5.29, wps=457802, ups=1.06, wpb=432798, bsz=16465.8, num_updates=33500, lr=0.000345547, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=33225 epoch 020: 1463 / 1689 loss=4.036, nll_loss=2.404, ppl=5.29, wps=457802, ups=1.06, wpb=432798, bsz=16465.8, num_updates=33500, lr=0.000345547, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=33225 epoch 020: 1463 / 1689 loss=4.036, nll_loss=2.404, ppl=5.29, wps=457802, ups=1.06, wpb=432798, bsz=16465.8, num_updates=33500, lr=0.000345547, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=33225 epoch 020: 1463 / 1689 loss=4.036, nll_loss=2.404, ppl=5.29, wps=457802, ups=1.06, wpb=432798, bsz=16465.8, num_updates=33500, lr=0.000345547, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=33225 epoch 020: 1463 / 1689 loss=4.036, nll_loss=2.404, ppl=5.29, wps=457802, ups=1.06, wpb=432798, bsz=16465.8, num_updates=33500, lr=0.000345547, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=33225 epoch 020: 1463 / 1689 loss=4.036, nll_loss=2.404, ppl=5.29, wps=457802, ups=1.06, wpb=432798, bsz=16465.8, num_updates=33500, lr=0.000345547, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=33225 epoch 020: 1463 / 1689 loss=4.036, nll_loss=2.404, ppl=5.29, wps=457802, ups=1.06, wpb=432798, bsz=16465.8, num_updates=33500, lr=0.000345547, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=33225 epoch 020: 1463 / 1689 loss=4.036, nll_loss=2.404, ppl=5.29, wps=457802, ups=1.06, wpb=432798, bsz=16465.8, num_updates=33500, lr=0.000345547, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=33225 epoch 020: 1463 / 1689 loss=4.036, nll_loss=2.404, ppl=5.29, wps=457802, ups=1.06, wpb=432798, bsz=16465.8, num_updates=33500, lr=0.000345547, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=33225 epoch 020: 1463 / 1689 loss=4.036, nll_loss=2.404, ppl=5.29, wps=457802, ups=1.06, wpb=432798, bsz=16465.8, num_updates=33500, lr=0.000345547, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=33225 epoch 020: 1463 / 1689 loss=4.036, nll_loss=2.404, ppl=5.29, wps=457802, ups=1.06, wpb=432798, bsz=16465.8, num_updates=33500, lr=0.000345547, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=33225 epoch 020: 1463 / 1689 loss=4.036, nll_loss=2.404, ppl=5.29, wps=457802, ups=1.06, wpb=432798, bsz=16465.8, num_updates=33500, lr=0.000345547, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=33225 epoch 020: 1463 / 1689 loss=4.036, nll_loss=2.404, ppl=5.29, wps=457802, ups=1.06, wpb=432798, bsz=16465.8, num_updates=33500, lr=0.000345547, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=33225 epoch 020: 1463 / 1689 loss=4.036, nll_loss=2.404, ppl=5.29, wps=457802, ups=1.06, wpb=432798, bsz=16465.8, num_updates=33500, lr=0.000345547, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=33225 epoch 020: 1463 / 1689 loss=4.036, nll_loss=2.404, ppl=5.29, wps=457802, ups=1.06, wpb=432798, bsz=16465.8, num_updates=33500, lr=0.000345547, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=33225 epoch 020: 1463 / 1689 loss=4.036, nll_loss=2.404, ppl=5.29, wps=457802, ups=1.06, wpb=432798, bsz=16465.8, num_updates=33500, lr=0.000345547, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=19.5, wall=33225 epoch 020: 1563 / 1689 loss=4.034, nll_loss=2.401, ppl=5.28, wps=462577, ups=1.07, wpb=431962, bsz=16279.7, num_updates=33600, lr=0.000345033, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=33318 epoch 020: 1563 / 1689 loss=4.034, nll_loss=2.401, ppl=5.28, wps=462577, ups=1.07, wpb=431962, bsz=16279.7, num_updates=33600, lr=0.000345033, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=33318 epoch 020: 1563 / 1689 loss=4.034, nll_loss=2.401, ppl=5.28, wps=462577, ups=1.07, wpb=431962, bsz=16279.7, num_updates=33600, lr=0.000345033, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=33318 epoch 020: 1563 / 1689 loss=4.034, nll_loss=2.401, ppl=5.28, wps=462577, ups=1.07, wpb=431962, bsz=16279.7, num_updates=33600, lr=0.000345033, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=33318 epoch 020: 1563 / 1689 loss=4.034, nll_loss=2.401, ppl=5.28, wps=462577, ups=1.07, wpb=431962, bsz=16279.7, num_updates=33600, lr=0.000345033, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=33318 epoch 020: 1563 / 1689 loss=4.034, nll_loss=2.401, ppl=5.28, wps=462577, ups=1.07, wpb=431962, bsz=16279.7, num_updates=33600, lr=0.000345033, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=33318 epoch 020: 1563 / 1689 loss=4.034, nll_loss=2.401, ppl=5.28, wps=462577, ups=1.07, wpb=431962, bsz=16279.7, num_updates=33600, lr=0.000345033, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=33318 epoch 020: 1563 / 1689 loss=4.034, nll_loss=2.401, ppl=5.28, wps=462577, ups=1.07, wpb=431962, bsz=16279.7, num_updates=33600, lr=0.000345033, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=33318 epoch 020: 1563 / 1689 loss=4.034, nll_loss=2.401, ppl=5.28, wps=462577, ups=1.07, wpb=431962, bsz=16279.7, num_updates=33600, lr=0.000345033, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=33318 epoch 020: 1563 / 1689 loss=4.034, nll_loss=2.401, ppl=5.28, wps=462577, ups=1.07, wpb=431962, bsz=16279.7, num_updates=33600, lr=0.000345033, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=33318 epoch 020: 1563 / 1689 loss=4.034, nll_loss=2.401, ppl=5.28, wps=462577, ups=1.07, wpb=431962, bsz=16279.7, num_updates=33600, lr=0.000345033, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=33318 epoch 020: 1563 / 1689 loss=4.034, nll_loss=2.401, ppl=5.28, wps=462577, ups=1.07, wpb=431962, bsz=16279.7, num_updates=33600, lr=0.000345033, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=33318 epoch 020: 1563 / 1689 loss=4.034, nll_loss=2.401, ppl=5.28, wps=462577, ups=1.07, wpb=431962, bsz=16279.7, num_updates=33600, lr=0.000345033, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=33318 epoch 020: 1563 / 1689 loss=4.034, nll_loss=2.401, ppl=5.28, wps=462577, ups=1.07, wpb=431962, bsz=16279.7, num_updates=33600, lr=0.000345033, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=33318 epoch 020: 1563 / 1689 loss=4.034, nll_loss=2.401, ppl=5.28, wps=462577, ups=1.07, wpb=431962, bsz=16279.7, num_updates=33600, lr=0.000345033, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=33318 epoch 020: 1563 / 1689 loss=4.034, nll_loss=2.401, ppl=5.28, wps=462577, ups=1.07, wpb=431962, bsz=16279.7, num_updates=33600, lr=0.000345033, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=33318 epoch 020: 1563 / 1689 loss=4.034, nll_loss=2.401, ppl=5.28, wps=462577, ups=1.07, wpb=431962, bsz=16279.7, num_updates=33600, lr=0.000345033, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=33318 epoch 020: 1563 / 1689 loss=4.034, nll_loss=2.401, ppl=5.28, wps=462577, ups=1.07, wpb=431962, bsz=16279.7, num_updates=33600, lr=0.000345033, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=33318 epoch 020: 1563 / 1689 loss=4.034, nll_loss=2.401, ppl=5.28, wps=462577, ups=1.07, wpb=431962, bsz=16279.7, num_updates=33600, lr=0.000345033, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=33318 epoch 020: 1563 / 1689 loss=4.034, nll_loss=2.401, ppl=5.28, wps=462577, ups=1.07, wpb=431962, bsz=16279.7, num_updates=33600, lr=0.000345033, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=33318 epoch 020: 1663 / 1689 loss=4.042, nll_loss=2.411, ppl=5.32, wps=464247, ups=1.06, wpb=436556, bsz=16606.3, num_updates=33700, lr=0.00034452, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=33412 epoch 020: 1663 / 1689 loss=4.042, nll_loss=2.411, ppl=5.32, wps=464247, ups=1.06, wpb=436556, bsz=16606.3, num_updates=33700, lr=0.00034452, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=33412 epoch 020: 1663 / 1689 loss=4.042, nll_loss=2.411, ppl=5.32, wps=464247, ups=1.06, wpb=436556, bsz=16606.3, num_updates=33700, lr=0.00034452, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=33412 epoch 020: 1663 / 1689 loss=4.042, nll_loss=2.411, ppl=5.32, wps=464247, ups=1.06, wpb=436556, bsz=16606.3, num_updates=33700, lr=0.00034452, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=33412 epoch 020: 1663 / 1689 loss=4.042, nll_loss=2.411, ppl=5.32, wps=464247, ups=1.06, wpb=436556, bsz=16606.3, num_updates=33700, lr=0.00034452, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=33412 epoch 020: 1663 / 1689 loss=4.042, nll_loss=2.411, ppl=5.32, wps=464247, ups=1.06, wpb=436556, bsz=16606.3, num_updates=33700, lr=0.00034452, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=33412 epoch 020: 1663 / 1689 loss=4.042, nll_loss=2.411, ppl=5.32, wps=464247, ups=1.06, wpb=436556, bsz=16606.3, num_updates=33700, lr=0.00034452, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=33412 epoch 020: 1663 / 1689 loss=4.042, nll_loss=2.411, ppl=5.32, wps=464247, ups=1.06, wpb=436556, bsz=16606.3, num_updates=33700, lr=0.00034452, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=33412 epoch 020: 1663 / 1689 loss=4.042, nll_loss=2.411, ppl=5.32, wps=464247, ups=1.06, wpb=436556, bsz=16606.3, num_updates=33700, lr=0.00034452, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=33412 epoch 020: 1663 / 1689 loss=4.042, nll_loss=2.411, ppl=5.32, wps=464247, ups=1.06, wpb=436556, bsz=16606.3, num_updates=33700, lr=0.00034452, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=33412 epoch 020: 1663 / 1689 loss=4.042, nll_loss=2.411, ppl=5.32, wps=464247, ups=1.06, wpb=436556, bsz=16606.3, num_updates=33700, lr=0.00034452, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=33412 epoch 020: 1663 / 1689 loss=4.042, nll_loss=2.411, ppl=5.32, wps=464247, ups=1.06, wpb=436556, bsz=16606.3, num_updates=33700, lr=0.00034452, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=33412 epoch 020: 1663 / 1689 loss=4.042, nll_loss=2.411, ppl=5.32, wps=464247, ups=1.06, wpb=436556, bsz=16606.3, num_updates=33700, lr=0.00034452, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=33412 epoch 020: 1663 / 1689 loss=4.042, nll_loss=2.411, ppl=5.32, wps=464247, ups=1.06, wpb=436556, bsz=16606.3, num_updates=33700, lr=0.00034452, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=33412 epoch 020: 1663 / 1689 loss=4.042, nll_loss=2.411, ppl=5.32, wps=464247, ups=1.06, wpb=436556, bsz=16606.3, num_updates=33700, lr=0.00034452, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=33412 epoch 020: 1663 / 1689 loss=4.042, nll_loss=2.411, ppl=5.32, wps=464247, ups=1.06, wpb=436556, bsz=16606.3, num_updates=33700, lr=0.00034452, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=33412 epoch 020: 1663 / 1689 loss=4.042, nll_loss=2.411, ppl=5.32, wps=464247, ups=1.06, wpb=436556, bsz=16606.3, num_updates=33700, lr=0.00034452, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=33412 epoch 020: 1663 / 1689 loss=4.042, nll_loss=2.411, ppl=5.32, wps=464247, ups=1.06, wpb=436556, bsz=16606.3, num_updates=33700, lr=0.00034452, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=33412 epoch 020: 1663 / 1689 loss=4.042, nll_loss=2.411, ppl=5.32, wps=464247, ups=1.06, wpb=436556, bsz=16606.3, num_updates=33700, lr=0.00034452, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=33412 epoch 020: 1663 / 1689 loss=4.042, nll_loss=2.411, ppl=5.32, wps=464247, ups=1.06, wpb=436556, bsz=16606.3, num_updates=33700, lr=0.00034452, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=33412 end of epoch 20 (average epoch stats below) epoch 020 | loss 4.024 | nll_loss 2.389 | ppl 5.24 | wps 455611 | ups 1.05 | wpb 433525 | bsz 16505.3 | num_updates 33726 | lr 0.000344388 | gnorm 0.21 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 20 | wall 33435 epoch 020 | loss 4.024 | nll_loss 2.389 | ppl 5.24 | wps 455611 | ups 1.05 | wpb 433525 | bsz 16505.3 | num_updates 33726 | lr 0.000344388 | gnorm 0.21 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 20 | wall 33435 epoch 020 | loss 4.024 | nll_loss 2.389 | ppl 5.24 | wps 455611 | ups 1.05 | wpb 433525 | bsz 16505.3 | num_updates 33726 | lr 0.000344388 | gnorm 0.21 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 20 | wall 33435 epoch 020 | loss 4.024 | nll_loss 2.389 | ppl 5.24 | wps 455611 | ups 1.05 | wpb 433525 | bsz 16505.3 | num_updates 33726 | lr 0.000344388 | gnorm 0.21 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 20 | wall 33435 epoch 020 | loss 4.024 | nll_loss 2.389 | ppl 5.24 | wps 455611 | ups 1.05 | wpb 433525 | bsz 16505.3 | num_updates 33726 | lr 0.000344388 | gnorm 0.21 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 20 | wall 33435 epoch 020 | loss 4.024 | nll_loss 2.389 | ppl 5.24 | wps 455611 | ups 1.05 | wpb 433525 | bsz 16505.3 | num_updates 33726 | lr 0.000344388 | gnorm 0.21 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 20 | wall 33435 epoch 020 | loss 4.024 | nll_loss 2.389 | ppl 5.24 | wps 455611 | ups 1.05 | wpb 433525 | bsz 16505.3 | num_updates 33726 | lr 0.000344388 | gnorm 0.21 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 20 | wall 33435 epoch 020 | loss 4.024 | nll_loss 2.389 | ppl 5.24 | wps 455611 | ups 1.05 | wpb 433525 | bsz 16505.3 | num_updates 33726 | lr 0.000344388 | gnorm 0.21 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 20 | wall 33435 epoch 020 | loss 4.024 | nll_loss 2.389 | ppl 5.24 | wps 455611 | ups 1.05 | wpb 433525 | bsz 16505.3 | num_updates 33726 | lr 0.000344388 | gnorm 0.21 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 20 | wall 33435 epoch 020 | loss 4.024 | nll_loss 2.389 | ppl 5.24 | wps 455611 | ups 1.05 | wpb 433525 | bsz 16505.3 | num_updates 33726 | lr 0.000344388 | gnorm 0.21 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 20 | wall 33435 epoch 020 | loss 4.024 | nll_loss 2.389 | ppl 5.24 | wps 455611 | ups 1.05 | wpb 433525 | bsz 16505.3 | num_updates 33726 | lr 0.000344388 | gnorm 0.21 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 20 | wall 33435 epoch 020 | loss 4.024 | nll_loss 2.389 | ppl 5.24 | wps 455611 | ups 1.05 | wpb 433525 | bsz 16505.3 | num_updates 33726 | lr 0.000344388 | gnorm 0.21 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 20 | wall 33435 epoch 020 | loss 4.024 | nll_loss 2.389 | ppl 5.24 | wps 455611 | ups 1.05 | wpb 433525 | bsz 16505.3 | num_updates 33726 | lr 0.000344388 | gnorm 0.21 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 20 | wall 33435 epoch 020 | loss 4.024 | nll_loss 2.389 | ppl 5.24 | wps 455611 | ups 1.05 | wpb 433525 | bsz 16505.3 | num_updates 33726 | lr 0.000344388 | gnorm 0.21 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 20 | wall 33435 epoch 020 | loss 4.024 | nll_loss 2.389 | ppl 5.24 | wps 455611 | ups 1.05 | wpb 433525 | bsz 16505.3 | num_updates 33726 | lr 0.000344388 | gnorm 0.21 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 20 | wall 33435 epoch 020 | loss 4.024 | nll_loss 2.389 | ppl 5.24 | wps 455611 | ups 1.05 | wpb 433525 | bsz 16505.3 | num_updates 33726 | lr 0.000344388 | gnorm 0.21 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 20 | wall 33435 epoch 020 | loss 4.024 | nll_loss 2.389 | ppl 5.24 | wps 455611 | ups 1.05 | wpb 433525 | bsz 16505.3 | num_updates 33726 | lr 0.000344388 | gnorm 0.21 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 20 | wall 33435 epoch 020 | loss 4.024 | nll_loss 2.389 | ppl 5.24 | wps 455611 | ups 1.05 | wpb 433525 | bsz 16505.3 | num_updates 33726 | lr 0.000344388 | gnorm 0.21 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 20 | wall 33435 epoch 020 | loss 4.024 | nll_loss 2.389 | ppl 5.24 | wps 455611 | ups 1.05 | wpb 433525 | bsz 16505.3 | num_updates 33726 | lr 0.000344388 | gnorm 0.21 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 20 | wall 33435 epoch 020 | loss 4.024 | nll_loss 2.389 | ppl 5.24 | wps 455611 | ups 1.05 | wpb 433525 | bsz 16505.3 | num_updates 33726 | lr 0.000344388 | gnorm 0.21 | clip 0 | loss_scale 1 | train_wall 1569 | gb_free 20 | wall 33435 Start iterating over samples epoch 021: 74 / 1689 loss=4.002, nll_loss=2.364, ppl=5.15, wps=456068, ups=1.06, wpb=428703, bsz=16273.5, num_updates=33800, lr=0.00034401, gnorm=0.214, clip=0, loss_scale=1, train_wall=91, gb_free=18.4, wall=33506 epoch 021: 74 / 1689 loss=4.002, nll_loss=2.364, ppl=5.15, wps=456068, ups=1.06, wpb=428703, bsz=16273.5, num_updates=33800, lr=0.00034401, gnorm=0.214, clip=0, loss_scale=1, train_wall=91, gb_free=18.4, wall=33506 epoch 021: 74 / 1689 loss=4.002, nll_loss=2.364, ppl=5.15, wps=456068, ups=1.06, wpb=428703, bsz=16273.5, num_updates=33800, lr=0.00034401, gnorm=0.214, clip=0, loss_scale=1, train_wall=91, gb_free=18.4, wall=33506 epoch 021: 74 / 1689 loss=4.002, nll_loss=2.364, ppl=5.15, wps=456068, ups=1.06, wpb=428703, bsz=16273.5, num_updates=33800, lr=0.00034401, gnorm=0.214, clip=0, loss_scale=1, train_wall=91, gb_free=18.4, wall=33506 epoch 021: 74 / 1689 loss=4.002, nll_loss=2.364, ppl=5.15, wps=456068, ups=1.06, wpb=428703, bsz=16273.5, num_updates=33800, lr=0.00034401, gnorm=0.214, clip=0, loss_scale=1, train_wall=91, gb_free=18.4, wall=33506 epoch 021: 74 / 1689 loss=4.002, nll_loss=2.364, ppl=5.15, wps=456068, ups=1.06, wpb=428703, bsz=16273.5, num_updates=33800, lr=0.00034401, gnorm=0.214, clip=0, loss_scale=1, train_wall=91, gb_free=18.4, wall=33506 epoch 021: 74 / 1689 loss=4.002, nll_loss=2.364, ppl=5.15, wps=456068, ups=1.06, wpb=428703, bsz=16273.5, num_updates=33800, lr=0.00034401, gnorm=0.214, clip=0, loss_scale=1, train_wall=91, gb_free=18.4, wall=33506 epoch 021: 74 / 1689 loss=4.002, nll_loss=2.364, ppl=5.15, wps=456068, ups=1.06, wpb=428703, bsz=16273.5, num_updates=33800, lr=0.00034401, gnorm=0.214, clip=0, loss_scale=1, train_wall=91, gb_free=18.4, wall=33506 epoch 021: 74 / 1689 loss=4.002, nll_loss=2.364, ppl=5.15, wps=456068, ups=1.06, wpb=428703, bsz=16273.5, num_updates=33800, lr=0.00034401, gnorm=0.214, clip=0, loss_scale=1, train_wall=91, gb_free=18.4, wall=33506 epoch 021: 74 / 1689 loss=4.002, nll_loss=2.364, ppl=5.15, wps=456068, ups=1.06, wpb=428703, bsz=16273.5, num_updates=33800, lr=0.00034401, gnorm=0.214, clip=0, loss_scale=1, train_wall=91, gb_free=18.4, wall=33506 epoch 021: 74 / 1689 loss=4.002, nll_loss=2.364, ppl=5.15, wps=456068, ups=1.06, wpb=428703, bsz=16273.5, num_updates=33800, lr=0.00034401, gnorm=0.214, clip=0, loss_scale=1, train_wall=91, gb_free=18.4, wall=33506 epoch 021: 74 / 1689 loss=4.002, nll_loss=2.364, ppl=5.15, wps=456068, ups=1.06, wpb=428703, bsz=16273.5, num_updates=33800, lr=0.00034401, gnorm=0.214, clip=0, loss_scale=1, train_wall=91, gb_free=18.4, wall=33506 epoch 021: 74 / 1689 loss=4.002, nll_loss=2.364, ppl=5.15, wps=456068, ups=1.06, wpb=428703, bsz=16273.5, num_updates=33800, lr=0.00034401, gnorm=0.214, clip=0, loss_scale=1, train_wall=91, gb_free=18.4, wall=33506 epoch 021: 74 / 1689 loss=4.002, nll_loss=2.364, ppl=5.15, wps=456068, ups=1.06, wpb=428703, bsz=16273.5, num_updates=33800, lr=0.00034401, gnorm=0.214, clip=0, loss_scale=1, train_wall=91, gb_free=18.4, wall=33506 epoch 021: 74 / 1689 loss=4.002, nll_loss=2.364, ppl=5.15, wps=456068, ups=1.06, wpb=428703, bsz=16273.5, num_updates=33800, lr=0.00034401, gnorm=0.214, clip=0, loss_scale=1, train_wall=91, gb_free=18.4, wall=33506 epoch 021: 74 / 1689 loss=4.002, nll_loss=2.364, ppl=5.15, wps=456068, ups=1.06, wpb=428703, bsz=16273.5, num_updates=33800, lr=0.00034401, gnorm=0.214, clip=0, loss_scale=1, train_wall=91, gb_free=18.4, wall=33506 epoch 021: 74 / 1689 loss=4.002, nll_loss=2.364, ppl=5.15, wps=456068, ups=1.06, wpb=428703, bsz=16273.5, num_updates=33800, lr=0.00034401, gnorm=0.214, clip=0, loss_scale=1, train_wall=91, gb_free=18.4, wall=33506 epoch 021: 74 / 1689 loss=4.002, nll_loss=2.364, ppl=5.15, wps=456068, ups=1.06, wpb=428703, bsz=16273.5, num_updates=33800, lr=0.00034401, gnorm=0.214, clip=0, loss_scale=1, train_wall=91, gb_free=18.4, wall=33506 epoch 021: 74 / 1689 loss=4.002, nll_loss=2.364, ppl=5.15, wps=456068, ups=1.06, wpb=428703, bsz=16273.5, num_updates=33800, lr=0.00034401, gnorm=0.214, clip=0, loss_scale=1, train_wall=91, gb_free=18.4, wall=33506 epoch 021: 74 / 1689 loss=4.002, nll_loss=2.364, ppl=5.15, wps=456068, ups=1.06, wpb=428703, bsz=16273.5, num_updates=33800, lr=0.00034401, gnorm=0.214, clip=0, loss_scale=1, train_wall=91, gb_free=18.4, wall=33506 epoch 021: 74 / 1689 loss=4.002, nll_loss=2.364, ppl=5.15, wps=456068, ups=1.06, wpb=428703, bsz=16273.5, num_updates=33800, lr=0.00034401, gnorm=0.214, clip=0, loss_scale=1, train_wall=91, gb_free=18.4, wall=33506 epoch 021: 175 / 1689 loss=3.995, nll_loss=2.356, ppl=5.12, wps=454270, ups=1.05, wpb=433051, bsz=16516.3, num_updates=33900, lr=0.000343503, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=94, gb_free=16.7, wall=33602 epoch 021: 175 / 1689 loss=3.995, nll_loss=2.356, ppl=5.12, wps=454270, ups=1.05, wpb=433051, bsz=16516.3, num_updates=33900, lr=0.000343503, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=94, gb_free=16.7, wall=33602 epoch 021: 175 / 1689 loss=3.995, nll_loss=2.356, ppl=5.12, wps=454270, ups=1.05, wpb=433051, bsz=16516.3, num_updates=33900, lr=0.000343503, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=94, gb_free=16.7, wall=33602 epoch 021: 175 / 1689 loss=3.995, nll_loss=2.356, ppl=5.12, wps=454270, ups=1.05, wpb=433051, bsz=16516.3, num_updates=33900, lr=0.000343503, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=94, gb_free=16.7, wall=33602 epoch 021: 175 / 1689 loss=3.995, nll_loss=2.356, ppl=5.12, wps=454270, ups=1.05, wpb=433051, bsz=16516.3, num_updates=33900, lr=0.000343503, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=94, gb_free=16.7, wall=33602 epoch 021: 175 / 1689 loss=3.995, nll_loss=2.356, ppl=5.12, wps=454270, ups=1.05, wpb=433051, bsz=16516.3, num_updates=33900, lr=0.000343503, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=94, gb_free=16.7, wall=33602 epoch 021: 175 / 1689 loss=3.995, nll_loss=2.356, ppl=5.12, wps=454270, ups=1.05, wpb=433051, bsz=16516.3, num_updates=33900, lr=0.000343503, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=94, gb_free=16.7, wall=33602 epoch 021: 175 / 1689 loss=3.995, nll_loss=2.356, ppl=5.12, wps=454270, ups=1.05, wpb=433051, bsz=16516.3, num_updates=33900, lr=0.000343503, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=94, gb_free=16.7, wall=33602 epoch 021: 175 / 1689 loss=3.995, nll_loss=2.356, ppl=5.12, wps=454270, ups=1.05, wpb=433051, bsz=16516.3, num_updates=33900, lr=0.000343503, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=94, gb_free=16.7, wall=33602 epoch 021: 175 / 1689 loss=3.995, nll_loss=2.356, ppl=5.12, wps=454270, ups=1.05, wpb=433051, bsz=16516.3, num_updates=33900, lr=0.000343503, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=94, gb_free=16.7, wall=33602 epoch 021: 175 / 1689 loss=3.995, nll_loss=2.356, ppl=5.12, wps=454270, ups=1.05, wpb=433051, bsz=16516.3, num_updates=33900, lr=0.000343503, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=94, gb_free=16.7, wall=33602 epoch 021: 175 / 1689 loss=3.995, nll_loss=2.356, ppl=5.12, wps=454270, ups=1.05, wpb=433051, bsz=16516.3, num_updates=33900, lr=0.000343503, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=94, gb_free=16.7, wall=33602 epoch 021: 175 / 1689 loss=3.995, nll_loss=2.356, ppl=5.12, wps=454270, ups=1.05, wpb=433051, bsz=16516.3, num_updates=33900, lr=0.000343503, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=94, gb_free=16.7, wall=33602 epoch 021: 175 / 1689 loss=3.995, nll_loss=2.356, ppl=5.12, wps=454270, ups=1.05, wpb=433051, bsz=16516.3, num_updates=33900, lr=0.000343503, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=94, gb_free=16.7, wall=33602 epoch 021: 175 / 1689 loss=3.995, nll_loss=2.356, ppl=5.12, wps=454270, ups=1.05, wpb=433051, bsz=16516.3, num_updates=33900, lr=0.000343503, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=94, gb_free=16.7, wall=33602 epoch 021: 175 / 1689 loss=3.995, nll_loss=2.356, ppl=5.12, wps=454270, ups=1.05, wpb=433051, bsz=16516.3, num_updates=33900, lr=0.000343503, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=94, gb_free=16.7, wall=33602 epoch 021: 175 / 1689 loss=3.995, nll_loss=2.356, ppl=5.12, wps=454270, ups=1.05, wpb=433051, bsz=16516.3, num_updates=33900, lr=0.000343503, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=94, gb_free=16.7, wall=33602 epoch 021: 175 / 1689 loss=3.995, nll_loss=2.356, ppl=5.12, wps=454270, ups=1.05, wpb=433051, bsz=16516.3, num_updates=33900, lr=0.000343503, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=94, gb_free=16.7, wall=33602 epoch 021: 175 / 1689 loss=3.995, nll_loss=2.356, ppl=5.12, wps=454270, ups=1.05, wpb=433051, bsz=16516.3, num_updates=33900, lr=0.000343503, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=94, gb_free=16.7, wall=33602 epoch 021: 175 / 1689 loss=3.995, nll_loss=2.356, ppl=5.12, wps=454270, ups=1.05, wpb=433051, bsz=16516.3, num_updates=33900, lr=0.000343503, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=94, gb_free=16.7, wall=33602 epoch 021: 175 / 1689 loss=3.995, nll_loss=2.356, ppl=5.12, wps=454270, ups=1.05, wpb=433051, bsz=16516.3, num_updates=33900, lr=0.000343503, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=94, gb_free=16.7, wall=33602 epoch 021: 275 / 1689 loss=3.997, nll_loss=2.359, ppl=5.13, wps=458521, ups=1.06, wpb=432513, bsz=16583.9, num_updates=34000, lr=0.000342997, gnorm=0.199, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=33696 epoch 021: 275 / 1689 loss=3.997, nll_loss=2.359, ppl=5.13, wps=458521, ups=1.06, wpb=432513, bsz=16583.9, num_updates=34000, lr=0.000342997, gnorm=0.199, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=33696 epoch 021: 275 / 1689 loss=3.997, nll_loss=2.359, ppl=5.13, wps=458521, ups=1.06, wpb=432513, bsz=16583.9, num_updates=34000, lr=0.000342997, gnorm=0.199, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=33696 epoch 021: 275 / 1689 loss=3.997, nll_loss=2.359, ppl=5.13, wps=458521, ups=1.06, wpb=432513, bsz=16583.9, num_updates=34000, lr=0.000342997, gnorm=0.199, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=33696 epoch 021: 275 / 1689 loss=3.997, nll_loss=2.359, ppl=5.13, wps=458521, ups=1.06, wpb=432513, bsz=16583.9, num_updates=34000, lr=0.000342997, gnorm=0.199, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=33696 epoch 021: 275 / 1689 loss=3.997, nll_loss=2.359, ppl=5.13, wps=458521, ups=1.06, wpb=432513, bsz=16583.9, num_updates=34000, lr=0.000342997, gnorm=0.199, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=33696 epoch 021: 275 / 1689 loss=3.997, nll_loss=2.359, ppl=5.13, wps=458521, ups=1.06, wpb=432513, bsz=16583.9, num_updates=34000, lr=0.000342997, gnorm=0.199, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=33696 epoch 021: 275 / 1689 loss=3.997, nll_loss=2.359, ppl=5.13, wps=458521, ups=1.06, wpb=432513, bsz=16583.9, num_updates=34000, lr=0.000342997, gnorm=0.199, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=33696 epoch 021: 275 / 1689 loss=3.997, nll_loss=2.359, ppl=5.13, wps=458521, ups=1.06, wpb=432513, bsz=16583.9, num_updates=34000, lr=0.000342997, gnorm=0.199, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=33696 epoch 021: 275 / 1689 loss=3.997, nll_loss=2.359, ppl=5.13, wps=458521, ups=1.06, wpb=432513, bsz=16583.9, num_updates=34000, lr=0.000342997, gnorm=0.199, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=33696 epoch 021: 275 / 1689 loss=3.997, nll_loss=2.359, ppl=5.13, wps=458521, ups=1.06, wpb=432513, bsz=16583.9, num_updates=34000, lr=0.000342997, gnorm=0.199, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=33696 epoch 021: 275 / 1689 loss=3.997, nll_loss=2.359, ppl=5.13, wps=458521, ups=1.06, wpb=432513, bsz=16583.9, num_updates=34000, lr=0.000342997, gnorm=0.199, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=33696 epoch 021: 275 / 1689 loss=3.997, nll_loss=2.359, ppl=5.13, wps=458521, ups=1.06, wpb=432513, bsz=16583.9, num_updates=34000, lr=0.000342997, gnorm=0.199, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=33696 epoch 021: 275 / 1689 loss=3.997, nll_loss=2.359, ppl=5.13, wps=458521, ups=1.06, wpb=432513, bsz=16583.9, num_updates=34000, lr=0.000342997, gnorm=0.199, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=33696 epoch 021: 275 / 1689 loss=3.997, nll_loss=2.359, ppl=5.13, wps=458521, ups=1.06, wpb=432513, bsz=16583.9, num_updates=34000, lr=0.000342997, gnorm=0.199, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=33696 epoch 021: 275 / 1689 loss=3.997, nll_loss=2.359, ppl=5.13, wps=458521, ups=1.06, wpb=432513, bsz=16583.9, num_updates=34000, lr=0.000342997, gnorm=0.199, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=33696 epoch 021: 275 / 1689 loss=3.997, nll_loss=2.359, ppl=5.13, wps=458521, ups=1.06, wpb=432513, bsz=16583.9, num_updates=34000, lr=0.000342997, gnorm=0.199, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=33696 epoch 021: 275 / 1689 loss=3.997, nll_loss=2.359, ppl=5.13, wps=458521, ups=1.06, wpb=432513, bsz=16583.9, num_updates=34000, lr=0.000342997, gnorm=0.199, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=33696 epoch 021: 275 / 1689 loss=3.997, nll_loss=2.359, ppl=5.13, wps=458521, ups=1.06, wpb=432513, bsz=16583.9, num_updates=34000, lr=0.000342997, gnorm=0.199, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=33696 epoch 021: 275 / 1689 loss=3.997, nll_loss=2.359, ppl=5.13, wps=458521, ups=1.06, wpb=432513, bsz=16583.9, num_updates=34000, lr=0.000342997, gnorm=0.199, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=33696 epoch 021: 275 / 1689 loss=3.997, nll_loss=2.359, ppl=5.13, wps=458521, ups=1.06, wpb=432513, bsz=16583.9, num_updates=34000, lr=0.000342997, gnorm=0.199, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=33696 begin validation on "valid" subset epoch 021 | valid on 'valid' subset | loss 4.255 | nll_loss 2.63 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.249 epoch 021 | valid on 'valid' subset | loss 4.255 | nll_loss 2.63 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.249 epoch 021 | valid on 'valid' subset | loss 4.255 | nll_loss 2.63 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.249 epoch 021 | valid on 'valid' subset | loss 4.255 | nll_loss 2.63 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.249 epoch 021 | valid on 'valid' subset | loss 4.255 | nll_loss 2.63 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.249 epoch 021 | valid on 'valid' subset | loss 4.255 | nll_loss 2.63 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.249 epoch 021 | valid on 'valid' subset | loss 4.255 | nll_loss 2.63 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.249 epoch 021 | valid on 'valid' subset | loss 4.255 | nll_loss 2.63 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.249 epoch 021 | valid on 'valid' subset | loss 4.255 | nll_loss 2.63 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.249 epoch 021 | valid on 'valid' subset | loss 4.255 | nll_loss 2.63 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.249 epoch 021 | valid on 'valid' subset | loss 4.255 | nll_loss 2.63 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.249 epoch 021 | valid on 'valid' subset | loss 4.255 | nll_loss 2.63 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.249 epoch 021 | valid on 'valid' subset | loss 4.255 | nll_loss 2.63 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.249 epoch 021 | valid on 'valid' subset | loss 4.255 | nll_loss 2.63 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.249 epoch 021 | valid on 'valid' subset | loss 4.255 | nll_loss 2.63 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.249 epoch 021 | valid on 'valid' subset | loss 4.255 | nll_loss 2.63 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.249 epoch 021 | valid on 'valid' subset | loss 4.255 | nll_loss 2.63 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.249 epoch 021 | valid on 'valid' subset | loss 4.255 | nll_loss 2.63 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.249 epoch 021 | valid on 'valid' subset | loss 4.255 | nll_loss 2.63 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.249 epoch 021 | valid on 'valid' subset | loss 4.255 | nll_loss 2.63 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.249 epoch 021 | valid on 'valid' subset | loss 4.255 | nll_loss 2.63 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 34000 | best_loss 4.249 epoch 021: 375 / 1689 loss=4.006, nll_loss=2.369, ppl=5.16, wps=409103, ups=0.94, wpb=432944, bsz=16796.1, num_updates=34100, lr=0.000342494, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=33802 epoch 021: 375 / 1689 loss=4.006, nll_loss=2.369, ppl=5.16, wps=409103, ups=0.94, wpb=432944, bsz=16796.1, num_updates=34100, lr=0.000342494, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=33802 epoch 021: 375 / 1689 loss=4.006, nll_loss=2.369, ppl=5.16, wps=409103, ups=0.94, wpb=432944, bsz=16796.1, num_updates=34100, lr=0.000342494, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=33802 epoch 021: 375 / 1689 loss=4.006, nll_loss=2.369, ppl=5.16, wps=409103, ups=0.94, wpb=432944, bsz=16796.1, num_updates=34100, lr=0.000342494, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=33802 epoch 021: 375 / 1689 loss=4.006, nll_loss=2.369, ppl=5.16, wps=409103, ups=0.94, wpb=432944, bsz=16796.1, num_updates=34100, lr=0.000342494, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=33802 epoch 021: 375 / 1689 loss=4.006, nll_loss=2.369, ppl=5.16, wps=409103, ups=0.94, wpb=432944, bsz=16796.1, num_updates=34100, lr=0.000342494, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=33802 epoch 021: 375 / 1689 loss=4.006, nll_loss=2.369, ppl=5.16, wps=409103, ups=0.94, wpb=432944, bsz=16796.1, num_updates=34100, lr=0.000342494, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=33802 epoch 021: 375 / 1689 loss=4.006, nll_loss=2.369, ppl=5.16, wps=409103, ups=0.94, wpb=432944, bsz=16796.1, num_updates=34100, lr=0.000342494, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=33802 epoch 021: 375 / 1689 loss=4.006, nll_loss=2.369, ppl=5.16, wps=409103, ups=0.94, wpb=432944, bsz=16796.1, num_updates=34100, lr=0.000342494, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=33802 epoch 021: 375 / 1689 loss=4.006, nll_loss=2.369, ppl=5.16, wps=409103, ups=0.94, wpb=432944, bsz=16796.1, num_updates=34100, lr=0.000342494, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=33802 epoch 021: 375 / 1689 loss=4.006, nll_loss=2.369, ppl=5.16, wps=409103, ups=0.94, wpb=432944, bsz=16796.1, num_updates=34100, lr=0.000342494, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=33802 epoch 021: 375 / 1689 loss=4.006, nll_loss=2.369, ppl=5.16, wps=409103, ups=0.94, wpb=432944, bsz=16796.1, num_updates=34100, lr=0.000342494, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=33802 epoch 021: 375 / 1689 loss=4.006, nll_loss=2.369, ppl=5.16, wps=409103, ups=0.94, wpb=432944, bsz=16796.1, num_updates=34100, lr=0.000342494, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=33802 epoch 021: 375 / 1689 loss=4.006, nll_loss=2.369, ppl=5.16, wps=409103, ups=0.94, wpb=432944, bsz=16796.1, num_updates=34100, lr=0.000342494, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=33802 epoch 021: 375 / 1689 loss=4.006, nll_loss=2.369, ppl=5.16, wps=409103, ups=0.94, wpb=432944, bsz=16796.1, num_updates=34100, lr=0.000342494, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=33802 epoch 021: 375 / 1689 loss=4.006, nll_loss=2.369, ppl=5.16, wps=409103, ups=0.94, wpb=432944, bsz=16796.1, num_updates=34100, lr=0.000342494, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=33802 epoch 021: 375 / 1689 loss=4.006, nll_loss=2.369, ppl=5.16, wps=409103, ups=0.94, wpb=432944, bsz=16796.1, num_updates=34100, lr=0.000342494, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=33802 epoch 021: 375 / 1689 loss=4.006, nll_loss=2.369, ppl=5.16, wps=409103, ups=0.94, wpb=432944, bsz=16796.1, num_updates=34100, lr=0.000342494, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=33802 epoch 021: 375 / 1689 loss=4.006, nll_loss=2.369, ppl=5.16, wps=409103, ups=0.94, wpb=432944, bsz=16796.1, num_updates=34100, lr=0.000342494, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=33802 epoch 021: 375 / 1689 loss=4.006, nll_loss=2.369, ppl=5.16, wps=409103, ups=0.94, wpb=432944, bsz=16796.1, num_updates=34100, lr=0.000342494, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=33802 epoch 021: 375 / 1689 loss=4.006, nll_loss=2.369, ppl=5.16, wps=409103, ups=0.94, wpb=432944, bsz=16796.1, num_updates=34100, lr=0.000342494, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=33802 epoch 021: 475 / 1689 loss=4.013, nll_loss=2.377, ppl=5.2, wps=458628, ups=1.06, wpb=431687, bsz=16659.4, num_updates=34200, lr=0.000341993, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=33896 epoch 021: 475 / 1689 loss=4.013, nll_loss=2.377, ppl=5.2, wps=458628, ups=1.06, wpb=431687, bsz=16659.4, num_updates=34200, lr=0.000341993, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=33896 epoch 021: 475 / 1689 loss=4.013, nll_loss=2.377, ppl=5.2, wps=458628, ups=1.06, wpb=431687, bsz=16659.4, num_updates=34200, lr=0.000341993, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=33896 epoch 021: 475 / 1689 loss=4.013, nll_loss=2.377, ppl=5.2, wps=458628, ups=1.06, wpb=431687, bsz=16659.4, num_updates=34200, lr=0.000341993, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=33896 epoch 021: 475 / 1689 loss=4.013, nll_loss=2.377, ppl=5.2, wps=458628, ups=1.06, wpb=431687, bsz=16659.4, num_updates=34200, lr=0.000341993, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=33896 epoch 021: 475 / 1689 loss=4.013, nll_loss=2.377, ppl=5.2, wps=458628, ups=1.06, wpb=431687, bsz=16659.4, num_updates=34200, lr=0.000341993, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=33896 epoch 021: 475 / 1689 loss=4.013, nll_loss=2.377, ppl=5.2, wps=458628, ups=1.06, wpb=431687, bsz=16659.4, num_updates=34200, lr=0.000341993, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=33896 epoch 021: 475 / 1689 loss=4.013, nll_loss=2.377, ppl=5.2, wps=458628, ups=1.06, wpb=431687, bsz=16659.4, num_updates=34200, lr=0.000341993, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=33896 epoch 021: 475 / 1689 loss=4.013, nll_loss=2.377, ppl=5.2, wps=458628, ups=1.06, wpb=431687, bsz=16659.4, num_updates=34200, lr=0.000341993, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=33896 epoch 021: 475 / 1689 loss=4.013, nll_loss=2.377, ppl=5.2, wps=458628, ups=1.06, wpb=431687, bsz=16659.4, num_updates=34200, lr=0.000341993, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=33896 epoch 021: 475 / 1689 loss=4.013, nll_loss=2.377, ppl=5.2, wps=458628, ups=1.06, wpb=431687, bsz=16659.4, num_updates=34200, lr=0.000341993, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=33896 epoch 021: 475 / 1689 loss=4.013, nll_loss=2.377, ppl=5.2, wps=458628, ups=1.06, wpb=431687, bsz=16659.4, num_updates=34200, lr=0.000341993, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=33896 epoch 021: 475 / 1689 loss=4.013, nll_loss=2.377, ppl=5.2, wps=458628, ups=1.06, wpb=431687, bsz=16659.4, num_updates=34200, lr=0.000341993, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=33896 epoch 021: 475 / 1689 loss=4.013, nll_loss=2.377, ppl=5.2, wps=458628, ups=1.06, wpb=431687, bsz=16659.4, num_updates=34200, lr=0.000341993, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=33896 epoch 021: 475 / 1689 loss=4.013, nll_loss=2.377, ppl=5.2, wps=458628, ups=1.06, wpb=431687, bsz=16659.4, num_updates=34200, lr=0.000341993, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=33896 epoch 021: 475 / 1689 loss=4.013, nll_loss=2.377, ppl=5.2, wps=458628, ups=1.06, wpb=431687, bsz=16659.4, num_updates=34200, lr=0.000341993, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=33896 epoch 021: 475 / 1689 loss=4.013, nll_loss=2.377, ppl=5.2, wps=458628, ups=1.06, wpb=431687, bsz=16659.4, num_updates=34200, lr=0.000341993, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=33896 epoch 021: 475 / 1689 loss=4.013, nll_loss=2.377, ppl=5.2, wps=458628, ups=1.06, wpb=431687, bsz=16659.4, num_updates=34200, lr=0.000341993, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=33896 epoch 021: 475 / 1689 loss=4.013, nll_loss=2.377, ppl=5.2, wps=458628, ups=1.06, wpb=431687, bsz=16659.4, num_updates=34200, lr=0.000341993, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=33896 epoch 021: 475 / 1689 loss=4.013, nll_loss=2.377, ppl=5.2, wps=458628, ups=1.06, wpb=431687, bsz=16659.4, num_updates=34200, lr=0.000341993, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=33896 epoch 021: 475 / 1689 loss=4.013, nll_loss=2.377, ppl=5.2, wps=458628, ups=1.06, wpb=431687, bsz=16659.4, num_updates=34200, lr=0.000341993, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=33896 epoch 021: 575 / 1689 loss=4.023, nll_loss=2.388, ppl=5.24, wps=461883, ups=1.06, wpb=437674, bsz=16500.2, num_updates=34300, lr=0.000341494, gnorm=0.198, clip=0, loss_scale=0.5, train_wall=93, gb_free=20, wall=33991 epoch 021: 575 / 1689 loss=4.023, nll_loss=2.388, ppl=5.24, wps=461883, ups=1.06, wpb=437674, bsz=16500.2, num_updates=34300, lr=0.000341494, gnorm=0.198, clip=0, loss_scale=0.5, train_wall=93, gb_free=20, wall=33991 epoch 021: 575 / 1689 loss=4.023, nll_loss=2.388, ppl=5.24, wps=461883, ups=1.06, wpb=437674, bsz=16500.2, num_updates=34300, lr=0.000341494, gnorm=0.198, clip=0, loss_scale=0.5, train_wall=93, gb_free=20, wall=33991 epoch 021: 575 / 1689 loss=4.023, nll_loss=2.388, ppl=5.24, wps=461883, ups=1.06, wpb=437674, bsz=16500.2, num_updates=34300, lr=0.000341494, gnorm=0.198, clip=0, loss_scale=0.5, train_wall=93, gb_free=20, wall=33991 epoch 021: 575 / 1689 loss=4.023, nll_loss=2.388, ppl=5.24, wps=461883, ups=1.06, wpb=437674, bsz=16500.2, num_updates=34300, lr=0.000341494, gnorm=0.198, clip=0, loss_scale=0.5, train_wall=93, gb_free=20, wall=33991 epoch 021: 575 / 1689 loss=4.023, nll_loss=2.388, ppl=5.24, wps=461883, ups=1.06, wpb=437674, bsz=16500.2, num_updates=34300, lr=0.000341494, gnorm=0.198, clip=0, loss_scale=0.5, train_wall=93, gb_free=20, wall=33991 epoch 021: 575 / 1689 loss=4.023, nll_loss=2.388, ppl=5.24, wps=461883, ups=1.06, wpb=437674, bsz=16500.2, num_updates=34300, lr=0.000341494, gnorm=0.198, clip=0, loss_scale=0.5, train_wall=93, gb_free=20, wall=33991 epoch 021: 575 / 1689 loss=4.023, nll_loss=2.388, ppl=5.24, wps=461883, ups=1.06, wpb=437674, bsz=16500.2, num_updates=34300, lr=0.000341494, gnorm=0.198, clip=0, loss_scale=0.5, train_wall=93, gb_free=20, wall=33991 epoch 021: 575 / 1689 loss=4.023, nll_loss=2.388, ppl=5.24, wps=461883, ups=1.06, wpb=437674, bsz=16500.2, num_updates=34300, lr=0.000341494, gnorm=0.198, clip=0, loss_scale=0.5, train_wall=93, gb_free=20, wall=33991 epoch 021: 575 / 1689 loss=4.023, nll_loss=2.388, ppl=5.24, wps=461883, ups=1.06, wpb=437674, bsz=16500.2, num_updates=34300, lr=0.000341494, gnorm=0.198, clip=0, loss_scale=0.5, train_wall=93, gb_free=20, wall=33991 epoch 021: 575 / 1689 loss=4.023, nll_loss=2.388, ppl=5.24, wps=461883, ups=1.06, wpb=437674, bsz=16500.2, num_updates=34300, lr=0.000341494, gnorm=0.198, clip=0, loss_scale=0.5, train_wall=93, gb_free=20, wall=33991 epoch 021: 575 / 1689 loss=4.023, nll_loss=2.388, ppl=5.24, wps=461883, ups=1.06, wpb=437674, bsz=16500.2, num_updates=34300, lr=0.000341494, gnorm=0.198, clip=0, loss_scale=0.5, train_wall=93, gb_free=20, wall=33991 epoch 021: 575 / 1689 loss=4.023, nll_loss=2.388, ppl=5.24, wps=461883, ups=1.06, wpb=437674, bsz=16500.2, num_updates=34300, lr=0.000341494, gnorm=0.198, clip=0, loss_scale=0.5, train_wall=93, gb_free=20, wall=33991 epoch 021: 575 / 1689 loss=4.023, nll_loss=2.388, ppl=5.24, wps=461883, ups=1.06, wpb=437674, bsz=16500.2, num_updates=34300, lr=0.000341494, gnorm=0.198, clip=0, loss_scale=0.5, train_wall=93, gb_free=20, wall=33991 epoch 021: 575 / 1689 loss=4.023, nll_loss=2.388, ppl=5.24, wps=461883, ups=1.06, wpb=437674, bsz=16500.2, num_updates=34300, lr=0.000341494, gnorm=0.198, clip=0, loss_scale=0.5, train_wall=93, gb_free=20, wall=33991 epoch 021: 575 / 1689 loss=4.023, nll_loss=2.388, ppl=5.24, wps=461883, ups=1.06, wpb=437674, bsz=16500.2, num_updates=34300, lr=0.000341494, gnorm=0.198, clip=0, loss_scale=0.5, train_wall=93, gb_free=20, wall=33991 epoch 021: 575 / 1689 loss=4.023, nll_loss=2.388, ppl=5.24, wps=461883, ups=1.06, wpb=437674, bsz=16500.2, num_updates=34300, lr=0.000341494, gnorm=0.198, clip=0, loss_scale=0.5, train_wall=93, gb_free=20, wall=33991 epoch 021: 575 / 1689 loss=4.023, nll_loss=2.388, ppl=5.24, wps=461883, ups=1.06, wpb=437674, bsz=16500.2, num_updates=34300, lr=0.000341494, gnorm=0.198, clip=0, loss_scale=0.5, train_wall=93, gb_free=20, wall=33991 epoch 021: 575 / 1689 loss=4.023, nll_loss=2.388, ppl=5.24, wps=461883, ups=1.06, wpb=437674, bsz=16500.2, num_updates=34300, lr=0.000341494, gnorm=0.198, clip=0, loss_scale=0.5, train_wall=93, gb_free=20, wall=33991 epoch 021: 575 / 1689 loss=4.023, nll_loss=2.388, ppl=5.24, wps=461883, ups=1.06, wpb=437674, bsz=16500.2, num_updates=34300, lr=0.000341494, gnorm=0.198, clip=0, loss_scale=0.5, train_wall=93, gb_free=20, wall=33991 epoch 021: 575 / 1689 loss=4.023, nll_loss=2.388, ppl=5.24, wps=461883, ups=1.06, wpb=437674, bsz=16500.2, num_updates=34300, lr=0.000341494, gnorm=0.198, clip=0, loss_scale=0.5, train_wall=93, gb_free=20, wall=33991 epoch 021: 676 / 1689 loss=4.014, nll_loss=2.378, ppl=5.2, wps=449455, ups=1.04, wpb=432606, bsz=16406.9, num_updates=34400, lr=0.000340997, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.8, wall=34087 epoch 021: 676 / 1689 loss=4.014, nll_loss=2.378, ppl=5.2, wps=449455, ups=1.04, wpb=432606, bsz=16406.9, num_updates=34400, lr=0.000340997, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.8, wall=34087 epoch 021: 676 / 1689 loss=4.014, nll_loss=2.378, ppl=5.2, wps=449455, ups=1.04, wpb=432606, bsz=16406.9, num_updates=34400, lr=0.000340997, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.8, wall=34087 epoch 021: 676 / 1689 loss=4.014, nll_loss=2.378, ppl=5.2, wps=449455, ups=1.04, wpb=432606, bsz=16406.9, num_updates=34400, lr=0.000340997, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.8, wall=34087 epoch 021: 676 / 1689 loss=4.014, nll_loss=2.378, ppl=5.2, wps=449455, ups=1.04, wpb=432606, bsz=16406.9, num_updates=34400, lr=0.000340997, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.8, wall=34087 epoch 021: 676 / 1689 loss=4.014, nll_loss=2.378, ppl=5.2, wps=449455, ups=1.04, wpb=432606, bsz=16406.9, num_updates=34400, lr=0.000340997, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.8, wall=34087 epoch 021: 676 / 1689 loss=4.014, nll_loss=2.378, ppl=5.2, wps=449455, ups=1.04, wpb=432606, bsz=16406.9, num_updates=34400, lr=0.000340997, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.8, wall=34087 epoch 021: 676 / 1689 loss=4.014, nll_loss=2.378, ppl=5.2, wps=449455, ups=1.04, wpb=432606, bsz=16406.9, num_updates=34400, lr=0.000340997, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.8, wall=34087 epoch 021: 676 / 1689 loss=4.014, nll_loss=2.378, ppl=5.2, wps=449455, ups=1.04, wpb=432606, bsz=16406.9, num_updates=34400, lr=0.000340997, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.8, wall=34087 epoch 021: 676 / 1689 loss=4.014, nll_loss=2.378, ppl=5.2, wps=449455, ups=1.04, wpb=432606, bsz=16406.9, num_updates=34400, lr=0.000340997, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.8, wall=34087 epoch 021: 676 / 1689 loss=4.014, nll_loss=2.378, ppl=5.2, wps=449455, ups=1.04, wpb=432606, bsz=16406.9, num_updates=34400, lr=0.000340997, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.8, wall=34087 epoch 021: 676 / 1689 loss=4.014, nll_loss=2.378, ppl=5.2, wps=449455, ups=1.04, wpb=432606, bsz=16406.9, num_updates=34400, lr=0.000340997, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.8, wall=34087 epoch 021: 676 / 1689 loss=4.014, nll_loss=2.378, ppl=5.2, wps=449455, ups=1.04, wpb=432606, bsz=16406.9, num_updates=34400, lr=0.000340997, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.8, wall=34087 epoch 021: 676 / 1689 loss=4.014, nll_loss=2.378, ppl=5.2, wps=449455, ups=1.04, wpb=432606, bsz=16406.9, num_updates=34400, lr=0.000340997, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.8, wall=34087 epoch 021: 676 / 1689 loss=4.014, nll_loss=2.378, ppl=5.2, wps=449455, ups=1.04, wpb=432606, bsz=16406.9, num_updates=34400, lr=0.000340997, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.8, wall=34087 epoch 021: 676 / 1689 loss=4.014, nll_loss=2.378, ppl=5.2, wps=449455, ups=1.04, wpb=432606, bsz=16406.9, num_updates=34400, lr=0.000340997, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.8, wall=34087 epoch 021: 676 / 1689 loss=4.014, nll_loss=2.378, ppl=5.2, wps=449455, ups=1.04, wpb=432606, bsz=16406.9, num_updates=34400, lr=0.000340997, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.8, wall=34087 epoch 021: 676 / 1689 loss=4.014, nll_loss=2.378, ppl=5.2, wps=449455, ups=1.04, wpb=432606, bsz=16406.9, num_updates=34400, lr=0.000340997, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.8, wall=34087 epoch 021: 676 / 1689 loss=4.014, nll_loss=2.378, ppl=5.2, wps=449455, ups=1.04, wpb=432606, bsz=16406.9, num_updates=34400, lr=0.000340997, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.8, wall=34087 epoch 021: 676 / 1689 loss=4.014, nll_loss=2.378, ppl=5.2, wps=449455, ups=1.04, wpb=432606, bsz=16406.9, num_updates=34400, lr=0.000340997, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.8, wall=34087 epoch 021: 676 / 1689 loss=4.014, nll_loss=2.378, ppl=5.2, wps=449455, ups=1.04, wpb=432606, bsz=16406.9, num_updates=34400, lr=0.000340997, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.8, wall=34087 epoch 021: 776 / 1689 loss=4.021, nll_loss=2.386, ppl=5.23, wps=455309, ups=1.05, wpb=433320, bsz=16587.5, num_updates=34500, lr=0.000340503, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.9, wall=34182 epoch 021: 776 / 1689 loss=4.021, nll_loss=2.386, ppl=5.23, wps=455309, ups=1.05, wpb=433320, bsz=16587.5, num_updates=34500, lr=0.000340503, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.9, wall=34182 epoch 021: 776 / 1689 loss=4.021, nll_loss=2.386, ppl=5.23, wps=455309, ups=1.05, wpb=433320, bsz=16587.5, num_updates=34500, lr=0.000340503, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.9, wall=34182 epoch 021: 776 / 1689 loss=4.021, nll_loss=2.386, ppl=5.23, wps=455309, ups=1.05, wpb=433320, bsz=16587.5, num_updates=34500, lr=0.000340503, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.9, wall=34182 epoch 021: 776 / 1689 loss=4.021, nll_loss=2.386, ppl=5.23, wps=455309, ups=1.05, wpb=433320, bsz=16587.5, num_updates=34500, lr=0.000340503, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.9, wall=34182 epoch 021: 776 / 1689 loss=4.021, nll_loss=2.386, ppl=5.23, wps=455309, ups=1.05, wpb=433320, bsz=16587.5, num_updates=34500, lr=0.000340503, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.9, wall=34182 epoch 021: 776 / 1689 loss=4.021, nll_loss=2.386, ppl=5.23, wps=455309, ups=1.05, wpb=433320, bsz=16587.5, num_updates=34500, lr=0.000340503, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.9, wall=34182 epoch 021: 776 / 1689 loss=4.021, nll_loss=2.386, ppl=5.23, wps=455309, ups=1.05, wpb=433320, bsz=16587.5, num_updates=34500, lr=0.000340503, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.9, wall=34182 epoch 021: 776 / 1689 loss=4.021, nll_loss=2.386, ppl=5.23, wps=455309, ups=1.05, wpb=433320, bsz=16587.5, num_updates=34500, lr=0.000340503, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.9, wall=34182 epoch 021: 776 / 1689 loss=4.021, nll_loss=2.386, ppl=5.23, wps=455309, ups=1.05, wpb=433320, bsz=16587.5, num_updates=34500, lr=0.000340503, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.9, wall=34182 epoch 021: 776 / 1689 loss=4.021, nll_loss=2.386, ppl=5.23, wps=455309, ups=1.05, wpb=433320, bsz=16587.5, num_updates=34500, lr=0.000340503, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.9, wall=34182 epoch 021: 776 / 1689 loss=4.021, nll_loss=2.386, ppl=5.23, wps=455309, ups=1.05, wpb=433320, bsz=16587.5, num_updates=34500, lr=0.000340503, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.9, wall=34182 epoch 021: 776 / 1689 loss=4.021, nll_loss=2.386, ppl=5.23, wps=455309, ups=1.05, wpb=433320, bsz=16587.5, num_updates=34500, lr=0.000340503, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.9, wall=34182 epoch 021: 776 / 1689 loss=4.021, nll_loss=2.386, ppl=5.23, wps=455309, ups=1.05, wpb=433320, bsz=16587.5, num_updates=34500, lr=0.000340503, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.9, wall=34182 epoch 021: 776 / 1689 loss=4.021, nll_loss=2.386, ppl=5.23, wps=455309, ups=1.05, wpb=433320, bsz=16587.5, num_updates=34500, lr=0.000340503, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.9, wall=34182 epoch 021: 776 / 1689 loss=4.021, nll_loss=2.386, ppl=5.23, wps=455309, ups=1.05, wpb=433320, bsz=16587.5, num_updates=34500, lr=0.000340503, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.9, wall=34182 epoch 021: 776 / 1689 loss=4.021, nll_loss=2.386, ppl=5.23, wps=455309, ups=1.05, wpb=433320, bsz=16587.5, num_updates=34500, lr=0.000340503, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.9, wall=34182 epoch 021: 776 / 1689 loss=4.021, nll_loss=2.386, ppl=5.23, wps=455309, ups=1.05, wpb=433320, bsz=16587.5, num_updates=34500, lr=0.000340503, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.9, wall=34182 epoch 021: 776 / 1689 loss=4.021, nll_loss=2.386, ppl=5.23, wps=455309, ups=1.05, wpb=433320, bsz=16587.5, num_updates=34500, lr=0.000340503, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.9, wall=34182 epoch 021: 776 / 1689 loss=4.021, nll_loss=2.386, ppl=5.23, wps=455309, ups=1.05, wpb=433320, bsz=16587.5, num_updates=34500, lr=0.000340503, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.9, wall=34182 epoch 021: 776 / 1689 loss=4.021, nll_loss=2.386, ppl=5.23, wps=455309, ups=1.05, wpb=433320, bsz=16587.5, num_updates=34500, lr=0.000340503, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.9, wall=34182 epoch 021: 876 / 1689 loss=4.011, nll_loss=2.375, ppl=5.19, wps=459768, ups=1.06, wpb=433310, bsz=16338, num_updates=34600, lr=0.00034001, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=34276 epoch 021: 876 / 1689 loss=4.011, nll_loss=2.375, ppl=5.19, wps=459768, ups=1.06, wpb=433310, bsz=16338, num_updates=34600, lr=0.00034001, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=34276 epoch 021: 876 / 1689 loss=4.011, nll_loss=2.375, ppl=5.19, wps=459768, ups=1.06, wpb=433310, bsz=16338, num_updates=34600, lr=0.00034001, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=34276 epoch 021: 876 / 1689 loss=4.011, nll_loss=2.375, ppl=5.19, wps=459768, ups=1.06, wpb=433310, bsz=16338, num_updates=34600, lr=0.00034001, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=34276 epoch 021: 876 / 1689 loss=4.011, nll_loss=2.375, ppl=5.19, wps=459768, ups=1.06, wpb=433310, bsz=16338, num_updates=34600, lr=0.00034001, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=34276 epoch 021: 876 / 1689 loss=4.011, nll_loss=2.375, ppl=5.19, wps=459768, ups=1.06, wpb=433310, bsz=16338, num_updates=34600, lr=0.00034001, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=34276 epoch 021: 876 / 1689 loss=4.011, nll_loss=2.375, ppl=5.19, wps=459768, ups=1.06, wpb=433310, bsz=16338, num_updates=34600, lr=0.00034001, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=34276 epoch 021: 876 / 1689 loss=4.011, nll_loss=2.375, ppl=5.19, wps=459768, ups=1.06, wpb=433310, bsz=16338, num_updates=34600, lr=0.00034001, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=34276 epoch 021: 876 / 1689 loss=4.011, nll_loss=2.375, ppl=5.19, wps=459768, ups=1.06, wpb=433310, bsz=16338, num_updates=34600, lr=0.00034001, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=34276 epoch 021: 876 / 1689 loss=4.011, nll_loss=2.375, ppl=5.19, wps=459768, ups=1.06, wpb=433310, bsz=16338, num_updates=34600, lr=0.00034001, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=34276 epoch 021: 876 / 1689 loss=4.011, nll_loss=2.375, ppl=5.19, wps=459768, ups=1.06, wpb=433310, bsz=16338, num_updates=34600, lr=0.00034001, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=34276 epoch 021: 876 / 1689 loss=4.011, nll_loss=2.375, ppl=5.19, wps=459768, ups=1.06, wpb=433310, bsz=16338, num_updates=34600, lr=0.00034001, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=34276 epoch 021: 876 / 1689 loss=4.011, nll_loss=2.375, ppl=5.19, wps=459768, ups=1.06, wpb=433310, bsz=16338, num_updates=34600, lr=0.00034001, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=34276 epoch 021: 876 / 1689 loss=4.011, nll_loss=2.375, ppl=5.19, wps=459768, ups=1.06, wpb=433310, bsz=16338, num_updates=34600, lr=0.00034001, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=34276 epoch 021: 876 / 1689 loss=4.011, nll_loss=2.375, ppl=5.19, wps=459768, ups=1.06, wpb=433310, bsz=16338, num_updates=34600, lr=0.00034001, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=34276 epoch 021: 876 / 1689 loss=4.011, nll_loss=2.375, ppl=5.19, wps=459768, ups=1.06, wpb=433310, bsz=16338, num_updates=34600, lr=0.00034001, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=34276 epoch 021: 876 / 1689 loss=4.011, nll_loss=2.375, ppl=5.19, wps=459768, ups=1.06, wpb=433310, bsz=16338, num_updates=34600, lr=0.00034001, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=34276 epoch 021: 876 / 1689 loss=4.011, nll_loss=2.375, ppl=5.19, wps=459768, ups=1.06, wpb=433310, bsz=16338, num_updates=34600, lr=0.00034001, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=34276 epoch 021: 876 / 1689 loss=4.011, nll_loss=2.375, ppl=5.19, wps=459768, ups=1.06, wpb=433310, bsz=16338, num_updates=34600, lr=0.00034001, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=34276 epoch 021: 876 / 1689 loss=4.011, nll_loss=2.375, ppl=5.19, wps=459768, ups=1.06, wpb=433310, bsz=16338, num_updates=34600, lr=0.00034001, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=34276 epoch 021: 876 / 1689 loss=4.011, nll_loss=2.375, ppl=5.19, wps=459768, ups=1.06, wpb=433310, bsz=16338, num_updates=34600, lr=0.00034001, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=34276 epoch 021: 976 / 1689 loss=4.018, nll_loss=2.383, ppl=5.22, wps=458577, ups=1.06, wpb=433116, bsz=16337.1, num_updates=34700, lr=0.00033952, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=20, wall=34371 epoch 021: 976 / 1689 loss=4.018, nll_loss=2.383, ppl=5.22, wps=458577, ups=1.06, wpb=433116, bsz=16337.1, num_updates=34700, lr=0.00033952, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=20, wall=34371 epoch 021: 976 / 1689 loss=4.018, nll_loss=2.383, ppl=5.22, wps=458577, ups=1.06, wpb=433116, bsz=16337.1, num_updates=34700, lr=0.00033952, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=20, wall=34371 epoch 021: 976 / 1689 loss=4.018, nll_loss=2.383, ppl=5.22, wps=458577, ups=1.06, wpb=433116, bsz=16337.1, num_updates=34700, lr=0.00033952, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=20, wall=34371 epoch 021: 976 / 1689 loss=4.018, nll_loss=2.383, ppl=5.22, wps=458577, ups=1.06, wpb=433116, bsz=16337.1, num_updates=34700, lr=0.00033952, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=20, wall=34371 epoch 021: 976 / 1689 loss=4.018, nll_loss=2.383, ppl=5.22, wps=458577, ups=1.06, wpb=433116, bsz=16337.1, num_updates=34700, lr=0.00033952, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=20, wall=34371 epoch 021: 976 / 1689 loss=4.018, nll_loss=2.383, ppl=5.22, wps=458577, ups=1.06, wpb=433116, bsz=16337.1, num_updates=34700, lr=0.00033952, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=20, wall=34371 epoch 021: 976 / 1689 loss=4.018, nll_loss=2.383, ppl=5.22, wps=458577, ups=1.06, wpb=433116, bsz=16337.1, num_updates=34700, lr=0.00033952, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=20, wall=34371 epoch 021: 976 / 1689 loss=4.018, nll_loss=2.383, ppl=5.22, wps=458577, ups=1.06, wpb=433116, bsz=16337.1, num_updates=34700, lr=0.00033952, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=20, wall=34371 epoch 021: 976 / 1689 loss=4.018, nll_loss=2.383, ppl=5.22, wps=458577, ups=1.06, wpb=433116, bsz=16337.1, num_updates=34700, lr=0.00033952, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=20, wall=34371 epoch 021: 976 / 1689 loss=4.018, nll_loss=2.383, ppl=5.22, wps=458577, ups=1.06, wpb=433116, bsz=16337.1, num_updates=34700, lr=0.00033952, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=20, wall=34371 epoch 021: 976 / 1689 loss=4.018, nll_loss=2.383, ppl=5.22, wps=458577, ups=1.06, wpb=433116, bsz=16337.1, num_updates=34700, lr=0.00033952, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=20, wall=34371 epoch 021: 976 / 1689 loss=4.018, nll_loss=2.383, ppl=5.22, wps=458577, ups=1.06, wpb=433116, bsz=16337.1, num_updates=34700, lr=0.00033952, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=20, wall=34371 epoch 021: 976 / 1689 loss=4.018, nll_loss=2.383, ppl=5.22, wps=458577, ups=1.06, wpb=433116, bsz=16337.1, num_updates=34700, lr=0.00033952, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=20, wall=34371 epoch 021: 976 / 1689 loss=4.018, nll_loss=2.383, ppl=5.22, wps=458577, ups=1.06, wpb=433116, bsz=16337.1, num_updates=34700, lr=0.00033952, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=20, wall=34371 epoch 021: 976 / 1689 loss=4.018, nll_loss=2.383, ppl=5.22, wps=458577, ups=1.06, wpb=433116, bsz=16337.1, num_updates=34700, lr=0.00033952, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=20, wall=34371 epoch 021: 976 / 1689 loss=4.018, nll_loss=2.383, ppl=5.22, wps=458577, ups=1.06, wpb=433116, bsz=16337.1, num_updates=34700, lr=0.00033952, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=20, wall=34371 epoch 021: 976 / 1689 loss=4.018, nll_loss=2.383, ppl=5.22, wps=458577, ups=1.06, wpb=433116, bsz=16337.1, num_updates=34700, lr=0.00033952, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=20, wall=34371 epoch 021: 976 / 1689 loss=4.018, nll_loss=2.383, ppl=5.22, wps=458577, ups=1.06, wpb=433116, bsz=16337.1, num_updates=34700, lr=0.00033952, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=20, wall=34371 epoch 021: 976 / 1689 loss=4.018, nll_loss=2.383, ppl=5.22, wps=458577, ups=1.06, wpb=433116, bsz=16337.1, num_updates=34700, lr=0.00033952, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=20, wall=34371 epoch 021: 976 / 1689 loss=4.018, nll_loss=2.383, ppl=5.22, wps=458577, ups=1.06, wpb=433116, bsz=16337.1, num_updates=34700, lr=0.00033952, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=20, wall=34371 epoch 021: 1076 / 1689 loss=4.014, nll_loss=2.378, ppl=5.2, wps=462570, ups=1.06, wpb=436360, bsz=16401.1, num_updates=34800, lr=0.000339032, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=34465 epoch 021: 1076 / 1689 loss=4.014, nll_loss=2.378, ppl=5.2, wps=462570, ups=1.06, wpb=436360, bsz=16401.1, num_updates=34800, lr=0.000339032, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=34465 epoch 021: 1076 / 1689 loss=4.014, nll_loss=2.378, ppl=5.2, wps=462570, ups=1.06, wpb=436360, bsz=16401.1, num_updates=34800, lr=0.000339032, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=34465 epoch 021: 1076 / 1689 loss=4.014, nll_loss=2.378, ppl=5.2, wps=462570, ups=1.06, wpb=436360, bsz=16401.1, num_updates=34800, lr=0.000339032, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=34465 epoch 021: 1076 / 1689 loss=4.014, nll_loss=2.378, ppl=5.2, wps=462570, ups=1.06, wpb=436360, bsz=16401.1, num_updates=34800, lr=0.000339032, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=34465 epoch 021: 1076 / 1689 loss=4.014, nll_loss=2.378, ppl=5.2, wps=462570, ups=1.06, wpb=436360, bsz=16401.1, num_updates=34800, lr=0.000339032, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=34465 epoch 021: 1076 / 1689 loss=4.014, nll_loss=2.378, ppl=5.2, wps=462570, ups=1.06, wpb=436360, bsz=16401.1, num_updates=34800, lr=0.000339032, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=34465 epoch 021: 1076 / 1689 loss=4.014, nll_loss=2.378, ppl=5.2, wps=462570, ups=1.06, wpb=436360, bsz=16401.1, num_updates=34800, lr=0.000339032, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=34465 epoch 021: 1076 / 1689 loss=4.014, nll_loss=2.378, ppl=5.2, wps=462570, ups=1.06, wpb=436360, bsz=16401.1, num_updates=34800, lr=0.000339032, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=34465 epoch 021: 1076 / 1689 loss=4.014, nll_loss=2.378, ppl=5.2, wps=462570, ups=1.06, wpb=436360, bsz=16401.1, num_updates=34800, lr=0.000339032, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=34465 epoch 021: 1076 / 1689 loss=4.014, nll_loss=2.378, ppl=5.2, wps=462570, ups=1.06, wpb=436360, bsz=16401.1, num_updates=34800, lr=0.000339032, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=34465 epoch 021: 1076 / 1689 loss=4.014, nll_loss=2.378, ppl=5.2, wps=462570, ups=1.06, wpb=436360, bsz=16401.1, num_updates=34800, lr=0.000339032, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=34465 epoch 021: 1076 / 1689 loss=4.014, nll_loss=2.378, ppl=5.2, wps=462570, ups=1.06, wpb=436360, bsz=16401.1, num_updates=34800, lr=0.000339032, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=34465 epoch 021: 1076 / 1689 loss=4.014, nll_loss=2.378, ppl=5.2, wps=462570, ups=1.06, wpb=436360, bsz=16401.1, num_updates=34800, lr=0.000339032, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=34465 epoch 021: 1076 / 1689 loss=4.014, nll_loss=2.378, ppl=5.2, wps=462570, ups=1.06, wpb=436360, bsz=16401.1, num_updates=34800, lr=0.000339032, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=34465 epoch 021: 1076 / 1689 loss=4.014, nll_loss=2.378, ppl=5.2, wps=462570, ups=1.06, wpb=436360, bsz=16401.1, num_updates=34800, lr=0.000339032, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=34465 epoch 021: 1076 / 1689 loss=4.014, nll_loss=2.378, ppl=5.2, wps=462570, ups=1.06, wpb=436360, bsz=16401.1, num_updates=34800, lr=0.000339032, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=34465 epoch 021: 1076 / 1689 loss=4.014, nll_loss=2.378, ppl=5.2, wps=462570, ups=1.06, wpb=436360, bsz=16401.1, num_updates=34800, lr=0.000339032, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=34465 epoch 021: 1076 / 1689 loss=4.014, nll_loss=2.378, ppl=5.2, wps=462570, ups=1.06, wpb=436360, bsz=16401.1, num_updates=34800, lr=0.000339032, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=34465 epoch 021: 1076 / 1689 loss=4.014, nll_loss=2.378, ppl=5.2, wps=462570, ups=1.06, wpb=436360, bsz=16401.1, num_updates=34800, lr=0.000339032, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=34465 epoch 021: 1076 / 1689 loss=4.014, nll_loss=2.378, ppl=5.2, wps=462570, ups=1.06, wpb=436360, bsz=16401.1, num_updates=34800, lr=0.000339032, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=34465 epoch 021: 1176 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=456393, ups=1.05, wpb=433201, bsz=16832.9, num_updates=34900, lr=0.000338546, gnorm=0.215, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=34560 epoch 021: 1176 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=456393, ups=1.05, wpb=433201, bsz=16832.9, num_updates=34900, lr=0.000338546, gnorm=0.215, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=34560 epoch 021: 1176 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=456393, ups=1.05, wpb=433201, bsz=16832.9, num_updates=34900, lr=0.000338546, gnorm=0.215, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=34560 epoch 021: 1176 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=456393, ups=1.05, wpb=433201, bsz=16832.9, num_updates=34900, lr=0.000338546, gnorm=0.215, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=34560 epoch 021: 1176 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=456393, ups=1.05, wpb=433201, bsz=16832.9, num_updates=34900, lr=0.000338546, gnorm=0.215, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=34560 epoch 021: 1176 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=456393, ups=1.05, wpb=433201, bsz=16832.9, num_updates=34900, lr=0.000338546, gnorm=0.215, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=34560 epoch 021: 1176 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=456393, ups=1.05, wpb=433201, bsz=16832.9, num_updates=34900, lr=0.000338546, gnorm=0.215, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=34560 epoch 021: 1176 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=456393, ups=1.05, wpb=433201, bsz=16832.9, num_updates=34900, lr=0.000338546, gnorm=0.215, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=34560 epoch 021: 1176 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=456393, ups=1.05, wpb=433201, bsz=16832.9, num_updates=34900, lr=0.000338546, gnorm=0.215, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=34560 epoch 021: 1176 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=456393, ups=1.05, wpb=433201, bsz=16832.9, num_updates=34900, lr=0.000338546, gnorm=0.215, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=34560 epoch 021: 1176 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=456393, ups=1.05, wpb=433201, bsz=16832.9, num_updates=34900, lr=0.000338546, gnorm=0.215, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=34560 epoch 021: 1176 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=456393, ups=1.05, wpb=433201, bsz=16832.9, num_updates=34900, lr=0.000338546, gnorm=0.215, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=34560 epoch 021: 1176 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=456393, ups=1.05, wpb=433201, bsz=16832.9, num_updates=34900, lr=0.000338546, gnorm=0.215, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=34560 epoch 021: 1176 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=456393, ups=1.05, wpb=433201, bsz=16832.9, num_updates=34900, lr=0.000338546, gnorm=0.215, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=34560 epoch 021: 1176 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=456393, ups=1.05, wpb=433201, bsz=16832.9, num_updates=34900, lr=0.000338546, gnorm=0.215, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=34560 epoch 021: 1176 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=456393, ups=1.05, wpb=433201, bsz=16832.9, num_updates=34900, lr=0.000338546, gnorm=0.215, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=34560 epoch 021: 1176 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=456393, ups=1.05, wpb=433201, bsz=16832.9, num_updates=34900, lr=0.000338546, gnorm=0.215, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=34560 epoch 021: 1176 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=456393, ups=1.05, wpb=433201, bsz=16832.9, num_updates=34900, lr=0.000338546, gnorm=0.215, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=34560 epoch 021: 1176 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=456393, ups=1.05, wpb=433201, bsz=16832.9, num_updates=34900, lr=0.000338546, gnorm=0.215, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=34560 epoch 021: 1176 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=456393, ups=1.05, wpb=433201, bsz=16832.9, num_updates=34900, lr=0.000338546, gnorm=0.215, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=34560 epoch 021: 1176 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=456393, ups=1.05, wpb=433201, bsz=16832.9, num_updates=34900, lr=0.000338546, gnorm=0.215, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=34560 epoch 021: 1276 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=462271, ups=1.06, wpb=435732, bsz=16523.5, num_updates=35000, lr=0.000338062, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=34654 epoch 021: 1276 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=462271, ups=1.06, wpb=435732, bsz=16523.5, num_updates=35000, lr=0.000338062, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=34654 epoch 021: 1276 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=462271, ups=1.06, wpb=435732, bsz=16523.5, num_updates=35000, lr=0.000338062, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=34654 epoch 021: 1276 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=462271, ups=1.06, wpb=435732, bsz=16523.5, num_updates=35000, lr=0.000338062, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=34654 epoch 021: 1276 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=462271, ups=1.06, wpb=435732, bsz=16523.5, num_updates=35000, lr=0.000338062, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=34654 epoch 021: 1276 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=462271, ups=1.06, wpb=435732, bsz=16523.5, num_updates=35000, lr=0.000338062, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=34654 epoch 021: 1276 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=462271, ups=1.06, wpb=435732, bsz=16523.5, num_updates=35000, lr=0.000338062, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=34654 epoch 021: 1276 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=462271, ups=1.06, wpb=435732, bsz=16523.5, num_updates=35000, lr=0.000338062, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=34654 epoch 021: 1276 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=462271, ups=1.06, wpb=435732, bsz=16523.5, num_updates=35000, lr=0.000338062, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=34654 epoch 021: 1276 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=462271, ups=1.06, wpb=435732, bsz=16523.5, num_updates=35000, lr=0.000338062, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=34654 epoch 021: 1276 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=462271, ups=1.06, wpb=435732, bsz=16523.5, num_updates=35000, lr=0.000338062, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=34654 epoch 021: 1276 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=462271, ups=1.06, wpb=435732, bsz=16523.5, num_updates=35000, lr=0.000338062, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=34654 epoch 021: 1276 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=462271, ups=1.06, wpb=435732, bsz=16523.5, num_updates=35000, lr=0.000338062, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=34654 epoch 021: 1276 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=462271, ups=1.06, wpb=435732, bsz=16523.5, num_updates=35000, lr=0.000338062, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=34654 epoch 021: 1276 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=462271, ups=1.06, wpb=435732, bsz=16523.5, num_updates=35000, lr=0.000338062, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=34654 epoch 021: 1276 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=462271, ups=1.06, wpb=435732, bsz=16523.5, num_updates=35000, lr=0.000338062, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=34654 epoch 021: 1276 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=462271, ups=1.06, wpb=435732, bsz=16523.5, num_updates=35000, lr=0.000338062, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=34654 epoch 021: 1276 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=462271, ups=1.06, wpb=435732, bsz=16523.5, num_updates=35000, lr=0.000338062, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=34654 epoch 021: 1276 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=462271, ups=1.06, wpb=435732, bsz=16523.5, num_updates=35000, lr=0.000338062, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=34654 epoch 021: 1276 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=462271, ups=1.06, wpb=435732, bsz=16523.5, num_updates=35000, lr=0.000338062, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=34654 epoch 021: 1276 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=462271, ups=1.06, wpb=435732, bsz=16523.5, num_updates=35000, lr=0.000338062, gnorm=0.196, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=34654 begin validation on "valid" subset epoch 021 | valid on 'valid' subset | loss 4.255 | nll_loss 2.637 | ppl 6.22 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.249 epoch 021 | valid on 'valid' subset | loss 4.255 | nll_loss 2.637 | ppl 6.22 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.249 epoch 021 | valid on 'valid' subset | loss 4.255 | nll_loss 2.637 | ppl 6.22 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.249 epoch 021 | valid on 'valid' subset | loss 4.255 | nll_loss 2.637 | ppl 6.22 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.249 epoch 021 | valid on 'valid' subset | loss 4.255 | nll_loss 2.637 | ppl 6.22 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.249 epoch 021 | valid on 'valid' subset | loss 4.255 | nll_loss 2.637 | ppl 6.22 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.249 epoch 021 | valid on 'valid' subset | loss 4.255 | nll_loss 2.637 | ppl 6.22 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.249 epoch 021 | valid on 'valid' subset | loss 4.255 | nll_loss 2.637 | ppl 6.22 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.249 epoch 021 | valid on 'valid' subset | loss 4.255 | nll_loss 2.637 | ppl 6.22 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.249 epoch 021 | valid on 'valid' subset | loss 4.255 | nll_loss 2.637 | ppl 6.22 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.249 epoch 021 | valid on 'valid' subset | loss 4.255 | nll_loss 2.637 | ppl 6.22 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.249 epoch 021 | valid on 'valid' subset | loss 4.255 | nll_loss 2.637 | ppl 6.22 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.249 epoch 021 | valid on 'valid' subset | loss 4.255 | nll_loss 2.637 | ppl 6.22 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.249 epoch 021 | valid on 'valid' subset | loss 4.255 | nll_loss 2.637 | ppl 6.22 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.249 epoch 021 | valid on 'valid' subset | loss 4.255 | nll_loss 2.637 | ppl 6.22 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.249 epoch 021 | valid on 'valid' subset | loss 4.255 | nll_loss 2.637 | ppl 6.22 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.249 epoch 021 | valid on 'valid' subset | loss 4.255 | nll_loss 2.637 | ppl 6.22 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.249 epoch 021 | valid on 'valid' subset | loss 4.255 | nll_loss 2.637 | ppl 6.22 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.249 epoch 021 | valid on 'valid' subset | loss 4.255 | nll_loss 2.637 | ppl 6.22 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.249 epoch 021 | valid on 'valid' subset | loss 4.255 | nll_loss 2.637 | ppl 6.22 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.249 epoch 021 | valid on 'valid' subset | loss 4.255 | nll_loss 2.637 | ppl 6.22 | wps 0 | wpb 42662 | bsz 2032 | num_updates 35000 | best_loss 4.249 epoch 021: 1377 / 1689 loss=4.027, nll_loss=2.394, ppl=5.26, wps=403518, ups=0.93, wpb=433580, bsz=16186.2, num_updates=35100, lr=0.00033758, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=34762 epoch 021: 1377 / 1689 loss=4.027, nll_loss=2.394, ppl=5.26, wps=403518, ups=0.93, wpb=433580, bsz=16186.2, num_updates=35100, lr=0.00033758, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=34762 epoch 021: 1377 / 1689 loss=4.027, nll_loss=2.394, ppl=5.26, wps=403518, ups=0.93, wpb=433580, bsz=16186.2, num_updates=35100, lr=0.00033758, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=34762 epoch 021: 1377 / 1689 loss=4.027, nll_loss=2.394, ppl=5.26, wps=403518, ups=0.93, wpb=433580, bsz=16186.2, num_updates=35100, lr=0.00033758, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=34762 epoch 021: 1377 / 1689 loss=4.027, nll_loss=2.394, ppl=5.26, wps=403518, ups=0.93, wpb=433580, bsz=16186.2, num_updates=35100, lr=0.00033758, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=34762 epoch 021: 1377 / 1689 loss=4.027, nll_loss=2.394, ppl=5.26, wps=403518, ups=0.93, wpb=433580, bsz=16186.2, num_updates=35100, lr=0.00033758, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=34762 epoch 021: 1377 / 1689 loss=4.027, nll_loss=2.394, ppl=5.26, wps=403518, ups=0.93, wpb=433580, bsz=16186.2, num_updates=35100, lr=0.00033758, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=34762 epoch 021: 1377 / 1689 loss=4.027, nll_loss=2.394, ppl=5.26, wps=403518, ups=0.93, wpb=433580, bsz=16186.2, num_updates=35100, lr=0.00033758, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=34762 epoch 021: 1377 / 1689 loss=4.027, nll_loss=2.394, ppl=5.26, wps=403518, ups=0.93, wpb=433580, bsz=16186.2, num_updates=35100, lr=0.00033758, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=34762 epoch 021: 1377 / 1689 loss=4.027, nll_loss=2.394, ppl=5.26, wps=403518, ups=0.93, wpb=433580, bsz=16186.2, num_updates=35100, lr=0.00033758, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=34762 epoch 021: 1377 / 1689 loss=4.027, nll_loss=2.394, ppl=5.26, wps=403518, ups=0.93, wpb=433580, bsz=16186.2, num_updates=35100, lr=0.00033758, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=34762 epoch 021: 1377 / 1689 loss=4.027, nll_loss=2.394, ppl=5.26, wps=403518, ups=0.93, wpb=433580, bsz=16186.2, num_updates=35100, lr=0.00033758, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=34762 epoch 021: 1377 / 1689 loss=4.027, nll_loss=2.394, ppl=5.26, wps=403518, ups=0.93, wpb=433580, bsz=16186.2, num_updates=35100, lr=0.00033758, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=34762 epoch 021: 1377 / 1689 loss=4.027, nll_loss=2.394, ppl=5.26, wps=403518, ups=0.93, wpb=433580, bsz=16186.2, num_updates=35100, lr=0.00033758, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=34762 epoch 021: 1377 / 1689 loss=4.027, nll_loss=2.394, ppl=5.26, wps=403518, ups=0.93, wpb=433580, bsz=16186.2, num_updates=35100, lr=0.00033758, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=34762 epoch 021: 1377 / 1689 loss=4.027, nll_loss=2.394, ppl=5.26, wps=403518, ups=0.93, wpb=433580, bsz=16186.2, num_updates=35100, lr=0.00033758, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=34762 epoch 021: 1377 / 1689 loss=4.027, nll_loss=2.394, ppl=5.26, wps=403518, ups=0.93, wpb=433580, bsz=16186.2, num_updates=35100, lr=0.00033758, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=34762 epoch 021: 1377 / 1689 loss=4.027, nll_loss=2.394, ppl=5.26, wps=403518, ups=0.93, wpb=433580, bsz=16186.2, num_updates=35100, lr=0.00033758, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=34762 epoch 021: 1377 / 1689 loss=4.027, nll_loss=2.394, ppl=5.26, wps=403518, ups=0.93, wpb=433580, bsz=16186.2, num_updates=35100, lr=0.00033758, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=34762 epoch 021: 1377 / 1689 loss=4.027, nll_loss=2.394, ppl=5.26, wps=403518, ups=0.93, wpb=433580, bsz=16186.2, num_updates=35100, lr=0.00033758, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=34762 epoch 021: 1377 / 1689 loss=4.027, nll_loss=2.394, ppl=5.26, wps=403518, ups=0.93, wpb=433580, bsz=16186.2, num_updates=35100, lr=0.00033758, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=34762 epoch 021: 1477 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=459373, ups=1.06, wpb=432880, bsz=16354.8, num_updates=35200, lr=0.0003371, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=34856 epoch 021: 1477 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=459373, ups=1.06, wpb=432880, bsz=16354.8, num_updates=35200, lr=0.0003371, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=34856 epoch 021: 1477 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=459373, ups=1.06, wpb=432880, bsz=16354.8, num_updates=35200, lr=0.0003371, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=34856 epoch 021: 1477 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=459373, ups=1.06, wpb=432880, bsz=16354.8, num_updates=35200, lr=0.0003371, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=34856 epoch 021: 1477 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=459373, ups=1.06, wpb=432880, bsz=16354.8, num_updates=35200, lr=0.0003371, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=34856 epoch 021: 1477 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=459373, ups=1.06, wpb=432880, bsz=16354.8, num_updates=35200, lr=0.0003371, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=34856 epoch 021: 1477 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=459373, ups=1.06, wpb=432880, bsz=16354.8, num_updates=35200, lr=0.0003371, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=34856 epoch 021: 1477 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=459373, ups=1.06, wpb=432880, bsz=16354.8, num_updates=35200, lr=0.0003371, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=34856 epoch 021: 1477 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=459373, ups=1.06, wpb=432880, bsz=16354.8, num_updates=35200, lr=0.0003371, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=34856 epoch 021: 1477 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=459373, ups=1.06, wpb=432880, bsz=16354.8, num_updates=35200, lr=0.0003371, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=34856 epoch 021: 1477 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=459373, ups=1.06, wpb=432880, bsz=16354.8, num_updates=35200, lr=0.0003371, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=34856 epoch 021: 1477 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=459373, ups=1.06, wpb=432880, bsz=16354.8, num_updates=35200, lr=0.0003371, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=34856 epoch 021: 1477 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=459373, ups=1.06, wpb=432880, bsz=16354.8, num_updates=35200, lr=0.0003371, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=34856 epoch 021: 1477 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=459373, ups=1.06, wpb=432880, bsz=16354.8, num_updates=35200, lr=0.0003371, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=34856 epoch 021: 1477 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=459373, ups=1.06, wpb=432880, bsz=16354.8, num_updates=35200, lr=0.0003371, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=34856 epoch 021: 1477 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=459373, ups=1.06, wpb=432880, bsz=16354.8, num_updates=35200, lr=0.0003371, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=34856 epoch 021: 1477 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=459373, ups=1.06, wpb=432880, bsz=16354.8, num_updates=35200, lr=0.0003371, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=34856 epoch 021: 1477 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=459373, ups=1.06, wpb=432880, bsz=16354.8, num_updates=35200, lr=0.0003371, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=34856 epoch 021: 1477 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=459373, ups=1.06, wpb=432880, bsz=16354.8, num_updates=35200, lr=0.0003371, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=34856 epoch 021: 1477 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=459373, ups=1.06, wpb=432880, bsz=16354.8, num_updates=35200, lr=0.0003371, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=34856 epoch 021: 1477 / 1689 loss=4.03, nll_loss=2.397, ppl=5.27, wps=459373, ups=1.06, wpb=432880, bsz=16354.8, num_updates=35200, lr=0.0003371, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=34856 epoch 021: 1577 / 1689 loss=4.02, nll_loss=2.386, ppl=5.23, wps=462407, ups=1.07, wpb=434126, bsz=16494.2, num_updates=35300, lr=0.000336622, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=34950 epoch 021: 1577 / 1689 loss=4.02, nll_loss=2.386, ppl=5.23, wps=462407, ups=1.07, wpb=434126, bsz=16494.2, num_updates=35300, lr=0.000336622, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=34950 epoch 021: 1577 / 1689 loss=4.02, nll_loss=2.386, ppl=5.23, wps=462407, ups=1.07, wpb=434126, bsz=16494.2, num_updates=35300, lr=0.000336622, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=34950 epoch 021: 1577 / 1689 loss=4.02, nll_loss=2.386, ppl=5.23, wps=462407, ups=1.07, wpb=434126, bsz=16494.2, num_updates=35300, lr=0.000336622, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=34950 epoch 021: 1577 / 1689 loss=4.02, nll_loss=2.386, ppl=5.23, wps=462407, ups=1.07, wpb=434126, bsz=16494.2, num_updates=35300, lr=0.000336622, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=34950 epoch 021: 1577 / 1689 loss=4.02, nll_loss=2.386, ppl=5.23, wps=462407, ups=1.07, wpb=434126, bsz=16494.2, num_updates=35300, lr=0.000336622, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=34950 epoch 021: 1577 / 1689 loss=4.02, nll_loss=2.386, ppl=5.23, wps=462407, ups=1.07, wpb=434126, bsz=16494.2, num_updates=35300, lr=0.000336622, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=34950 epoch 021: 1577 / 1689 loss=4.02, nll_loss=2.386, ppl=5.23, wps=462407, ups=1.07, wpb=434126, bsz=16494.2, num_updates=35300, lr=0.000336622, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=34950 epoch 021: 1577 / 1689 loss=4.02, nll_loss=2.386, ppl=5.23, wps=462407, ups=1.07, wpb=434126, bsz=16494.2, num_updates=35300, lr=0.000336622, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=34950 epoch 021: 1577 / 1689 loss=4.02, nll_loss=2.386, ppl=5.23, wps=462407, ups=1.07, wpb=434126, bsz=16494.2, num_updates=35300, lr=0.000336622, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=34950 epoch 021: 1577 / 1689 loss=4.02, nll_loss=2.386, ppl=5.23, wps=462407, ups=1.07, wpb=434126, bsz=16494.2, num_updates=35300, lr=0.000336622, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=34950 epoch 021: 1577 / 1689 loss=4.02, nll_loss=2.386, ppl=5.23, wps=462407, ups=1.07, wpb=434126, bsz=16494.2, num_updates=35300, lr=0.000336622, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=34950 epoch 021: 1577 / 1689 loss=4.02, nll_loss=2.386, ppl=5.23, wps=462407, ups=1.07, wpb=434126, bsz=16494.2, num_updates=35300, lr=0.000336622, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=34950 epoch 021: 1577 / 1689 loss=4.02, nll_loss=2.386, ppl=5.23, wps=462407, ups=1.07, wpb=434126, bsz=16494.2, num_updates=35300, lr=0.000336622, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=34950 epoch 021: 1577 / 1689 loss=4.02, nll_loss=2.386, ppl=5.23, wps=462407, ups=1.07, wpb=434126, bsz=16494.2, num_updates=35300, lr=0.000336622, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=34950 epoch 021: 1577 / 1689 loss=4.02, nll_loss=2.386, ppl=5.23, wps=462407, ups=1.07, wpb=434126, bsz=16494.2, num_updates=35300, lr=0.000336622, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=34950 epoch 021: 1577 / 1689 loss=4.02, nll_loss=2.386, ppl=5.23, wps=462407, ups=1.07, wpb=434126, bsz=16494.2, num_updates=35300, lr=0.000336622, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=34950 epoch 021: 1577 / 1689 loss=4.02, nll_loss=2.386, ppl=5.23, wps=462407, ups=1.07, wpb=434126, bsz=16494.2, num_updates=35300, lr=0.000336622, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=34950 epoch 021: 1577 / 1689 loss=4.02, nll_loss=2.386, ppl=5.23, wps=462407, ups=1.07, wpb=434126, bsz=16494.2, num_updates=35300, lr=0.000336622, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=34950 epoch 021: 1577 / 1689 loss=4.02, nll_loss=2.386, ppl=5.23, wps=462407, ups=1.07, wpb=434126, bsz=16494.2, num_updates=35300, lr=0.000336622, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=34950 epoch 021: 1577 / 1689 loss=4.02, nll_loss=2.386, ppl=5.23, wps=462407, ups=1.07, wpb=434126, bsz=16494.2, num_updates=35300, lr=0.000336622, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=34950 epoch 021: 1677 / 1689 loss=4.023, nll_loss=2.389, ppl=5.24, wps=461795, ups=1.06, wpb=434118, bsz=16610.2, num_updates=35400, lr=0.000336146, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=35044 epoch 021: 1677 / 1689 loss=4.023, nll_loss=2.389, ppl=5.24, wps=461795, ups=1.06, wpb=434118, bsz=16610.2, num_updates=35400, lr=0.000336146, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=35044 epoch 021: 1677 / 1689 loss=4.023, nll_loss=2.389, ppl=5.24, wps=461795, ups=1.06, wpb=434118, bsz=16610.2, num_updates=35400, lr=0.000336146, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=35044 epoch 021: 1677 / 1689 loss=4.023, nll_loss=2.389, ppl=5.24, wps=461795, ups=1.06, wpb=434118, bsz=16610.2, num_updates=35400, lr=0.000336146, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=35044 epoch 021: 1677 / 1689 loss=4.023, nll_loss=2.389, ppl=5.24, wps=461795, ups=1.06, wpb=434118, bsz=16610.2, num_updates=35400, lr=0.000336146, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=35044 epoch 021: 1677 / 1689 loss=4.023, nll_loss=2.389, ppl=5.24, wps=461795, ups=1.06, wpb=434118, bsz=16610.2, num_updates=35400, lr=0.000336146, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=35044 epoch 021: 1677 / 1689 loss=4.023, nll_loss=2.389, ppl=5.24, wps=461795, ups=1.06, wpb=434118, bsz=16610.2, num_updates=35400, lr=0.000336146, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=35044 epoch 021: 1677 / 1689 loss=4.023, nll_loss=2.389, ppl=5.24, wps=461795, ups=1.06, wpb=434118, bsz=16610.2, num_updates=35400, lr=0.000336146, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=35044 epoch 021: 1677 / 1689 loss=4.023, nll_loss=2.389, ppl=5.24, wps=461795, ups=1.06, wpb=434118, bsz=16610.2, num_updates=35400, lr=0.000336146, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=35044 epoch 021: 1677 / 1689 loss=4.023, nll_loss=2.389, ppl=5.24, wps=461795, ups=1.06, wpb=434118, bsz=16610.2, num_updates=35400, lr=0.000336146, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=35044 epoch 021: 1677 / 1689 loss=4.023, nll_loss=2.389, ppl=5.24, wps=461795, ups=1.06, wpb=434118, bsz=16610.2, num_updates=35400, lr=0.000336146, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=35044 epoch 021: 1677 / 1689 loss=4.023, nll_loss=2.389, ppl=5.24, wps=461795, ups=1.06, wpb=434118, bsz=16610.2, num_updates=35400, lr=0.000336146, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=35044 epoch 021: 1677 / 1689 loss=4.023, nll_loss=2.389, ppl=5.24, wps=461795, ups=1.06, wpb=434118, bsz=16610.2, num_updates=35400, lr=0.000336146, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=35044 epoch 021: 1677 / 1689 loss=4.023, nll_loss=2.389, ppl=5.24, wps=461795, ups=1.06, wpb=434118, bsz=16610.2, num_updates=35400, lr=0.000336146, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=35044 epoch 021: 1677 / 1689 loss=4.023, nll_loss=2.389, ppl=5.24, wps=461795, ups=1.06, wpb=434118, bsz=16610.2, num_updates=35400, lr=0.000336146, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=35044 epoch 021: 1677 / 1689 loss=4.023, nll_loss=2.389, ppl=5.24, wps=461795, ups=1.06, wpb=434118, bsz=16610.2, num_updates=35400, lr=0.000336146, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=35044 epoch 021: 1677 / 1689 loss=4.023, nll_loss=2.389, ppl=5.24, wps=461795, ups=1.06, wpb=434118, bsz=16610.2, num_updates=35400, lr=0.000336146, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=35044 epoch 021: 1677 / 1689 loss=4.023, nll_loss=2.389, ppl=5.24, wps=461795, ups=1.06, wpb=434118, bsz=16610.2, num_updates=35400, lr=0.000336146, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=35044 epoch 021: 1677 / 1689 loss=4.023, nll_loss=2.389, ppl=5.24, wps=461795, ups=1.06, wpb=434118, bsz=16610.2, num_updates=35400, lr=0.000336146, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=35044 epoch 021: 1677 / 1689 loss=4.023, nll_loss=2.389, ppl=5.24, wps=461795, ups=1.06, wpb=434118, bsz=16610.2, num_updates=35400, lr=0.000336146, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=35044 epoch 021: 1677 / 1689 loss=4.023, nll_loss=2.389, ppl=5.24, wps=461795, ups=1.06, wpb=434118, bsz=16610.2, num_updates=35400, lr=0.000336146, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.2, wall=35044 end of epoch 21 (average epoch stats below) epoch 021 | loss 4.016 | nll_loss 2.38 | ppl 5.21 | wps 451441 | ups 1.04 | wpb 433508 | bsz 16503.8 | num_updates 35412 | lr 0.000336089 | gnorm 0.21 | clip 0 | loss_scale 0.5 | train_wall 1567 | gb_free 20.2 | wall 35054 epoch 021 | loss 4.016 | nll_loss 2.38 | ppl 5.21 | wps 451441 | ups 1.04 | wpb 433508 | bsz 16503.8 | num_updates 35412 | lr 0.000336089 | gnorm 0.21 | clip 0 | loss_scale 0.5 | train_wall 1567 | gb_free 20.2 | wall 35054 epoch 021 | loss 4.016 | nll_loss 2.38 | ppl 5.21 | wps 451441 | ups 1.04 | wpb 433508 | bsz 16503.8 | num_updates 35412 | lr 0.000336089 | gnorm 0.21 | clip 0 | loss_scale 0.5 | train_wall 1567 | gb_free 20.2 | wall 35054 epoch 021 | loss 4.016 | nll_loss 2.38 | ppl 5.21 | wps 451441 | ups 1.04 | wpb 433508 | bsz 16503.8 | num_updates 35412 | lr 0.000336089 | gnorm 0.21 | clip 0 | loss_scale 0.5 | train_wall 1567 | gb_free 20.2 | wall 35054 epoch 021 | loss 4.016 | nll_loss 2.38 | ppl 5.21 | wps 451441 | ups 1.04 | wpb 433508 | bsz 16503.8 | num_updates 35412 | lr 0.000336089 | gnorm 0.21 | clip 0 | loss_scale 0.5 | train_wall 1567 | gb_free 20.2 | wall 35054 epoch 021 | loss 4.016 | nll_loss 2.38 | ppl 5.21 | wps 451441 | ups 1.04 | wpb 433508 | bsz 16503.8 | num_updates 35412 | lr 0.000336089 | gnorm 0.21 | clip 0 | loss_scale 0.5 | train_wall 1567 | gb_free 20.2 | wall 35054 epoch 021 | loss 4.016 | nll_loss 2.38 | ppl 5.21 | wps 451441 | ups 1.04 | wpb 433508 | bsz 16503.8 | num_updates 35412 | lr 0.000336089 | gnorm 0.21 | clip 0 | loss_scale 0.5 | train_wall 1567 | gb_free 20.2 | wall 35054 epoch 021 | loss 4.016 | nll_loss 2.38 | ppl 5.21 | wps 451441 | ups 1.04 | wpb 433508 | bsz 16503.8 | num_updates 35412 | lr 0.000336089 | gnorm 0.21 | clip 0 | loss_scale 0.5 | train_wall 1567 | gb_free 20.2 | wall 35054 epoch 021 | loss 4.016 | nll_loss 2.38 | ppl 5.21 | wps 451441 | ups 1.04 | wpb 433508 | bsz 16503.8 | num_updates 35412 | lr 0.000336089 | gnorm 0.21 | clip 0 | loss_scale 0.5 | train_wall 1567 | gb_free 20.2 | wall 35054 epoch 021 | loss 4.016 | nll_loss 2.38 | ppl 5.21 | wps 451441 | ups 1.04 | wpb 433508 | bsz 16503.8 | num_updates 35412 | lr 0.000336089 | gnorm 0.21 | clip 0 | loss_scale 0.5 | train_wall 1567 | gb_free 20.2 | wall 35054 epoch 021 | loss 4.016 | nll_loss 2.38 | ppl 5.21 | wps 451441 | ups 1.04 | wpb 433508 | bsz 16503.8 | num_updates 35412 | lr 0.000336089 | gnorm 0.21 | clip 0 | loss_scale 0.5 | train_wall 1567 | gb_free 20.2 | wall 35054 epoch 021 | loss 4.016 | nll_loss 2.38 | ppl 5.21 | wps 451441 | ups 1.04 | wpb 433508 | bsz 16503.8 | num_updates 35412 | lr 0.000336089 | gnorm 0.21 | clip 0 | loss_scale 0.5 | train_wall 1567 | gb_free 20.2 | wall 35054 epoch 021 | loss 4.016 | nll_loss 2.38 | ppl 5.21 | wps 451441 | ups 1.04 | wpb 433508 | bsz 16503.8 | num_updates 35412 | lr 0.000336089 | gnorm 0.21 | clip 0 | loss_scale 0.5 | train_wall 1567 | gb_free 20.2 | wall 35054 epoch 021 | loss 4.016 | nll_loss 2.38 | ppl 5.21 | wps 451441 | ups 1.04 | wpb 433508 | bsz 16503.8 | num_updates 35412 | lr 0.000336089 | gnorm 0.21 | clip 0 | loss_scale 0.5 | train_wall 1567 | gb_free 20.2 | wall 35054 epoch 021 | loss 4.016 | nll_loss 2.38 | ppl 5.21 | wps 451441 | ups 1.04 | wpb 433508 | bsz 16503.8 | num_updates 35412 | lr 0.000336089 | gnorm 0.21 | clip 0 | loss_scale 0.5 | train_wall 1567 | gb_free 20.2 | wall 35054 epoch 021 | loss 4.016 | nll_loss 2.38 | ppl 5.21 | wps 451441 | ups 1.04 | wpb 433508 | bsz 16503.8 | num_updates 35412 | lr 0.000336089 | gnorm 0.21 | clip 0 | loss_scale 0.5 | train_wall 1567 | gb_free 20.2 | wall 35054 epoch 021 | loss 4.016 | nll_loss 2.38 | ppl 5.21 | wps 451441 | ups 1.04 | wpb 433508 | bsz 16503.8 | num_updates 35412 | lr 0.000336089 | gnorm 0.21 | clip 0 | loss_scale 0.5 | train_wall 1567 | gb_free 20.2 | wall 35054 epoch 021 | loss 4.016 | nll_loss 2.38 | ppl 5.21 | wps 451441 | ups 1.04 | wpb 433508 | bsz 16503.8 | num_updates 35412 | lr 0.000336089 | gnorm 0.21 | clip 0 | loss_scale 0.5 | train_wall 1567 | gb_free 20.2 | wall 35054 epoch 021 | loss 4.016 | nll_loss 2.38 | ppl 5.21 | wps 451441 | ups 1.04 | wpb 433508 | bsz 16503.8 | num_updates 35412 | lr 0.000336089 | gnorm 0.21 | clip 0 | loss_scale 0.5 | train_wall 1567 | gb_free 20.2 | wall 35054 epoch 021 | loss 4.016 | nll_loss 2.38 | ppl 5.21 | wps 451441 | ups 1.04 | wpb 433508 | bsz 16503.8 | num_updates 35412 | lr 0.000336089 | gnorm 0.21 | clip 0 | loss_scale 0.5 | train_wall 1567 | gb_free 20.2 | wall 35054 epoch 021 | loss 4.016 | nll_loss 2.38 | ppl 5.21 | wps 451441 | ups 1.04 | wpb 433508 | bsz 16503.8 | num_updates 35412 | lr 0.000336089 | gnorm 0.21 | clip 0 | loss_scale 0.5 | train_wall 1567 | gb_free 20.2 | wall 35054 Start iterating over samples epoch 022: 88 / 1689 loss=3.98, nll_loss=2.34, ppl=5.06, wps=456292, ups=1.06, wpb=430578, bsz=16346.3, num_updates=35500, lr=0.000335673, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=35138 epoch 022: 88 / 1689 loss=3.98, nll_loss=2.34, ppl=5.06, wps=456292, ups=1.06, wpb=430578, bsz=16346.3, num_updates=35500, lr=0.000335673, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=35138 epoch 022: 88 / 1689 loss=3.98, nll_loss=2.34, ppl=5.06, wps=456292, ups=1.06, wpb=430578, bsz=16346.3, num_updates=35500, lr=0.000335673, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=35138 epoch 022: 88 / 1689 loss=3.98, nll_loss=2.34, ppl=5.06, wps=456292, ups=1.06, wpb=430578, bsz=16346.3, num_updates=35500, lr=0.000335673, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=35138 epoch 022: 88 / 1689 loss=3.98, nll_loss=2.34, ppl=5.06, wps=456292, ups=1.06, wpb=430578, bsz=16346.3, num_updates=35500, lr=0.000335673, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=35138 epoch 022: 88 / 1689 loss=3.98, nll_loss=2.34, ppl=5.06, wps=456292, ups=1.06, wpb=430578, bsz=16346.3, num_updates=35500, lr=0.000335673, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=35138 epoch 022: 88 / 1689 loss=3.98, nll_loss=2.34, ppl=5.06, wps=456292, ups=1.06, wpb=430578, bsz=16346.3, num_updates=35500, lr=0.000335673, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=35138 epoch 022: 88 / 1689 loss=3.98, nll_loss=2.34, ppl=5.06, wps=456292, ups=1.06, wpb=430578, bsz=16346.3, num_updates=35500, lr=0.000335673, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=35138 epoch 022: 88 / 1689 loss=3.98, nll_loss=2.34, ppl=5.06, wps=456292, ups=1.06, wpb=430578, bsz=16346.3, num_updates=35500, lr=0.000335673, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=35138 epoch 022: 88 / 1689 loss=3.98, nll_loss=2.34, ppl=5.06, wps=456292, ups=1.06, wpb=430578, bsz=16346.3, num_updates=35500, lr=0.000335673, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=35138 epoch 022: 88 / 1689 loss=3.98, nll_loss=2.34, ppl=5.06, wps=456292, ups=1.06, wpb=430578, bsz=16346.3, num_updates=35500, lr=0.000335673, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=35138 epoch 022: 88 / 1689 loss=3.98, nll_loss=2.34, ppl=5.06, wps=456292, ups=1.06, wpb=430578, bsz=16346.3, num_updates=35500, lr=0.000335673, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=35138 epoch 022: 88 / 1689 loss=3.98, nll_loss=2.34, ppl=5.06, wps=456292, ups=1.06, wpb=430578, bsz=16346.3, num_updates=35500, lr=0.000335673, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=35138 epoch 022: 88 / 1689 loss=3.98, nll_loss=2.34, ppl=5.06, wps=456292, ups=1.06, wpb=430578, bsz=16346.3, num_updates=35500, lr=0.000335673, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=35138 epoch 022: 88 / 1689 loss=3.98, nll_loss=2.34, ppl=5.06, wps=456292, ups=1.06, wpb=430578, bsz=16346.3, num_updates=35500, lr=0.000335673, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=35138 epoch 022: 88 / 1689 loss=3.98, nll_loss=2.34, ppl=5.06, wps=456292, ups=1.06, wpb=430578, bsz=16346.3, num_updates=35500, lr=0.000335673, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=35138 epoch 022: 88 / 1689 loss=3.98, nll_loss=2.34, ppl=5.06, wps=456292, ups=1.06, wpb=430578, bsz=16346.3, num_updates=35500, lr=0.000335673, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=35138 epoch 022: 88 / 1689 loss=3.98, nll_loss=2.34, ppl=5.06, wps=456292, ups=1.06, wpb=430578, bsz=16346.3, num_updates=35500, lr=0.000335673, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=35138 epoch 022: 88 / 1689 loss=3.98, nll_loss=2.34, ppl=5.06, wps=456292, ups=1.06, wpb=430578, bsz=16346.3, num_updates=35500, lr=0.000335673, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=35138 epoch 022: 88 / 1689 loss=3.98, nll_loss=2.34, ppl=5.06, wps=456292, ups=1.06, wpb=430578, bsz=16346.3, num_updates=35500, lr=0.000335673, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=35138 epoch 022: 88 / 1689 loss=3.98, nll_loss=2.34, ppl=5.06, wps=456292, ups=1.06, wpb=430578, bsz=16346.3, num_updates=35500, lr=0.000335673, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=35138 epoch 022: 88 / 1689 loss=3.98, nll_loss=2.34, ppl=5.06, wps=456292, ups=1.06, wpb=430578, bsz=16346.3, num_updates=35500, lr=0.000335673, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.5, wall=35138 epoch 022: 188 / 1689 loss=3.985, nll_loss=2.345, ppl=5.08, wps=457240, ups=1.06, wpb=431102, bsz=16354.3, num_updates=35600, lr=0.000335201, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=20.6, wall=35232 epoch 022: 188 / 1689 loss=3.985, nll_loss=2.345, ppl=5.08, wps=457240, ups=1.06, wpb=431102, bsz=16354.3, num_updates=35600, lr=0.000335201, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=20.6, wall=35232 epoch 022: 188 / 1689 loss=3.985, nll_loss=2.345, ppl=5.08, wps=457240, ups=1.06, wpb=431102, bsz=16354.3, num_updates=35600, lr=0.000335201, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=20.6, wall=35232 epoch 022: 188 / 1689 loss=3.985, nll_loss=2.345, ppl=5.08, wps=457240, ups=1.06, wpb=431102, bsz=16354.3, num_updates=35600, lr=0.000335201, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=20.6, wall=35232 epoch 022: 188 / 1689 loss=3.985, nll_loss=2.345, ppl=5.08, wps=457240, ups=1.06, wpb=431102, bsz=16354.3, num_updates=35600, lr=0.000335201, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=20.6, wall=35232 epoch 022: 188 / 1689 loss=3.985, nll_loss=2.345, ppl=5.08, wps=457240, ups=1.06, wpb=431102, bsz=16354.3, num_updates=35600, lr=0.000335201, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=20.6, wall=35232 epoch 022: 188 / 1689 loss=3.985, nll_loss=2.345, ppl=5.08, wps=457240, ups=1.06, wpb=431102, bsz=16354.3, num_updates=35600, lr=0.000335201, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=20.6, wall=35232 epoch 022: 188 / 1689 loss=3.985, nll_loss=2.345, ppl=5.08, wps=457240, ups=1.06, wpb=431102, bsz=16354.3, num_updates=35600, lr=0.000335201, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=20.6, wall=35232 epoch 022: 188 / 1689 loss=3.985, nll_loss=2.345, ppl=5.08, wps=457240, ups=1.06, wpb=431102, bsz=16354.3, num_updates=35600, lr=0.000335201, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=20.6, wall=35232 epoch 022: 188 / 1689 loss=3.985, nll_loss=2.345, ppl=5.08, wps=457240, ups=1.06, wpb=431102, bsz=16354.3, num_updates=35600, lr=0.000335201, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=20.6, wall=35232 epoch 022: 188 / 1689 loss=3.985, nll_loss=2.345, ppl=5.08, wps=457240, ups=1.06, wpb=431102, bsz=16354.3, num_updates=35600, lr=0.000335201, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=20.6, wall=35232 epoch 022: 188 / 1689 loss=3.985, nll_loss=2.345, ppl=5.08, wps=457240, ups=1.06, wpb=431102, bsz=16354.3, num_updates=35600, lr=0.000335201, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=20.6, wall=35232 epoch 022: 188 / 1689 loss=3.985, nll_loss=2.345, ppl=5.08, wps=457240, ups=1.06, wpb=431102, bsz=16354.3, num_updates=35600, lr=0.000335201, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=20.6, wall=35232 epoch 022: 188 / 1689 loss=3.985, nll_loss=2.345, ppl=5.08, wps=457240, ups=1.06, wpb=431102, bsz=16354.3, num_updates=35600, lr=0.000335201, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=20.6, wall=35232 epoch 022: 188 / 1689 loss=3.985, nll_loss=2.345, ppl=5.08, wps=457240, ups=1.06, wpb=431102, bsz=16354.3, num_updates=35600, lr=0.000335201, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=20.6, wall=35232 epoch 022: 188 / 1689 loss=3.985, nll_loss=2.345, ppl=5.08, wps=457240, ups=1.06, wpb=431102, bsz=16354.3, num_updates=35600, lr=0.000335201, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=20.6, wall=35232 epoch 022: 188 / 1689 loss=3.985, nll_loss=2.345, ppl=5.08, wps=457240, ups=1.06, wpb=431102, bsz=16354.3, num_updates=35600, lr=0.000335201, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=20.6, wall=35232 epoch 022: 188 / 1689 loss=3.985, nll_loss=2.345, ppl=5.08, wps=457240, ups=1.06, wpb=431102, bsz=16354.3, num_updates=35600, lr=0.000335201, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=20.6, wall=35232 epoch 022: 188 / 1689 loss=3.985, nll_loss=2.345, ppl=5.08, wps=457240, ups=1.06, wpb=431102, bsz=16354.3, num_updates=35600, lr=0.000335201, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=20.6, wall=35232 epoch 022: 188 / 1689 loss=3.985, nll_loss=2.345, ppl=5.08, wps=457240, ups=1.06, wpb=431102, bsz=16354.3, num_updates=35600, lr=0.000335201, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=20.6, wall=35232 epoch 022: 188 / 1689 loss=3.985, nll_loss=2.345, ppl=5.08, wps=457240, ups=1.06, wpb=431102, bsz=16354.3, num_updates=35600, lr=0.000335201, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=20.6, wall=35232 epoch 022: 188 / 1689 loss=3.985, nll_loss=2.345, ppl=5.08, wps=457240, ups=1.06, wpb=431102, bsz=16354.3, num_updates=35600, lr=0.000335201, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=20.6, wall=35232 epoch 022: 288 / 1689 loss=4.003, nll_loss=2.365, ppl=5.15, wps=460241, ups=1.06, wpb=433298, bsz=16536.7, num_updates=35700, lr=0.000334731, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=35327 epoch 022: 288 / 1689 loss=4.003, nll_loss=2.365, ppl=5.15, wps=460241, ups=1.06, wpb=433298, bsz=16536.7, num_updates=35700, lr=0.000334731, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=35327 epoch 022: 288 / 1689 loss=4.003, nll_loss=2.365, ppl=5.15, wps=460241, ups=1.06, wpb=433298, bsz=16536.7, num_updates=35700, lr=0.000334731, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=35327 epoch 022: 288 / 1689 loss=4.003, nll_loss=2.365, ppl=5.15, wps=460241, ups=1.06, wpb=433298, bsz=16536.7, num_updates=35700, lr=0.000334731, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=35327 epoch 022: 288 / 1689 loss=4.003, nll_loss=2.365, ppl=5.15, wps=460241, ups=1.06, wpb=433298, bsz=16536.7, num_updates=35700, lr=0.000334731, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=35327 epoch 022: 288 / 1689 loss=4.003, nll_loss=2.365, ppl=5.15, wps=460241, ups=1.06, wpb=433298, bsz=16536.7, num_updates=35700, lr=0.000334731, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=35327 epoch 022: 288 / 1689 loss=4.003, nll_loss=2.365, ppl=5.15, wps=460241, ups=1.06, wpb=433298, bsz=16536.7, num_updates=35700, lr=0.000334731, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=35327 epoch 022: 288 / 1689 loss=4.003, nll_loss=2.365, ppl=5.15, wps=460241, ups=1.06, wpb=433298, bsz=16536.7, num_updates=35700, lr=0.000334731, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=35327 epoch 022: 288 / 1689 loss=4.003, nll_loss=2.365, ppl=5.15, wps=460241, ups=1.06, wpb=433298, bsz=16536.7, num_updates=35700, lr=0.000334731, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=35327 epoch 022: 288 / 1689 loss=4.003, nll_loss=2.365, ppl=5.15, wps=460241, ups=1.06, wpb=433298, bsz=16536.7, num_updates=35700, lr=0.000334731, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=35327 epoch 022: 288 / 1689 loss=4.003, nll_loss=2.365, ppl=5.15, wps=460241, ups=1.06, wpb=433298, bsz=16536.7, num_updates=35700, lr=0.000334731, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=35327 epoch 022: 288 / 1689 loss=4.003, nll_loss=2.365, ppl=5.15, wps=460241, ups=1.06, wpb=433298, bsz=16536.7, num_updates=35700, lr=0.000334731, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=35327 epoch 022: 288 / 1689 loss=4.003, nll_loss=2.365, ppl=5.15, wps=460241, ups=1.06, wpb=433298, bsz=16536.7, num_updates=35700, lr=0.000334731, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=35327 epoch 022: 288 / 1689 loss=4.003, nll_loss=2.365, ppl=5.15, wps=460241, ups=1.06, wpb=433298, bsz=16536.7, num_updates=35700, lr=0.000334731, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=35327 epoch 022: 288 / 1689 loss=4.003, nll_loss=2.365, ppl=5.15, wps=460241, ups=1.06, wpb=433298, bsz=16536.7, num_updates=35700, lr=0.000334731, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=35327 epoch 022: 288 / 1689 loss=4.003, nll_loss=2.365, ppl=5.15, wps=460241, ups=1.06, wpb=433298, bsz=16536.7, num_updates=35700, lr=0.000334731, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=35327 epoch 022: 288 / 1689 loss=4.003, nll_loss=2.365, ppl=5.15, wps=460241, ups=1.06, wpb=433298, bsz=16536.7, num_updates=35700, lr=0.000334731, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=35327 epoch 022: 288 / 1689 loss=4.003, nll_loss=2.365, ppl=5.15, wps=460241, ups=1.06, wpb=433298, bsz=16536.7, num_updates=35700, lr=0.000334731, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=35327 epoch 022: 288 / 1689 loss=4.003, nll_loss=2.365, ppl=5.15, wps=460241, ups=1.06, wpb=433298, bsz=16536.7, num_updates=35700, lr=0.000334731, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=35327 epoch 022: 288 / 1689 loss=4.003, nll_loss=2.365, ppl=5.15, wps=460241, ups=1.06, wpb=433298, bsz=16536.7, num_updates=35700, lr=0.000334731, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=35327 epoch 022: 288 / 1689 loss=4.003, nll_loss=2.365, ppl=5.15, wps=460241, ups=1.06, wpb=433298, bsz=16536.7, num_updates=35700, lr=0.000334731, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=35327 epoch 022: 288 / 1689 loss=4.003, nll_loss=2.365, ppl=5.15, wps=460241, ups=1.06, wpb=433298, bsz=16536.7, num_updates=35700, lr=0.000334731, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=35327 epoch 022: 388 / 1689 loss=4.001, nll_loss=2.363, ppl=5.15, wps=461418, ups=1.06, wpb=434155, bsz=16641, num_updates=35800, lr=0.000334263, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=35421 epoch 022: 388 / 1689 loss=4.001, nll_loss=2.363, ppl=5.15, wps=461418, ups=1.06, wpb=434155, bsz=16641, num_updates=35800, lr=0.000334263, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=35421 epoch 022: 388 / 1689 loss=4.001, nll_loss=2.363, ppl=5.15, wps=461418, ups=1.06, wpb=434155, bsz=16641, num_updates=35800, lr=0.000334263, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=35421 epoch 022: 388 / 1689 loss=4.001, nll_loss=2.363, ppl=5.15, wps=461418, ups=1.06, wpb=434155, bsz=16641, num_updates=35800, lr=0.000334263, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=35421 epoch 022: 388 / 1689 loss=4.001, nll_loss=2.363, ppl=5.15, wps=461418, ups=1.06, wpb=434155, bsz=16641, num_updates=35800, lr=0.000334263, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=35421 epoch 022: 388 / 1689 loss=4.001, nll_loss=2.363, ppl=5.15, wps=461418, ups=1.06, wpb=434155, bsz=16641, num_updates=35800, lr=0.000334263, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=35421 epoch 022: 388 / 1689 loss=4.001, nll_loss=2.363, ppl=5.15, wps=461418, ups=1.06, wpb=434155, bsz=16641, num_updates=35800, lr=0.000334263, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=35421 epoch 022: 388 / 1689 loss=4.001, nll_loss=2.363, ppl=5.15, wps=461418, ups=1.06, wpb=434155, bsz=16641, num_updates=35800, lr=0.000334263, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=35421 epoch 022: 388 / 1689 loss=4.001, nll_loss=2.363, ppl=5.15, wps=461418, ups=1.06, wpb=434155, bsz=16641, num_updates=35800, lr=0.000334263, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=35421 epoch 022: 388 / 1689 loss=4.001, nll_loss=2.363, ppl=5.15, wps=461418, ups=1.06, wpb=434155, bsz=16641, num_updates=35800, lr=0.000334263, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=35421 epoch 022: 388 / 1689 loss=4.001, nll_loss=2.363, ppl=5.15, wps=461418, ups=1.06, wpb=434155, bsz=16641, num_updates=35800, lr=0.000334263, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=35421 epoch 022: 388 / 1689 loss=4.001, nll_loss=2.363, ppl=5.15, wps=461418, ups=1.06, wpb=434155, bsz=16641, num_updates=35800, lr=0.000334263, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=35421 epoch 022: 388 / 1689 loss=4.001, nll_loss=2.363, ppl=5.15, wps=461418, ups=1.06, wpb=434155, bsz=16641, num_updates=35800, lr=0.000334263, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=35421 epoch 022: 388 / 1689 loss=4.001, nll_loss=2.363, ppl=5.15, wps=461418, ups=1.06, wpb=434155, bsz=16641, num_updates=35800, lr=0.000334263, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=35421 epoch 022: 388 / 1689 loss=4.001, nll_loss=2.363, ppl=5.15, wps=461418, ups=1.06, wpb=434155, bsz=16641, num_updates=35800, lr=0.000334263, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=35421 epoch 022: 388 / 1689 loss=4.001, nll_loss=2.363, ppl=5.15, wps=461418, ups=1.06, wpb=434155, bsz=16641, num_updates=35800, lr=0.000334263, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=35421 epoch 022: 388 / 1689 loss=4.001, nll_loss=2.363, ppl=5.15, wps=461418, ups=1.06, wpb=434155, bsz=16641, num_updates=35800, lr=0.000334263, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=35421 epoch 022: 388 / 1689 loss=4.001, nll_loss=2.363, ppl=5.15, wps=461418, ups=1.06, wpb=434155, bsz=16641, num_updates=35800, lr=0.000334263, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=35421 epoch 022: 388 / 1689 loss=4.001, nll_loss=2.363, ppl=5.15, wps=461418, ups=1.06, wpb=434155, bsz=16641, num_updates=35800, lr=0.000334263, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=35421 epoch 022: 388 / 1689 loss=4.001, nll_loss=2.363, ppl=5.15, wps=461418, ups=1.06, wpb=434155, bsz=16641, num_updates=35800, lr=0.000334263, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=35421 epoch 022: 388 / 1689 loss=4.001, nll_loss=2.363, ppl=5.15, wps=461418, ups=1.06, wpb=434155, bsz=16641, num_updates=35800, lr=0.000334263, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=35421 epoch 022: 388 / 1689 loss=4.001, nll_loss=2.363, ppl=5.15, wps=461418, ups=1.06, wpb=434155, bsz=16641, num_updates=35800, lr=0.000334263, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=35421 epoch 022: 488 / 1689 loss=4.002, nll_loss=2.365, ppl=5.15, wps=455311, ups=1.05, wpb=434347, bsz=16599.3, num_updates=35900, lr=0.000333797, gnorm=0.197, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=35516 epoch 022: 488 / 1689 loss=4.002, nll_loss=2.365, ppl=5.15, wps=455311, ups=1.05, wpb=434347, bsz=16599.3, num_updates=35900, lr=0.000333797, gnorm=0.197, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=35516 epoch 022: 488 / 1689 loss=4.002, nll_loss=2.365, ppl=5.15, wps=455311, ups=1.05, wpb=434347, bsz=16599.3, num_updates=35900, lr=0.000333797, gnorm=0.197, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=35516 epoch 022: 488 / 1689 loss=4.002, nll_loss=2.365, ppl=5.15, wps=455311, ups=1.05, wpb=434347, bsz=16599.3, num_updates=35900, lr=0.000333797, gnorm=0.197, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=35516 epoch 022: 488 / 1689 loss=4.002, nll_loss=2.365, ppl=5.15, wps=455311, ups=1.05, wpb=434347, bsz=16599.3, num_updates=35900, lr=0.000333797, gnorm=0.197, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=35516 epoch 022: 488 / 1689 loss=4.002, nll_loss=2.365, ppl=5.15, wps=455311, ups=1.05, wpb=434347, bsz=16599.3, num_updates=35900, lr=0.000333797, gnorm=0.197, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=35516 epoch 022: 488 / 1689 loss=4.002, nll_loss=2.365, ppl=5.15, wps=455311, ups=1.05, wpb=434347, bsz=16599.3, num_updates=35900, lr=0.000333797, gnorm=0.197, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=35516 epoch 022: 488 / 1689 loss=4.002, nll_loss=2.365, ppl=5.15, wps=455311, ups=1.05, wpb=434347, bsz=16599.3, num_updates=35900, lr=0.000333797, gnorm=0.197, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=35516 epoch 022: 488 / 1689 loss=4.002, nll_loss=2.365, ppl=5.15, wps=455311, ups=1.05, wpb=434347, bsz=16599.3, num_updates=35900, lr=0.000333797, gnorm=0.197, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=35516 epoch 022: 488 / 1689 loss=4.002, nll_loss=2.365, ppl=5.15, wps=455311, ups=1.05, wpb=434347, bsz=16599.3, num_updates=35900, lr=0.000333797, gnorm=0.197, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=35516 epoch 022: 488 / 1689 loss=4.002, nll_loss=2.365, ppl=5.15, wps=455311, ups=1.05, wpb=434347, bsz=16599.3, num_updates=35900, lr=0.000333797, gnorm=0.197, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=35516 epoch 022: 488 / 1689 loss=4.002, nll_loss=2.365, ppl=5.15, wps=455311, ups=1.05, wpb=434347, bsz=16599.3, num_updates=35900, lr=0.000333797, gnorm=0.197, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=35516 epoch 022: 488 / 1689 loss=4.002, nll_loss=2.365, ppl=5.15, wps=455311, ups=1.05, wpb=434347, bsz=16599.3, num_updates=35900, lr=0.000333797, gnorm=0.197, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=35516 epoch 022: 488 / 1689 loss=4.002, nll_loss=2.365, ppl=5.15, wps=455311, ups=1.05, wpb=434347, bsz=16599.3, num_updates=35900, lr=0.000333797, gnorm=0.197, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=35516 epoch 022: 488 / 1689 loss=4.002, nll_loss=2.365, ppl=5.15, wps=455311, ups=1.05, wpb=434347, bsz=16599.3, num_updates=35900, lr=0.000333797, gnorm=0.197, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=35516 epoch 022: 488 / 1689 loss=4.002, nll_loss=2.365, ppl=5.15, wps=455311, ups=1.05, wpb=434347, bsz=16599.3, num_updates=35900, lr=0.000333797, gnorm=0.197, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=35516 epoch 022: 488 / 1689 loss=4.002, nll_loss=2.365, ppl=5.15, wps=455311, ups=1.05, wpb=434347, bsz=16599.3, num_updates=35900, lr=0.000333797, gnorm=0.197, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=35516 epoch 022: 488 / 1689 loss=4.002, nll_loss=2.365, ppl=5.15, wps=455311, ups=1.05, wpb=434347, bsz=16599.3, num_updates=35900, lr=0.000333797, gnorm=0.197, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=35516 epoch 022: 488 / 1689 loss=4.002, nll_loss=2.365, ppl=5.15, wps=455311, ups=1.05, wpb=434347, bsz=16599.3, num_updates=35900, lr=0.000333797, gnorm=0.197, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=35516 epoch 022: 488 / 1689 loss=4.002, nll_loss=2.365, ppl=5.15, wps=455311, ups=1.05, wpb=434347, bsz=16599.3, num_updates=35900, lr=0.000333797, gnorm=0.197, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=35516 epoch 022: 488 / 1689 loss=4.002, nll_loss=2.365, ppl=5.15, wps=455311, ups=1.05, wpb=434347, bsz=16599.3, num_updates=35900, lr=0.000333797, gnorm=0.197, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=35516 epoch 022: 488 / 1689 loss=4.002, nll_loss=2.365, ppl=5.15, wps=455311, ups=1.05, wpb=434347, bsz=16599.3, num_updates=35900, lr=0.000333797, gnorm=0.197, clip=0, loss_scale=1, train_wall=94, gb_free=18.9, wall=35516 epoch 022: 588 / 1689 loss=4.003, nll_loss=2.366, ppl=5.16, wps=460051, ups=1.06, wpb=434075, bsz=16644.7, num_updates=36000, lr=0.000333333, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=35610 epoch 022: 588 / 1689 loss=4.003, nll_loss=2.366, ppl=5.16, wps=460051, ups=1.06, wpb=434075, bsz=16644.7, num_updates=36000, lr=0.000333333, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=35610 epoch 022: 588 / 1689 loss=4.003, nll_loss=2.366, ppl=5.16, wps=460051, ups=1.06, wpb=434075, bsz=16644.7, num_updates=36000, lr=0.000333333, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=35610 epoch 022: 588 / 1689 loss=4.003, nll_loss=2.366, ppl=5.16, wps=460051, ups=1.06, wpb=434075, bsz=16644.7, num_updates=36000, lr=0.000333333, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=35610 epoch 022: 588 / 1689 loss=4.003, nll_loss=2.366, ppl=5.16, wps=460051, ups=1.06, wpb=434075, bsz=16644.7, num_updates=36000, lr=0.000333333, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=35610 epoch 022: 588 / 1689 loss=4.003, nll_loss=2.366, ppl=5.16, wps=460051, ups=1.06, wpb=434075, bsz=16644.7, num_updates=36000, lr=0.000333333, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=35610 epoch 022: 588 / 1689 loss=4.003, nll_loss=2.366, ppl=5.16, wps=460051, ups=1.06, wpb=434075, bsz=16644.7, num_updates=36000, lr=0.000333333, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=35610 epoch 022: 588 / 1689 loss=4.003, nll_loss=2.366, ppl=5.16, wps=460051, ups=1.06, wpb=434075, bsz=16644.7, num_updates=36000, lr=0.000333333, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=35610 epoch 022: 588 / 1689 loss=4.003, nll_loss=2.366, ppl=5.16, wps=460051, ups=1.06, wpb=434075, bsz=16644.7, num_updates=36000, lr=0.000333333, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=35610 epoch 022: 588 / 1689 loss=4.003, nll_loss=2.366, ppl=5.16, wps=460051, ups=1.06, wpb=434075, bsz=16644.7, num_updates=36000, lr=0.000333333, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=35610 epoch 022: 588 / 1689 loss=4.003, nll_loss=2.366, ppl=5.16, wps=460051, ups=1.06, wpb=434075, bsz=16644.7, num_updates=36000, lr=0.000333333, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=35610 epoch 022: 588 / 1689 loss=4.003, nll_loss=2.366, ppl=5.16, wps=460051, ups=1.06, wpb=434075, bsz=16644.7, num_updates=36000, lr=0.000333333, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=35610 epoch 022: 588 / 1689 loss=4.003, nll_loss=2.366, ppl=5.16, wps=460051, ups=1.06, wpb=434075, bsz=16644.7, num_updates=36000, lr=0.000333333, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=35610 epoch 022: 588 / 1689 loss=4.003, nll_loss=2.366, ppl=5.16, wps=460051, ups=1.06, wpb=434075, bsz=16644.7, num_updates=36000, lr=0.000333333, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=35610 epoch 022: 588 / 1689 loss=4.003, nll_loss=2.366, ppl=5.16, wps=460051, ups=1.06, wpb=434075, bsz=16644.7, num_updates=36000, lr=0.000333333, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=35610 epoch 022: 588 / 1689 loss=4.003, nll_loss=2.366, ppl=5.16, wps=460051, ups=1.06, wpb=434075, bsz=16644.7, num_updates=36000, lr=0.000333333, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=35610 epoch 022: 588 / 1689 loss=4.003, nll_loss=2.366, ppl=5.16, wps=460051, ups=1.06, wpb=434075, bsz=16644.7, num_updates=36000, lr=0.000333333, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=35610 epoch 022: 588 / 1689 loss=4.003, nll_loss=2.366, ppl=5.16, wps=460051, ups=1.06, wpb=434075, bsz=16644.7, num_updates=36000, lr=0.000333333, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=35610 epoch 022: 588 / 1689 loss=4.003, nll_loss=2.366, ppl=5.16, wps=460051, ups=1.06, wpb=434075, bsz=16644.7, num_updates=36000, lr=0.000333333, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=35610 epoch 022: 588 / 1689 loss=4.003, nll_loss=2.366, ppl=5.16, wps=460051, ups=1.06, wpb=434075, bsz=16644.7, num_updates=36000, lr=0.000333333, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=35610 epoch 022: 588 / 1689 loss=4.003, nll_loss=2.366, ppl=5.16, wps=460051, ups=1.06, wpb=434075, bsz=16644.7, num_updates=36000, lr=0.000333333, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=35610 epoch 022: 588 / 1689 loss=4.003, nll_loss=2.366, ppl=5.16, wps=460051, ups=1.06, wpb=434075, bsz=16644.7, num_updates=36000, lr=0.000333333, gnorm=0.231, clip=0, loss_scale=1, train_wall=93, gb_free=19.3, wall=35610 begin validation on "valid" subset epoch 022 | valid on 'valid' subset | loss 4.272 | nll_loss 2.648 | ppl 6.27 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.249 epoch 022 | valid on 'valid' subset | loss 4.272 | nll_loss 2.648 | ppl 6.27 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.249 epoch 022 | valid on 'valid' subset | loss 4.272 | nll_loss 2.648 | ppl 6.27 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.249 epoch 022 | valid on 'valid' subset | loss 4.272 | nll_loss 2.648 | ppl 6.27 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.249 epoch 022 | valid on 'valid' subset | loss 4.272 | nll_loss 2.648 | ppl 6.27 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.249 epoch 022 | valid on 'valid' subset | loss 4.272 | nll_loss 2.648 | ppl 6.27 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.249 epoch 022 | valid on 'valid' subset | loss 4.272 | nll_loss 2.648 | ppl 6.27 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.249 epoch 022 | valid on 'valid' subset | loss 4.272 | nll_loss 2.648 | ppl 6.27 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.249 epoch 022 | valid on 'valid' subset | loss 4.272 | nll_loss 2.648 | ppl 6.27 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.249 epoch 022 | valid on 'valid' subset | loss 4.272 | nll_loss 2.648 | ppl 6.27 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.249 epoch 022 | valid on 'valid' subset | loss 4.272 | nll_loss 2.648 | ppl 6.27 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.249 epoch 022 | valid on 'valid' subset | loss 4.272 | nll_loss 2.648 | ppl 6.27 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.249 epoch 022 | valid on 'valid' subset | loss 4.272 | nll_loss 2.648 | ppl 6.27 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.249 epoch 022 | valid on 'valid' subset | loss 4.272 | nll_loss 2.648 | ppl 6.27 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.249 epoch 022 | valid on 'valid' subset | loss 4.272 | nll_loss 2.648 | ppl 6.27 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.249 epoch 022 | valid on 'valid' subset | loss 4.272 | nll_loss 2.648 | ppl 6.27 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.249 epoch 022 | valid on 'valid' subset | loss 4.272 | nll_loss 2.648 | ppl 6.27 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.249 epoch 022 | valid on 'valid' subset | loss 4.272 | nll_loss 2.648 | ppl 6.27 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.249 epoch 022 | valid on 'valid' subset | loss 4.272 | nll_loss 2.648 | ppl 6.27 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.249 epoch 022 | valid on 'valid' subset | loss 4.272 | nll_loss 2.648 | ppl 6.27 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.249 epoch 022 | valid on 'valid' subset | loss 4.272 | nll_loss 2.648 | ppl 6.27 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.249 epoch 022 | valid on 'valid' subset | loss 4.272 | nll_loss 2.648 | ppl 6.27 | wps 0 | wpb 42662 | bsz 2032 | num_updates 36000 | best_loss 4.249 epoch 022: 689 / 1689 loss=3.998, nll_loss=2.36, ppl=5.14, wps=400943, ups=0.92, wpb=435245, bsz=16454.1, num_updates=36100, lr=0.000332871, gnorm=0.208, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=35719 epoch 022: 689 / 1689 loss=3.998, nll_loss=2.36, ppl=5.14, wps=400943, ups=0.92, wpb=435245, bsz=16454.1, num_updates=36100, lr=0.000332871, gnorm=0.208, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=35719 epoch 022: 689 / 1689 loss=3.998, nll_loss=2.36, ppl=5.14, wps=400943, ups=0.92, wpb=435245, bsz=16454.1, num_updates=36100, lr=0.000332871, gnorm=0.208, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=35719 epoch 022: 689 / 1689 loss=3.998, nll_loss=2.36, ppl=5.14, wps=400943, ups=0.92, wpb=435245, bsz=16454.1, num_updates=36100, lr=0.000332871, gnorm=0.208, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=35719 epoch 022: 689 / 1689 loss=3.998, nll_loss=2.36, ppl=5.14, wps=400943, ups=0.92, wpb=435245, bsz=16454.1, num_updates=36100, lr=0.000332871, gnorm=0.208, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=35719 epoch 022: 689 / 1689 loss=3.998, nll_loss=2.36, ppl=5.14, wps=400943, ups=0.92, wpb=435245, bsz=16454.1, num_updates=36100, lr=0.000332871, gnorm=0.208, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=35719 epoch 022: 689 / 1689 loss=3.998, nll_loss=2.36, ppl=5.14, wps=400943, ups=0.92, wpb=435245, bsz=16454.1, num_updates=36100, lr=0.000332871, gnorm=0.208, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=35719 epoch 022: 689 / 1689 loss=3.998, nll_loss=2.36, ppl=5.14, wps=400943, ups=0.92, wpb=435245, bsz=16454.1, num_updates=36100, lr=0.000332871, gnorm=0.208, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=35719 epoch 022: 689 / 1689 loss=3.998, nll_loss=2.36, ppl=5.14, wps=400943, ups=0.92, wpb=435245, bsz=16454.1, num_updates=36100, lr=0.000332871, gnorm=0.208, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=35719 epoch 022: 689 / 1689 loss=3.998, nll_loss=2.36, ppl=5.14, wps=400943, ups=0.92, wpb=435245, bsz=16454.1, num_updates=36100, lr=0.000332871, gnorm=0.208, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=35719 epoch 022: 689 / 1689 loss=3.998, nll_loss=2.36, ppl=5.14, wps=400943, ups=0.92, wpb=435245, bsz=16454.1, num_updates=36100, lr=0.000332871, gnorm=0.208, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=35719 epoch 022: 689 / 1689 loss=3.998, nll_loss=2.36, ppl=5.14, wps=400943, ups=0.92, wpb=435245, bsz=16454.1, num_updates=36100, lr=0.000332871, gnorm=0.208, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=35719 epoch 022: 689 / 1689 loss=3.998, nll_loss=2.36, ppl=5.14, wps=400943, ups=0.92, wpb=435245, bsz=16454.1, num_updates=36100, lr=0.000332871, gnorm=0.208, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=35719 epoch 022: 689 / 1689 loss=3.998, nll_loss=2.36, ppl=5.14, wps=400943, ups=0.92, wpb=435245, bsz=16454.1, num_updates=36100, lr=0.000332871, gnorm=0.208, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=35719 epoch 022: 689 / 1689 loss=3.998, nll_loss=2.36, ppl=5.14, wps=400943, ups=0.92, wpb=435245, bsz=16454.1, num_updates=36100, lr=0.000332871, gnorm=0.208, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=35719 epoch 022: 689 / 1689 loss=3.998, nll_loss=2.36, ppl=5.14, wps=400943, ups=0.92, wpb=435245, bsz=16454.1, num_updates=36100, lr=0.000332871, gnorm=0.208, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=35719 epoch 022: 689 / 1689 loss=3.998, nll_loss=2.36, ppl=5.14, wps=400943, ups=0.92, wpb=435245, bsz=16454.1, num_updates=36100, lr=0.000332871, gnorm=0.208, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=35719 epoch 022: 689 / 1689 loss=3.998, nll_loss=2.36, ppl=5.14, wps=400943, ups=0.92, wpb=435245, bsz=16454.1, num_updates=36100, lr=0.000332871, gnorm=0.208, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=35719 epoch 022: 689 / 1689 loss=3.998, nll_loss=2.36, ppl=5.14, wps=400943, ups=0.92, wpb=435245, bsz=16454.1, num_updates=36100, lr=0.000332871, gnorm=0.208, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=35719 epoch 022: 689 / 1689 loss=3.998, nll_loss=2.36, ppl=5.14, wps=400943, ups=0.92, wpb=435245, bsz=16454.1, num_updates=36100, lr=0.000332871, gnorm=0.208, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=35719 epoch 022: 689 / 1689 loss=3.998, nll_loss=2.36, ppl=5.14, wps=400943, ups=0.92, wpb=435245, bsz=16454.1, num_updates=36100, lr=0.000332871, gnorm=0.208, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=35719 epoch 022: 689 / 1689 loss=3.998, nll_loss=2.36, ppl=5.14, wps=400943, ups=0.92, wpb=435245, bsz=16454.1, num_updates=36100, lr=0.000332871, gnorm=0.208, clip=0, loss_scale=1, train_wall=94, gb_free=19.2, wall=35719 epoch 022: 789 / 1689 loss=4.012, nll_loss=2.377, ppl=5.19, wps=460167, ups=1.06, wpb=433416, bsz=16253, num_updates=36200, lr=0.000332411, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=35813 epoch 022: 789 / 1689 loss=4.012, nll_loss=2.377, ppl=5.19, wps=460167, ups=1.06, wpb=433416, bsz=16253, num_updates=36200, lr=0.000332411, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=35813 epoch 022: 789 / 1689 loss=4.012, nll_loss=2.377, ppl=5.19, wps=460167, ups=1.06, wpb=433416, bsz=16253, num_updates=36200, lr=0.000332411, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=35813 epoch 022: 789 / 1689 loss=4.012, nll_loss=2.377, ppl=5.19, wps=460167, ups=1.06, wpb=433416, bsz=16253, num_updates=36200, lr=0.000332411, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=35813 epoch 022: 789 / 1689 loss=4.012, nll_loss=2.377, ppl=5.19, wps=460167, ups=1.06, wpb=433416, bsz=16253, num_updates=36200, lr=0.000332411, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=35813 epoch 022: 789 / 1689 loss=4.012, nll_loss=2.377, ppl=5.19, wps=460167, ups=1.06, wpb=433416, bsz=16253, num_updates=36200, lr=0.000332411, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=35813 epoch 022: 789 / 1689 loss=4.012, nll_loss=2.377, ppl=5.19, wps=460167, ups=1.06, wpb=433416, bsz=16253, num_updates=36200, lr=0.000332411, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=35813 epoch 022: 789 / 1689 loss=4.012, nll_loss=2.377, ppl=5.19, wps=460167, ups=1.06, wpb=433416, bsz=16253, num_updates=36200, lr=0.000332411, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=35813 epoch 022: 789 / 1689 loss=4.012, nll_loss=2.377, ppl=5.19, wps=460167, ups=1.06, wpb=433416, bsz=16253, num_updates=36200, lr=0.000332411, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=35813 epoch 022: 789 / 1689 loss=4.012, nll_loss=2.377, ppl=5.19, wps=460167, ups=1.06, wpb=433416, bsz=16253, num_updates=36200, lr=0.000332411, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=35813 epoch 022: 789 / 1689 loss=4.012, nll_loss=2.377, ppl=5.19, wps=460167, ups=1.06, wpb=433416, bsz=16253, num_updates=36200, lr=0.000332411, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=35813 epoch 022: 789 / 1689 loss=4.012, nll_loss=2.377, ppl=5.19, wps=460167, ups=1.06, wpb=433416, bsz=16253, num_updates=36200, lr=0.000332411, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=35813 epoch 022: 789 / 1689 loss=4.012, nll_loss=2.377, ppl=5.19, wps=460167, ups=1.06, wpb=433416, bsz=16253, num_updates=36200, lr=0.000332411, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=35813 epoch 022: 789 / 1689 loss=4.012, nll_loss=2.377, ppl=5.19, wps=460167, ups=1.06, wpb=433416, bsz=16253, num_updates=36200, lr=0.000332411, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=35813 epoch 022: 789 / 1689 loss=4.012, nll_loss=2.377, ppl=5.19, wps=460167, ups=1.06, wpb=433416, bsz=16253, num_updates=36200, lr=0.000332411, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=35813 epoch 022: 789 / 1689 loss=4.012, nll_loss=2.377, ppl=5.19, wps=460167, ups=1.06, wpb=433416, bsz=16253, num_updates=36200, lr=0.000332411, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=35813 epoch 022: 789 / 1689 loss=4.012, nll_loss=2.377, ppl=5.19, wps=460167, ups=1.06, wpb=433416, bsz=16253, num_updates=36200, lr=0.000332411, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=35813 epoch 022: 789 / 1689 loss=4.012, nll_loss=2.377, ppl=5.19, wps=460167, ups=1.06, wpb=433416, bsz=16253, num_updates=36200, lr=0.000332411, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=35813 epoch 022: 789 / 1689 loss=4.012, nll_loss=2.377, ppl=5.19, wps=460167, ups=1.06, wpb=433416, bsz=16253, num_updates=36200, lr=0.000332411, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=35813 epoch 022: 789 / 1689 loss=4.012, nll_loss=2.377, ppl=5.19, wps=460167, ups=1.06, wpb=433416, bsz=16253, num_updates=36200, lr=0.000332411, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=35813 epoch 022: 789 / 1689 loss=4.012, nll_loss=2.377, ppl=5.19, wps=460167, ups=1.06, wpb=433416, bsz=16253, num_updates=36200, lr=0.000332411, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=35813 epoch 022: 789 / 1689 loss=4.012, nll_loss=2.377, ppl=5.19, wps=460167, ups=1.06, wpb=433416, bsz=16253, num_updates=36200, lr=0.000332411, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=35813 epoch 022: 889 / 1689 loss=4.012, nll_loss=2.376, ppl=5.19, wps=456362, ups=1.05, wpb=434549, bsz=16742.6, num_updates=36300, lr=0.000331953, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=35908 epoch 022: 889 / 1689 loss=4.012, nll_loss=2.376, ppl=5.19, wps=456362, ups=1.05, wpb=434549, bsz=16742.6, num_updates=36300, lr=0.000331953, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=35908 epoch 022: 889 / 1689 loss=4.012, nll_loss=2.376, ppl=5.19, wps=456362, ups=1.05, wpb=434549, bsz=16742.6, num_updates=36300, lr=0.000331953, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=35908 epoch 022: 889 / 1689 loss=4.012, nll_loss=2.376, ppl=5.19, wps=456362, ups=1.05, wpb=434549, bsz=16742.6, num_updates=36300, lr=0.000331953, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=35908 epoch 022: 889 / 1689 loss=4.012, nll_loss=2.376, ppl=5.19, wps=456362, ups=1.05, wpb=434549, bsz=16742.6, num_updates=36300, lr=0.000331953, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=35908 epoch 022: 889 / 1689 loss=4.012, nll_loss=2.376, ppl=5.19, wps=456362, ups=1.05, wpb=434549, bsz=16742.6, num_updates=36300, lr=0.000331953, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=35908 epoch 022: 889 / 1689 loss=4.012, nll_loss=2.376, ppl=5.19, wps=456362, ups=1.05, wpb=434549, bsz=16742.6, num_updates=36300, lr=0.000331953, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=35908 epoch 022: 889 / 1689 loss=4.012, nll_loss=2.376, ppl=5.19, wps=456362, ups=1.05, wpb=434549, bsz=16742.6, num_updates=36300, lr=0.000331953, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=35908 epoch 022: 889 / 1689 loss=4.012, nll_loss=2.376, ppl=5.19, wps=456362, ups=1.05, wpb=434549, bsz=16742.6, num_updates=36300, lr=0.000331953, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=35908 epoch 022: 889 / 1689 loss=4.012, nll_loss=2.376, ppl=5.19, wps=456362, ups=1.05, wpb=434549, bsz=16742.6, num_updates=36300, lr=0.000331953, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=35908 epoch 022: 889 / 1689 loss=4.012, nll_loss=2.376, ppl=5.19, wps=456362, ups=1.05, wpb=434549, bsz=16742.6, num_updates=36300, lr=0.000331953, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=35908 epoch 022: 889 / 1689 loss=4.012, nll_loss=2.376, ppl=5.19, wps=456362, ups=1.05, wpb=434549, bsz=16742.6, num_updates=36300, lr=0.000331953, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=35908 epoch 022: 889 / 1689 loss=4.012, nll_loss=2.376, ppl=5.19, wps=456362, ups=1.05, wpb=434549, bsz=16742.6, num_updates=36300, lr=0.000331953, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=35908 epoch 022: 889 / 1689 loss=4.012, nll_loss=2.376, ppl=5.19, wps=456362, ups=1.05, wpb=434549, bsz=16742.6, num_updates=36300, lr=0.000331953, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=35908 epoch 022: 889 / 1689 loss=4.012, nll_loss=2.376, ppl=5.19, wps=456362, ups=1.05, wpb=434549, bsz=16742.6, num_updates=36300, lr=0.000331953, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=35908 epoch 022: 889 / 1689 loss=4.012, nll_loss=2.376, ppl=5.19, wps=456362, ups=1.05, wpb=434549, bsz=16742.6, num_updates=36300, lr=0.000331953, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=35908 epoch 022: 889 / 1689 loss=4.012, nll_loss=2.376, ppl=5.19, wps=456362, ups=1.05, wpb=434549, bsz=16742.6, num_updates=36300, lr=0.000331953, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=35908 epoch 022: 889 / 1689 loss=4.012, nll_loss=2.376, ppl=5.19, wps=456362, ups=1.05, wpb=434549, bsz=16742.6, num_updates=36300, lr=0.000331953, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=35908 epoch 022: 889 / 1689 loss=4.012, nll_loss=2.376, ppl=5.19, wps=456362, ups=1.05, wpb=434549, bsz=16742.6, num_updates=36300, lr=0.000331953, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=35908 epoch 022: 889 / 1689 loss=4.012, nll_loss=2.376, ppl=5.19, wps=456362, ups=1.05, wpb=434549, bsz=16742.6, num_updates=36300, lr=0.000331953, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=35908 epoch 022: 889 / 1689 loss=4.012, nll_loss=2.376, ppl=5.19, wps=456362, ups=1.05, wpb=434549, bsz=16742.6, num_updates=36300, lr=0.000331953, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=35908 epoch 022: 889 / 1689 loss=4.012, nll_loss=2.376, ppl=5.19, wps=456362, ups=1.05, wpb=434549, bsz=16742.6, num_updates=36300, lr=0.000331953, gnorm=0.218, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=35908 epoch 022: 989 / 1689 loss=4.015, nll_loss=2.38, ppl=5.21, wps=461593, ups=1.06, wpb=435354, bsz=16539, num_updates=36400, lr=0.000331497, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=36003 epoch 022: 989 / 1689 loss=4.015, nll_loss=2.38, ppl=5.21, wps=461593, ups=1.06, wpb=435354, bsz=16539, num_updates=36400, lr=0.000331497, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=36003 epoch 022: 989 / 1689 loss=4.015, nll_loss=2.38, ppl=5.21, wps=461593, ups=1.06, wpb=435354, bsz=16539, num_updates=36400, lr=0.000331497, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=36003 epoch 022: 989 / 1689 loss=4.015, nll_loss=2.38, ppl=5.21, wps=461593, ups=1.06, wpb=435354, bsz=16539, num_updates=36400, lr=0.000331497, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=36003 epoch 022: 989 / 1689 loss=4.015, nll_loss=2.38, ppl=5.21, wps=461593, ups=1.06, wpb=435354, bsz=16539, num_updates=36400, lr=0.000331497, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=36003 epoch 022: 989 / 1689 loss=4.015, nll_loss=2.38, ppl=5.21, wps=461593, ups=1.06, wpb=435354, bsz=16539, num_updates=36400, lr=0.000331497, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=36003 epoch 022: 989 / 1689 loss=4.015, nll_loss=2.38, ppl=5.21, wps=461593, ups=1.06, wpb=435354, bsz=16539, num_updates=36400, lr=0.000331497, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=36003 epoch 022: 989 / 1689 loss=4.015, nll_loss=2.38, ppl=5.21, wps=461593, ups=1.06, wpb=435354, bsz=16539, num_updates=36400, lr=0.000331497, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=36003 epoch 022: 989 / 1689 loss=4.015, nll_loss=2.38, ppl=5.21, wps=461593, ups=1.06, wpb=435354, bsz=16539, num_updates=36400, lr=0.000331497, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=36003 epoch 022: 989 / 1689 loss=4.015, nll_loss=2.38, ppl=5.21, wps=461593, ups=1.06, wpb=435354, bsz=16539, num_updates=36400, lr=0.000331497, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=36003 epoch 022: 989 / 1689 loss=4.015, nll_loss=2.38, ppl=5.21, wps=461593, ups=1.06, wpb=435354, bsz=16539, num_updates=36400, lr=0.000331497, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=36003 epoch 022: 989 / 1689 loss=4.015, nll_loss=2.38, ppl=5.21, wps=461593, ups=1.06, wpb=435354, bsz=16539, num_updates=36400, lr=0.000331497, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=36003 epoch 022: 989 / 1689 loss=4.015, nll_loss=2.38, ppl=5.21, wps=461593, ups=1.06, wpb=435354, bsz=16539, num_updates=36400, lr=0.000331497, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=36003 epoch 022: 989 / 1689 loss=4.015, nll_loss=2.38, ppl=5.21, wps=461593, ups=1.06, wpb=435354, bsz=16539, num_updates=36400, lr=0.000331497, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=36003 epoch 022: 989 / 1689 loss=4.015, nll_loss=2.38, ppl=5.21, wps=461593, ups=1.06, wpb=435354, bsz=16539, num_updates=36400, lr=0.000331497, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=36003 epoch 022: 989 / 1689 loss=4.015, nll_loss=2.38, ppl=5.21, wps=461593, ups=1.06, wpb=435354, bsz=16539, num_updates=36400, lr=0.000331497, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=36003 epoch 022: 989 / 1689 loss=4.015, nll_loss=2.38, ppl=5.21, wps=461593, ups=1.06, wpb=435354, bsz=16539, num_updates=36400, lr=0.000331497, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=36003 epoch 022: 989 / 1689 loss=4.015, nll_loss=2.38, ppl=5.21, wps=461593, ups=1.06, wpb=435354, bsz=16539, num_updates=36400, lr=0.000331497, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=36003 epoch 022: 989 / 1689 loss=4.015, nll_loss=2.38, ppl=5.21, wps=461593, ups=1.06, wpb=435354, bsz=16539, num_updates=36400, lr=0.000331497, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=36003 epoch 022: 989 / 1689 loss=4.015, nll_loss=2.38, ppl=5.21, wps=461593, ups=1.06, wpb=435354, bsz=16539, num_updates=36400, lr=0.000331497, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=36003 epoch 022: 989 / 1689 loss=4.015, nll_loss=2.38, ppl=5.21, wps=461593, ups=1.06, wpb=435354, bsz=16539, num_updates=36400, lr=0.000331497, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=36003 epoch 022: 989 / 1689 loss=4.015, nll_loss=2.38, ppl=5.21, wps=461593, ups=1.06, wpb=435354, bsz=16539, num_updates=36400, lr=0.000331497, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=36003 epoch 022: 1089 / 1689 loss=4.011, nll_loss=2.375, ppl=5.19, wps=460081, ups=1.06, wpb=435440, bsz=16541.4, num_updates=36500, lr=0.000331042, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=36097 epoch 022: 1089 / 1689 loss=4.011, nll_loss=2.375, ppl=5.19, wps=460081, ups=1.06, wpb=435440, bsz=16541.4, num_updates=36500, lr=0.000331042, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=36097 epoch 022: 1089 / 1689 loss=4.011, nll_loss=2.375, ppl=5.19, wps=460081, ups=1.06, wpb=435440, bsz=16541.4, num_updates=36500, lr=0.000331042, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=36097 epoch 022: 1089 / 1689 loss=4.011, nll_loss=2.375, ppl=5.19, wps=460081, ups=1.06, wpb=435440, bsz=16541.4, num_updates=36500, lr=0.000331042, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=36097 epoch 022: 1089 / 1689 loss=4.011, nll_loss=2.375, ppl=5.19, wps=460081, ups=1.06, wpb=435440, bsz=16541.4, num_updates=36500, lr=0.000331042, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=36097 epoch 022: 1089 / 1689 loss=4.011, nll_loss=2.375, ppl=5.19, wps=460081, ups=1.06, wpb=435440, bsz=16541.4, num_updates=36500, lr=0.000331042, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=36097 epoch 022: 1089 / 1689 loss=4.011, nll_loss=2.375, ppl=5.19, wps=460081, ups=1.06, wpb=435440, bsz=16541.4, num_updates=36500, lr=0.000331042, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=36097 epoch 022: 1089 / 1689 loss=4.011, nll_loss=2.375, ppl=5.19, wps=460081, ups=1.06, wpb=435440, bsz=16541.4, num_updates=36500, lr=0.000331042, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=36097 epoch 022: 1089 / 1689 loss=4.011, nll_loss=2.375, ppl=5.19, wps=460081, ups=1.06, wpb=435440, bsz=16541.4, num_updates=36500, lr=0.000331042, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=36097 epoch 022: 1089 / 1689 loss=4.011, nll_loss=2.375, ppl=5.19, wps=460081, ups=1.06, wpb=435440, bsz=16541.4, num_updates=36500, lr=0.000331042, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=36097 epoch 022: 1089 / 1689 loss=4.011, nll_loss=2.375, ppl=5.19, wps=460081, ups=1.06, wpb=435440, bsz=16541.4, num_updates=36500, lr=0.000331042, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=36097 epoch 022: 1089 / 1689 loss=4.011, nll_loss=2.375, ppl=5.19, wps=460081, ups=1.06, wpb=435440, bsz=16541.4, num_updates=36500, lr=0.000331042, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=36097 epoch 022: 1089 / 1689 loss=4.011, nll_loss=2.375, ppl=5.19, wps=460081, ups=1.06, wpb=435440, bsz=16541.4, num_updates=36500, lr=0.000331042, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=36097 epoch 022: 1089 / 1689 loss=4.011, nll_loss=2.375, ppl=5.19, wps=460081, ups=1.06, wpb=435440, bsz=16541.4, num_updates=36500, lr=0.000331042, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=36097 epoch 022: 1089 / 1689 loss=4.011, nll_loss=2.375, ppl=5.19, wps=460081, ups=1.06, wpb=435440, bsz=16541.4, num_updates=36500, lr=0.000331042, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=36097 epoch 022: 1089 / 1689 loss=4.011, nll_loss=2.375, ppl=5.19, wps=460081, ups=1.06, wpb=435440, bsz=16541.4, num_updates=36500, lr=0.000331042, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=36097 epoch 022: 1089 / 1689 loss=4.011, nll_loss=2.375, ppl=5.19, wps=460081, ups=1.06, wpb=435440, bsz=16541.4, num_updates=36500, lr=0.000331042, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=36097 epoch 022: 1089 / 1689 loss=4.011, nll_loss=2.375, ppl=5.19, wps=460081, ups=1.06, wpb=435440, bsz=16541.4, num_updates=36500, lr=0.000331042, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=36097 epoch 022: 1089 / 1689 loss=4.011, nll_loss=2.375, ppl=5.19, wps=460081, ups=1.06, wpb=435440, bsz=16541.4, num_updates=36500, lr=0.000331042, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=36097 epoch 022: 1089 / 1689 loss=4.011, nll_loss=2.375, ppl=5.19, wps=460081, ups=1.06, wpb=435440, bsz=16541.4, num_updates=36500, lr=0.000331042, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=36097 epoch 022: 1089 / 1689 loss=4.011, nll_loss=2.375, ppl=5.19, wps=460081, ups=1.06, wpb=435440, bsz=16541.4, num_updates=36500, lr=0.000331042, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=36097 epoch 022: 1089 / 1689 loss=4.011, nll_loss=2.375, ppl=5.19, wps=460081, ups=1.06, wpb=435440, bsz=16541.4, num_updates=36500, lr=0.000331042, gnorm=0.207, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=36097 epoch 022: 1190 / 1689 loss=4.024, nll_loss=2.39, ppl=5.24, wps=455824, ups=1.05, wpb=433793, bsz=16395.6, num_updates=36600, lr=0.00033059, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=36193 epoch 022: 1190 / 1689 loss=4.024, nll_loss=2.39, ppl=5.24, wps=455824, ups=1.05, wpb=433793, bsz=16395.6, num_updates=36600, lr=0.00033059, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=36193 epoch 022: 1190 / 1689 loss=4.024, nll_loss=2.39, ppl=5.24, wps=455824, ups=1.05, wpb=433793, bsz=16395.6, num_updates=36600, lr=0.00033059, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=36193 epoch 022: 1190 / 1689 loss=4.024, nll_loss=2.39, ppl=5.24, wps=455824, ups=1.05, wpb=433793, bsz=16395.6, num_updates=36600, lr=0.00033059, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=36193 epoch 022: 1190 / 1689 loss=4.024, nll_loss=2.39, ppl=5.24, wps=455824, ups=1.05, wpb=433793, bsz=16395.6, num_updates=36600, lr=0.00033059, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=36193 epoch 022: 1190 / 1689 loss=4.024, nll_loss=2.39, ppl=5.24, wps=455824, ups=1.05, wpb=433793, bsz=16395.6, num_updates=36600, lr=0.00033059, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=36193 epoch 022: 1190 / 1689 loss=4.024, nll_loss=2.39, ppl=5.24, wps=455824, ups=1.05, wpb=433793, bsz=16395.6, num_updates=36600, lr=0.00033059, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=36193 epoch 022: 1190 / 1689 loss=4.024, nll_loss=2.39, ppl=5.24, wps=455824, ups=1.05, wpb=433793, bsz=16395.6, num_updates=36600, lr=0.00033059, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=36193 epoch 022: 1190 / 1689 loss=4.024, nll_loss=2.39, ppl=5.24, wps=455824, ups=1.05, wpb=433793, bsz=16395.6, num_updates=36600, lr=0.00033059, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=36193 epoch 022: 1190 / 1689 loss=4.024, nll_loss=2.39, ppl=5.24, wps=455824, ups=1.05, wpb=433793, bsz=16395.6, num_updates=36600, lr=0.00033059, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=36193 epoch 022: 1190 / 1689 loss=4.024, nll_loss=2.39, ppl=5.24, wps=455824, ups=1.05, wpb=433793, bsz=16395.6, num_updates=36600, lr=0.00033059, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=36193 epoch 022: 1190 / 1689 loss=4.024, nll_loss=2.39, ppl=5.24, wps=455824, ups=1.05, wpb=433793, bsz=16395.6, num_updates=36600, lr=0.00033059, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=36193 epoch 022: 1190 / 1689 loss=4.024, nll_loss=2.39, ppl=5.24, wps=455824, ups=1.05, wpb=433793, bsz=16395.6, num_updates=36600, lr=0.00033059, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=36193 epoch 022: 1190 / 1689 loss=4.024, nll_loss=2.39, ppl=5.24, wps=455824, ups=1.05, wpb=433793, bsz=16395.6, num_updates=36600, lr=0.00033059, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=36193 epoch 022: 1190 / 1689 loss=4.024, nll_loss=2.39, ppl=5.24, wps=455824, ups=1.05, wpb=433793, bsz=16395.6, num_updates=36600, lr=0.00033059, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=36193 epoch 022: 1190 / 1689 loss=4.024, nll_loss=2.39, ppl=5.24, wps=455824, ups=1.05, wpb=433793, bsz=16395.6, num_updates=36600, lr=0.00033059, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=36193 epoch 022: 1190 / 1689 loss=4.024, nll_loss=2.39, ppl=5.24, wps=455824, ups=1.05, wpb=433793, bsz=16395.6, num_updates=36600, lr=0.00033059, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=36193 epoch 022: 1190 / 1689 loss=4.024, nll_loss=2.39, ppl=5.24, wps=455824, ups=1.05, wpb=433793, bsz=16395.6, num_updates=36600, lr=0.00033059, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=36193 epoch 022: 1190 / 1689 loss=4.024, nll_loss=2.39, ppl=5.24, wps=455824, ups=1.05, wpb=433793, bsz=16395.6, num_updates=36600, lr=0.00033059, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=36193 epoch 022: 1190 / 1689 loss=4.024, nll_loss=2.39, ppl=5.24, wps=455824, ups=1.05, wpb=433793, bsz=16395.6, num_updates=36600, lr=0.00033059, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=36193 epoch 022: 1190 / 1689 loss=4.024, nll_loss=2.39, ppl=5.24, wps=455824, ups=1.05, wpb=433793, bsz=16395.6, num_updates=36600, lr=0.00033059, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=36193 epoch 022: 1190 / 1689 loss=4.024, nll_loss=2.39, ppl=5.24, wps=455824, ups=1.05, wpb=433793, bsz=16395.6, num_updates=36600, lr=0.00033059, gnorm=0.213, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=36193 epoch 022: 1290 / 1689 loss=4.012, nll_loss=2.377, ppl=5.19, wps=458746, ups=1.06, wpb=432234, bsz=16551, num_updates=36700, lr=0.000330139, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=36287 epoch 022: 1290 / 1689 loss=4.012, nll_loss=2.377, ppl=5.19, wps=458746, ups=1.06, wpb=432234, bsz=16551, num_updates=36700, lr=0.000330139, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=36287 epoch 022: 1290 / 1689 loss=4.012, nll_loss=2.377, ppl=5.19, wps=458746, ups=1.06, wpb=432234, bsz=16551, num_updates=36700, lr=0.000330139, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=36287 epoch 022: 1290 / 1689 loss=4.012, nll_loss=2.377, ppl=5.19, wps=458746, ups=1.06, wpb=432234, bsz=16551, num_updates=36700, lr=0.000330139, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=36287 epoch 022: 1290 / 1689 loss=4.012, nll_loss=2.377, ppl=5.19, wps=458746, ups=1.06, wpb=432234, bsz=16551, num_updates=36700, lr=0.000330139, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=36287 epoch 022: 1290 / 1689 loss=4.012, nll_loss=2.377, ppl=5.19, wps=458746, ups=1.06, wpb=432234, bsz=16551, num_updates=36700, lr=0.000330139, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=36287 epoch 022: 1290 / 1689 loss=4.012, nll_loss=2.377, ppl=5.19, wps=458746, ups=1.06, wpb=432234, bsz=16551, num_updates=36700, lr=0.000330139, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=36287 epoch 022: 1290 / 1689 loss=4.012, nll_loss=2.377, ppl=5.19, wps=458746, ups=1.06, wpb=432234, bsz=16551, num_updates=36700, lr=0.000330139, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=36287 epoch 022: 1290 / 1689 loss=4.012, nll_loss=2.377, ppl=5.19, wps=458746, ups=1.06, wpb=432234, bsz=16551, num_updates=36700, lr=0.000330139, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=36287 epoch 022: 1290 / 1689 loss=4.012, nll_loss=2.377, ppl=5.19, wps=458746, ups=1.06, wpb=432234, bsz=16551, num_updates=36700, lr=0.000330139, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=36287 epoch 022: 1290 / 1689 loss=4.012, nll_loss=2.377, ppl=5.19, wps=458746, ups=1.06, wpb=432234, bsz=16551, num_updates=36700, lr=0.000330139, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=36287 epoch 022: 1290 / 1689 loss=4.012, nll_loss=2.377, ppl=5.19, wps=458746, ups=1.06, wpb=432234, bsz=16551, num_updates=36700, lr=0.000330139, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=36287 epoch 022: 1290 / 1689 loss=4.012, nll_loss=2.377, ppl=5.19, wps=458746, ups=1.06, wpb=432234, bsz=16551, num_updates=36700, lr=0.000330139, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=36287 epoch 022: 1290 / 1689 loss=4.012, nll_loss=2.377, ppl=5.19, wps=458746, ups=1.06, wpb=432234, bsz=16551, num_updates=36700, lr=0.000330139, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=36287 epoch 022: 1290 / 1689 loss=4.012, nll_loss=2.377, ppl=5.19, wps=458746, ups=1.06, wpb=432234, bsz=16551, num_updates=36700, lr=0.000330139, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=36287 epoch 022: 1290 / 1689 loss=4.012, nll_loss=2.377, ppl=5.19, wps=458746, ups=1.06, wpb=432234, bsz=16551, num_updates=36700, lr=0.000330139, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=36287 epoch 022: 1290 / 1689 loss=4.012, nll_loss=2.377, ppl=5.19, wps=458746, ups=1.06, wpb=432234, bsz=16551, num_updates=36700, lr=0.000330139, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=36287 epoch 022: 1290 / 1689 loss=4.012, nll_loss=2.377, ppl=5.19, wps=458746, ups=1.06, wpb=432234, bsz=16551, num_updates=36700, lr=0.000330139, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=36287 epoch 022: 1290 / 1689 loss=4.012, nll_loss=2.377, ppl=5.19, wps=458746, ups=1.06, wpb=432234, bsz=16551, num_updates=36700, lr=0.000330139, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=36287 epoch 022: 1290 / 1689 loss=4.012, nll_loss=2.377, ppl=5.19, wps=458746, ups=1.06, wpb=432234, bsz=16551, num_updates=36700, lr=0.000330139, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=36287 epoch 022: 1290 / 1689 loss=4.012, nll_loss=2.377, ppl=5.19, wps=458746, ups=1.06, wpb=432234, bsz=16551, num_updates=36700, lr=0.000330139, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=36287 epoch 022: 1290 / 1689 loss=4.012, nll_loss=2.377, ppl=5.19, wps=458746, ups=1.06, wpb=432234, bsz=16551, num_updates=36700, lr=0.000330139, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=36287 epoch 022: 1391 / 1689 loss=4.018, nll_loss=2.384, ppl=5.22, wps=457210, ups=1.05, wpb=434553, bsz=16805.4, num_updates=36800, lr=0.00032969, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=36382 epoch 022: 1391 / 1689 loss=4.018, nll_loss=2.384, ppl=5.22, wps=457210, ups=1.05, wpb=434553, bsz=16805.4, num_updates=36800, lr=0.00032969, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=36382 epoch 022: 1391 / 1689 loss=4.018, nll_loss=2.384, ppl=5.22, wps=457210, ups=1.05, wpb=434553, bsz=16805.4, num_updates=36800, lr=0.00032969, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=36382 epoch 022: 1391 / 1689 loss=4.018, nll_loss=2.384, ppl=5.22, wps=457210, ups=1.05, wpb=434553, bsz=16805.4, num_updates=36800, lr=0.00032969, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=36382 epoch 022: 1391 / 1689 loss=4.018, nll_loss=2.384, ppl=5.22, wps=457210, ups=1.05, wpb=434553, bsz=16805.4, num_updates=36800, lr=0.00032969, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=36382 epoch 022: 1391 / 1689 loss=4.018, nll_loss=2.384, ppl=5.22, wps=457210, ups=1.05, wpb=434553, bsz=16805.4, num_updates=36800, lr=0.00032969, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=36382 epoch 022: 1391 / 1689 loss=4.018, nll_loss=2.384, ppl=5.22, wps=457210, ups=1.05, wpb=434553, bsz=16805.4, num_updates=36800, lr=0.00032969, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=36382 epoch 022: 1391 / 1689 loss=4.018, nll_loss=2.384, ppl=5.22, wps=457210, ups=1.05, wpb=434553, bsz=16805.4, num_updates=36800, lr=0.00032969, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=36382 epoch 022: 1391 / 1689 loss=4.018, nll_loss=2.384, ppl=5.22, wps=457210, ups=1.05, wpb=434553, bsz=16805.4, num_updates=36800, lr=0.00032969, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=36382 epoch 022: 1391 / 1689 loss=4.018, nll_loss=2.384, ppl=5.22, wps=457210, ups=1.05, wpb=434553, bsz=16805.4, num_updates=36800, lr=0.00032969, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=36382 epoch 022: 1391 / 1689 loss=4.018, nll_loss=2.384, ppl=5.22, wps=457210, ups=1.05, wpb=434553, bsz=16805.4, num_updates=36800, lr=0.00032969, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=36382 epoch 022: 1391 / 1689 loss=4.018, nll_loss=2.384, ppl=5.22, wps=457210, ups=1.05, wpb=434553, bsz=16805.4, num_updates=36800, lr=0.00032969, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=36382 epoch 022: 1391 / 1689 loss=4.018, nll_loss=2.384, ppl=5.22, wps=457210, ups=1.05, wpb=434553, bsz=16805.4, num_updates=36800, lr=0.00032969, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=36382 epoch 022: 1391 / 1689 loss=4.018, nll_loss=2.384, ppl=5.22, wps=457210, ups=1.05, wpb=434553, bsz=16805.4, num_updates=36800, lr=0.00032969, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=36382 epoch 022: 1391 / 1689 loss=4.018, nll_loss=2.384, ppl=5.22, wps=457210, ups=1.05, wpb=434553, bsz=16805.4, num_updates=36800, lr=0.00032969, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=36382 epoch 022: 1391 / 1689 loss=4.018, nll_loss=2.384, ppl=5.22, wps=457210, ups=1.05, wpb=434553, bsz=16805.4, num_updates=36800, lr=0.00032969, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=36382 epoch 022: 1391 / 1689 loss=4.018, nll_loss=2.384, ppl=5.22, wps=457210, ups=1.05, wpb=434553, bsz=16805.4, num_updates=36800, lr=0.00032969, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=36382 epoch 022: 1391 / 1689 loss=4.018, nll_loss=2.384, ppl=5.22, wps=457210, ups=1.05, wpb=434553, bsz=16805.4, num_updates=36800, lr=0.00032969, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=36382 epoch 022: 1391 / 1689 loss=4.018, nll_loss=2.384, ppl=5.22, wps=457210, ups=1.05, wpb=434553, bsz=16805.4, num_updates=36800, lr=0.00032969, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=36382 epoch 022: 1391 / 1689 loss=4.018, nll_loss=2.384, ppl=5.22, wps=457210, ups=1.05, wpb=434553, bsz=16805.4, num_updates=36800, lr=0.00032969, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=36382 epoch 022: 1391 / 1689 loss=4.018, nll_loss=2.384, ppl=5.22, wps=457210, ups=1.05, wpb=434553, bsz=16805.4, num_updates=36800, lr=0.00032969, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=36382 epoch 022: 1391 / 1689 loss=4.018, nll_loss=2.384, ppl=5.22, wps=457210, ups=1.05, wpb=434553, bsz=16805.4, num_updates=36800, lr=0.00032969, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=36382 epoch 022: 1491 / 1689 loss=4.015, nll_loss=2.38, ppl=5.2, wps=459691, ups=1.06, wpb=433428, bsz=16281.7, num_updates=36900, lr=0.000329243, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=36476 epoch 022: 1491 / 1689 loss=4.015, nll_loss=2.38, ppl=5.2, wps=459691, ups=1.06, wpb=433428, bsz=16281.7, num_updates=36900, lr=0.000329243, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=36476 epoch 022: 1491 / 1689 loss=4.015, nll_loss=2.38, ppl=5.2, wps=459691, ups=1.06, wpb=433428, bsz=16281.7, num_updates=36900, lr=0.000329243, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=36476 epoch 022: 1491 / 1689 loss=4.015, nll_loss=2.38, ppl=5.2, wps=459691, ups=1.06, wpb=433428, bsz=16281.7, num_updates=36900, lr=0.000329243, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=36476 epoch 022: 1491 / 1689 loss=4.015, nll_loss=2.38, ppl=5.2, wps=459691, ups=1.06, wpb=433428, bsz=16281.7, num_updates=36900, lr=0.000329243, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=36476 epoch 022: 1491 / 1689 loss=4.015, nll_loss=2.38, ppl=5.2, wps=459691, ups=1.06, wpb=433428, bsz=16281.7, num_updates=36900, lr=0.000329243, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=36476 epoch 022: 1491 / 1689 loss=4.015, nll_loss=2.38, ppl=5.2, wps=459691, ups=1.06, wpb=433428, bsz=16281.7, num_updates=36900, lr=0.000329243, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=36476 epoch 022: 1491 / 1689 loss=4.015, nll_loss=2.38, ppl=5.2, wps=459691, ups=1.06, wpb=433428, bsz=16281.7, num_updates=36900, lr=0.000329243, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=36476 epoch 022: 1491 / 1689 loss=4.015, nll_loss=2.38, ppl=5.2, wps=459691, ups=1.06, wpb=433428, bsz=16281.7, num_updates=36900, lr=0.000329243, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=36476 epoch 022: 1491 / 1689 loss=4.015, nll_loss=2.38, ppl=5.2, wps=459691, ups=1.06, wpb=433428, bsz=16281.7, num_updates=36900, lr=0.000329243, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=36476 epoch 022: 1491 / 1689 loss=4.015, nll_loss=2.38, ppl=5.2, wps=459691, ups=1.06, wpb=433428, bsz=16281.7, num_updates=36900, lr=0.000329243, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=36476 epoch 022: 1491 / 1689 loss=4.015, nll_loss=2.38, ppl=5.2, wps=459691, ups=1.06, wpb=433428, bsz=16281.7, num_updates=36900, lr=0.000329243, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=36476 epoch 022: 1491 / 1689 loss=4.015, nll_loss=2.38, ppl=5.2, wps=459691, ups=1.06, wpb=433428, bsz=16281.7, num_updates=36900, lr=0.000329243, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=36476 epoch 022: 1491 / 1689 loss=4.015, nll_loss=2.38, ppl=5.2, wps=459691, ups=1.06, wpb=433428, bsz=16281.7, num_updates=36900, lr=0.000329243, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=36476 epoch 022: 1491 / 1689 loss=4.015, nll_loss=2.38, ppl=5.2, wps=459691, ups=1.06, wpb=433428, bsz=16281.7, num_updates=36900, lr=0.000329243, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=36476 epoch 022: 1491 / 1689 loss=4.015, nll_loss=2.38, ppl=5.2, wps=459691, ups=1.06, wpb=433428, bsz=16281.7, num_updates=36900, lr=0.000329243, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=36476 epoch 022: 1491 / 1689 loss=4.015, nll_loss=2.38, ppl=5.2, wps=459691, ups=1.06, wpb=433428, bsz=16281.7, num_updates=36900, lr=0.000329243, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=36476 epoch 022: 1491 / 1689 loss=4.015, nll_loss=2.38, ppl=5.2, wps=459691, ups=1.06, wpb=433428, bsz=16281.7, num_updates=36900, lr=0.000329243, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=36476 epoch 022: 1491 / 1689 loss=4.015, nll_loss=2.38, ppl=5.2, wps=459691, ups=1.06, wpb=433428, bsz=16281.7, num_updates=36900, lr=0.000329243, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=36476 epoch 022: 1491 / 1689 loss=4.015, nll_loss=2.38, ppl=5.2, wps=459691, ups=1.06, wpb=433428, bsz=16281.7, num_updates=36900, lr=0.000329243, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=36476 epoch 022: 1491 / 1689 loss=4.015, nll_loss=2.38, ppl=5.2, wps=459691, ups=1.06, wpb=433428, bsz=16281.7, num_updates=36900, lr=0.000329243, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=36476 epoch 022: 1491 / 1689 loss=4.015, nll_loss=2.38, ppl=5.2, wps=459691, ups=1.06, wpb=433428, bsz=16281.7, num_updates=36900, lr=0.000329243, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=36476 epoch 022: 1591 / 1689 loss=4.033, nll_loss=2.4, ppl=5.28, wps=462034, ups=1.07, wpb=433225, bsz=16573.7, num_updates=37000, lr=0.000328798, gnorm=0.203, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=36570 epoch 022: 1591 / 1689 loss=4.033, nll_loss=2.4, ppl=5.28, wps=462034, ups=1.07, wpb=433225, bsz=16573.7, num_updates=37000, lr=0.000328798, gnorm=0.203, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=36570 epoch 022: 1591 / 1689 loss=4.033, nll_loss=2.4, ppl=5.28, wps=462034, ups=1.07, wpb=433225, bsz=16573.7, num_updates=37000, lr=0.000328798, gnorm=0.203, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=36570 epoch 022: 1591 / 1689 loss=4.033, nll_loss=2.4, ppl=5.28, wps=462034, ups=1.07, wpb=433225, bsz=16573.7, num_updates=37000, lr=0.000328798, gnorm=0.203, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=36570 epoch 022: 1591 / 1689 loss=4.033, nll_loss=2.4, ppl=5.28, wps=462034, ups=1.07, wpb=433225, bsz=16573.7, num_updates=37000, lr=0.000328798, gnorm=0.203, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=36570 epoch 022: 1591 / 1689 loss=4.033, nll_loss=2.4, ppl=5.28, wps=462034, ups=1.07, wpb=433225, bsz=16573.7, num_updates=37000, lr=0.000328798, gnorm=0.203, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=36570 epoch 022: 1591 / 1689 loss=4.033, nll_loss=2.4, ppl=5.28, wps=462034, ups=1.07, wpb=433225, bsz=16573.7, num_updates=37000, lr=0.000328798, gnorm=0.203, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=36570 epoch 022: 1591 / 1689 loss=4.033, nll_loss=2.4, ppl=5.28, wps=462034, ups=1.07, wpb=433225, bsz=16573.7, num_updates=37000, lr=0.000328798, gnorm=0.203, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=36570 epoch 022: 1591 / 1689 loss=4.033, nll_loss=2.4, ppl=5.28, wps=462034, ups=1.07, wpb=433225, bsz=16573.7, num_updates=37000, lr=0.000328798, gnorm=0.203, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=36570 epoch 022: 1591 / 1689 loss=4.033, nll_loss=2.4, ppl=5.28, wps=462034, ups=1.07, wpb=433225, bsz=16573.7, num_updates=37000, lr=0.000328798, gnorm=0.203, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=36570 epoch 022: 1591 / 1689 loss=4.033, nll_loss=2.4, ppl=5.28, wps=462034, ups=1.07, wpb=433225, bsz=16573.7, num_updates=37000, lr=0.000328798, gnorm=0.203, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=36570 epoch 022: 1591 / 1689 loss=4.033, nll_loss=2.4, ppl=5.28, wps=462034, ups=1.07, wpb=433225, bsz=16573.7, num_updates=37000, lr=0.000328798, gnorm=0.203, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=36570 epoch 022: 1591 / 1689 loss=4.033, nll_loss=2.4, ppl=5.28, wps=462034, ups=1.07, wpb=433225, bsz=16573.7, num_updates=37000, lr=0.000328798, gnorm=0.203, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=36570 epoch 022: 1591 / 1689 loss=4.033, nll_loss=2.4, ppl=5.28, wps=462034, ups=1.07, wpb=433225, bsz=16573.7, num_updates=37000, lr=0.000328798, gnorm=0.203, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=36570 epoch 022: 1591 / 1689 loss=4.033, nll_loss=2.4, ppl=5.28, wps=462034, ups=1.07, wpb=433225, bsz=16573.7, num_updates=37000, lr=0.000328798, gnorm=0.203, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=36570 epoch 022: 1591 / 1689 loss=4.033, nll_loss=2.4, ppl=5.28, wps=462034, ups=1.07, wpb=433225, bsz=16573.7, num_updates=37000, lr=0.000328798, gnorm=0.203, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=36570 epoch 022: 1591 / 1689 loss=4.033, nll_loss=2.4, ppl=5.28, wps=462034, ups=1.07, wpb=433225, bsz=16573.7, num_updates=37000, lr=0.000328798, gnorm=0.203, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=36570 epoch 022: 1591 / 1689 loss=4.033, nll_loss=2.4, ppl=5.28, wps=462034, ups=1.07, wpb=433225, bsz=16573.7, num_updates=37000, lr=0.000328798, gnorm=0.203, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=36570 epoch 022: 1591 / 1689 loss=4.033, nll_loss=2.4, ppl=5.28, wps=462034, ups=1.07, wpb=433225, bsz=16573.7, num_updates=37000, lr=0.000328798, gnorm=0.203, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=36570 epoch 022: 1591 / 1689 loss=4.033, nll_loss=2.4, ppl=5.28, wps=462034, ups=1.07, wpb=433225, bsz=16573.7, num_updates=37000, lr=0.000328798, gnorm=0.203, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=36570 epoch 022: 1591 / 1689 loss=4.033, nll_loss=2.4, ppl=5.28, wps=462034, ups=1.07, wpb=433225, bsz=16573.7, num_updates=37000, lr=0.000328798, gnorm=0.203, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=36570 epoch 022: 1591 / 1689 loss=4.033, nll_loss=2.4, ppl=5.28, wps=462034, ups=1.07, wpb=433225, bsz=16573.7, num_updates=37000, lr=0.000328798, gnorm=0.203, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=36570 begin validation on "valid" subset epoch 022 | valid on 'valid' subset | loss 4.259 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.249 epoch 022 | valid on 'valid' subset | loss 4.259 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.249 epoch 022 | valid on 'valid' subset | loss 4.259 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.249 epoch 022 | valid on 'valid' subset | loss 4.259 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.249 epoch 022 | valid on 'valid' subset | loss 4.259 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.249 epoch 022 | valid on 'valid' subset | loss 4.259 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.249 epoch 022 | valid on 'valid' subset | loss 4.259 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.249 epoch 022 | valid on 'valid' subset | loss 4.259 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.249 epoch 022 | valid on 'valid' subset | loss 4.259 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.249 epoch 022 | valid on 'valid' subset | loss 4.259 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.249 epoch 022 | valid on 'valid' subset | loss 4.259 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.249 epoch 022 | valid on 'valid' subset | loss 4.259 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.249 epoch 022 | valid on 'valid' subset | loss 4.259 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.249 epoch 022 | valid on 'valid' subset | loss 4.259 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.249 epoch 022 | valid on 'valid' subset | loss 4.259 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.249 epoch 022 | valid on 'valid' subset | loss 4.259 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.249 epoch 022 | valid on 'valid' subset | loss 4.259 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.249 epoch 022 | valid on 'valid' subset | loss 4.259 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.249 epoch 022 | valid on 'valid' subset | loss 4.259 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.249 epoch 022 | valid on 'valid' subset | loss 4.259 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.249 epoch 022 | valid on 'valid' subset | loss 4.259 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.249 epoch 022 | valid on 'valid' subset | loss 4.259 | nll_loss 2.639 | ppl 6.23 | wps 0 | wpb 42662 | bsz 2032 | num_updates 37000 | best_loss 4.249 end of epoch 22 (average epoch stats below) epoch 022 | loss 4.008 | nll_loss 2.372 | ppl 5.18 | wps 449325 | ups 1.04 | wpb 433538 | bsz 16506.5 | num_updates 37098 | lr 0.000328363 | gnorm 0.211 | clip 0 | loss_scale 0.5 | train_wall 1573 | gb_free 21 | wall 36681 epoch 022 | loss 4.008 | nll_loss 2.372 | ppl 5.18 | wps 449325 | ups 1.04 | wpb 433538 | bsz 16506.5 | num_updates 37098 | lr 0.000328363 | gnorm 0.211 | clip 0 | loss_scale 0.5 | train_wall 1573 | gb_free 21 | wall 36681 epoch 022 | loss 4.008 | nll_loss 2.372 | ppl 5.18 | wps 449325 | ups 1.04 | wpb 433538 | bsz 16506.5 | num_updates 37098 | lr 0.000328363 | gnorm 0.211 | clip 0 | loss_scale 0.5 | train_wall 1573 | gb_free 21 | wall 36681 epoch 022 | loss 4.008 | nll_loss 2.372 | ppl 5.18 | wps 449325 | ups 1.04 | wpb 433538 | bsz 16506.5 | num_updates 37098 | lr 0.000328363 | gnorm 0.211 | clip 0 | loss_scale 0.5 | train_wall 1573 | gb_free 21 | wall 36681 epoch 022 | loss 4.008 | nll_loss 2.372 | ppl 5.18 | wps 449325 | ups 1.04 | wpb 433538 | bsz 16506.5 | num_updates 37098 | lr 0.000328363 | gnorm 0.211 | clip 0 | loss_scale 0.5 | train_wall 1573 | gb_free 21 | wall 36681 epoch 022 | loss 4.008 | nll_loss 2.372 | ppl 5.18 | wps 449325 | ups 1.04 | wpb 433538 | bsz 16506.5 | num_updates 37098 | lr 0.000328363 | gnorm 0.211 | clip 0 | loss_scale 0.5 | train_wall 1573 | gb_free 21 | wall 36681 epoch 022 | loss 4.008 | nll_loss 2.372 | ppl 5.18 | wps 449325 | ups 1.04 | wpb 433538 | bsz 16506.5 | num_updates 37098 | lr 0.000328363 | gnorm 0.211 | clip 0 | loss_scale 0.5 | train_wall 1573 | gb_free 21 | wall 36681 epoch 022 | loss 4.008 | nll_loss 2.372 | ppl 5.18 | wps 449325 | ups 1.04 | wpb 433538 | bsz 16506.5 | num_updates 37098 | lr 0.000328363 | gnorm 0.211 | clip 0 | loss_scale 0.5 | train_wall 1573 | gb_free 21 | wall 36681 epoch 022 | loss 4.008 | nll_loss 2.372 | ppl 5.18 | wps 449325 | ups 1.04 | wpb 433538 | bsz 16506.5 | num_updates 37098 | lr 0.000328363 | gnorm 0.211 | clip 0 | loss_scale 0.5 | train_wall 1573 | gb_free 21 | wall 36681 epoch 022 | loss 4.008 | nll_loss 2.372 | ppl 5.18 | wps 449325 | ups 1.04 | wpb 433538 | bsz 16506.5 | num_updates 37098 | lr 0.000328363 | gnorm 0.211 | clip 0 | loss_scale 0.5 | train_wall 1573 | gb_free 21 | wall 36681 epoch 022 | loss 4.008 | nll_loss 2.372 | ppl 5.18 | wps 449325 | ups 1.04 | wpb 433538 | bsz 16506.5 | num_updates 37098 | lr 0.000328363 | gnorm 0.211 | clip 0 | loss_scale 0.5 | train_wall 1573 | gb_free 21 | wall 36681 epoch 022 | loss 4.008 | nll_loss 2.372 | ppl 5.18 | wps 449325 | ups 1.04 | wpb 433538 | bsz 16506.5 | num_updates 37098 | lr 0.000328363 | gnorm 0.211 | clip 0 | loss_scale 0.5 | train_wall 1573 | gb_free 21 | wall 36681 epoch 022 | loss 4.008 | nll_loss 2.372 | ppl 5.18 | wps 449325 | ups 1.04 | wpb 433538 | bsz 16506.5 | num_updates 37098 | lr 0.000328363 | gnorm 0.211 | clip 0 | loss_scale 0.5 | train_wall 1573 | gb_free 21 | wall 36681 epoch 022 | loss 4.008 | nll_loss 2.372 | ppl 5.18 | wps 449325 | ups 1.04 | wpb 433538 | bsz 16506.5 | num_updates 37098 | lr 0.000328363 | gnorm 0.211 | clip 0 | loss_scale 0.5 | train_wall 1573 | gb_free 21 | wall 36681 epoch 022 | loss 4.008 | nll_loss 2.372 | ppl 5.18 | wps 449325 | ups 1.04 | wpb 433538 | bsz 16506.5 | num_updates 37098 | lr 0.000328363 | gnorm 0.211 | clip 0 | loss_scale 0.5 | train_wall 1573 | gb_free 21 | wall 36681 epoch 022 | loss 4.008 | nll_loss 2.372 | ppl 5.18 | wps 449325 | ups 1.04 | wpb 433538 | bsz 16506.5 | num_updates 37098 | lr 0.000328363 | gnorm 0.211 | clip 0 | loss_scale 0.5 | train_wall 1573 | gb_free 21 | wall 36681 epoch 022 | loss 4.008 | nll_loss 2.372 | ppl 5.18 | wps 449325 | ups 1.04 | wpb 433538 | bsz 16506.5 | num_updates 37098 | lr 0.000328363 | gnorm 0.211 | clip 0 | loss_scale 0.5 | train_wall 1573 | gb_free 21 | wall 36681 epoch 022 | loss 4.008 | nll_loss 2.372 | ppl 5.18 | wps 449325 | ups 1.04 | wpb 433538 | bsz 16506.5 | num_updates 37098 | lr 0.000328363 | gnorm 0.211 | clip 0 | loss_scale 0.5 | train_wall 1573 | gb_free 21 | wall 36681 epoch 022 | loss 4.008 | nll_loss 2.372 | ppl 5.18 | wps 449325 | ups 1.04 | wpb 433538 | bsz 16506.5 | num_updates 37098 | lr 0.000328363 | gnorm 0.211 | clip 0 | loss_scale 0.5 | train_wall 1573 | gb_free 21 | wall 36681 epoch 022 | loss 4.008 | nll_loss 2.372 | ppl 5.18 | wps 449325 | ups 1.04 | wpb 433538 | bsz 16506.5 | num_updates 37098 | lr 0.000328363 | gnorm 0.211 | clip 0 | loss_scale 0.5 | train_wall 1573 | gb_free 21 | wall 36681 epoch 022 | loss 4.008 | nll_loss 2.372 | ppl 5.18 | wps 449325 | ups 1.04 | wpb 433538 | bsz 16506.5 | num_updates 37098 | lr 0.000328363 | gnorm 0.211 | clip 0 | loss_scale 0.5 | train_wall 1573 | gb_free 21 | wall 36681 epoch 022 | loss 4.008 | nll_loss 2.372 | ppl 5.18 | wps 449325 | ups 1.04 | wpb 433538 | bsz 16506.5 | num_updates 37098 | lr 0.000328363 | gnorm 0.211 | clip 0 | loss_scale 0.5 | train_wall 1573 | gb_free 21 | wall 36681 Start iterating over samples epoch 023: 2 / 1689 loss=4.016, nll_loss=2.382, ppl=5.21, wps=372626, ups=0.87, wpb=428403, bsz=16375.4, num_updates=37100, lr=0.000328355, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=98, gb_free=19.5, wall=36685 epoch 023: 2 / 1689 loss=4.016, nll_loss=2.382, ppl=5.21, wps=372626, ups=0.87, wpb=428403, bsz=16375.4, num_updates=37100, lr=0.000328355, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=98, gb_free=19.5, wall=36685 epoch 023: 2 / 1689 loss=4.016, nll_loss=2.382, ppl=5.21, wps=372626, ups=0.87, wpb=428403, bsz=16375.4, num_updates=37100, lr=0.000328355, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=98, gb_free=19.5, wall=36685 epoch 023: 2 / 1689 loss=4.016, nll_loss=2.382, ppl=5.21, wps=372626, ups=0.87, wpb=428403, bsz=16375.4, num_updates=37100, lr=0.000328355, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=98, gb_free=19.5, wall=36685 epoch 023: 2 / 1689 loss=4.016, nll_loss=2.382, ppl=5.21, wps=372626, ups=0.87, wpb=428403, bsz=16375.4, num_updates=37100, lr=0.000328355, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=98, gb_free=19.5, wall=36685 epoch 023: 2 / 1689 loss=4.016, nll_loss=2.382, ppl=5.21, wps=372626, ups=0.87, wpb=428403, bsz=16375.4, num_updates=37100, lr=0.000328355, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=98, gb_free=19.5, wall=36685 epoch 023: 2 / 1689 loss=4.016, nll_loss=2.382, ppl=5.21, wps=372626, ups=0.87, wpb=428403, bsz=16375.4, num_updates=37100, lr=0.000328355, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=98, gb_free=19.5, wall=36685 epoch 023: 2 / 1689 loss=4.016, nll_loss=2.382, ppl=5.21, wps=372626, ups=0.87, wpb=428403, bsz=16375.4, num_updates=37100, lr=0.000328355, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=98, gb_free=19.5, wall=36685 epoch 023: 2 / 1689 loss=4.016, nll_loss=2.382, ppl=5.21, wps=372626, ups=0.87, wpb=428403, bsz=16375.4, num_updates=37100, lr=0.000328355, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=98, gb_free=19.5, wall=36685 epoch 023: 2 / 1689 loss=4.016, nll_loss=2.382, ppl=5.21, wps=372626, ups=0.87, wpb=428403, bsz=16375.4, num_updates=37100, lr=0.000328355, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=98, gb_free=19.5, wall=36685 epoch 023: 2 / 1689 loss=4.016, nll_loss=2.382, ppl=5.21, wps=372626, ups=0.87, wpb=428403, bsz=16375.4, num_updates=37100, lr=0.000328355, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=98, gb_free=19.5, wall=36685 epoch 023: 2 / 1689 loss=4.016, nll_loss=2.382, ppl=5.21, wps=372626, ups=0.87, wpb=428403, bsz=16375.4, num_updates=37100, lr=0.000328355, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=98, gb_free=19.5, wall=36685 epoch 023: 2 / 1689 loss=4.016, nll_loss=2.382, ppl=5.21, wps=372626, ups=0.87, wpb=428403, bsz=16375.4, num_updates=37100, lr=0.000328355, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=98, gb_free=19.5, wall=36685 epoch 023: 2 / 1689 loss=4.016, nll_loss=2.382, ppl=5.21, wps=372626, ups=0.87, wpb=428403, bsz=16375.4, num_updates=37100, lr=0.000328355, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=98, gb_free=19.5, wall=36685 epoch 023: 2 / 1689 loss=4.016, nll_loss=2.382, ppl=5.21, wps=372626, ups=0.87, wpb=428403, bsz=16375.4, num_updates=37100, lr=0.000328355, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=98, gb_free=19.5, wall=36685 epoch 023: 2 / 1689 loss=4.016, nll_loss=2.382, ppl=5.21, wps=372626, ups=0.87, wpb=428403, bsz=16375.4, num_updates=37100, lr=0.000328355, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=98, gb_free=19.5, wall=36685 epoch 023: 2 / 1689 loss=4.016, nll_loss=2.382, ppl=5.21, wps=372626, ups=0.87, wpb=428403, bsz=16375.4, num_updates=37100, lr=0.000328355, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=98, gb_free=19.5, wall=36685 epoch 023: 2 / 1689 loss=4.016, nll_loss=2.382, ppl=5.21, wps=372626, ups=0.87, wpb=428403, bsz=16375.4, num_updates=37100, lr=0.000328355, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=98, gb_free=19.5, wall=36685 epoch 023: 2 / 1689 loss=4.016, nll_loss=2.382, ppl=5.21, wps=372626, ups=0.87, wpb=428403, bsz=16375.4, num_updates=37100, lr=0.000328355, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=98, gb_free=19.5, wall=36685 epoch 023: 2 / 1689 loss=4.016, nll_loss=2.382, ppl=5.21, wps=372626, ups=0.87, wpb=428403, bsz=16375.4, num_updates=37100, lr=0.000328355, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=98, gb_free=19.5, wall=36685 epoch 023: 2 / 1689 loss=4.016, nll_loss=2.382, ppl=5.21, wps=372626, ups=0.87, wpb=428403, bsz=16375.4, num_updates=37100, lr=0.000328355, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=98, gb_free=19.5, wall=36685 epoch 023: 2 / 1689 loss=4.016, nll_loss=2.382, ppl=5.21, wps=372626, ups=0.87, wpb=428403, bsz=16375.4, num_updates=37100, lr=0.000328355, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=98, gb_free=19.5, wall=36685 epoch 023: 2 / 1689 loss=4.016, nll_loss=2.382, ppl=5.21, wps=372626, ups=0.87, wpb=428403, bsz=16375.4, num_updates=37100, lr=0.000328355, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=98, gb_free=19.5, wall=36685 epoch 023: 102 / 1689 loss=3.982, nll_loss=2.341, ppl=5.07, wps=467322, ups=1.08, wpb=434633, bsz=16575.4, num_updates=37200, lr=0.000327913, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=36778 epoch 023: 102 / 1689 loss=3.982, nll_loss=2.341, ppl=5.07, wps=467322, ups=1.08, wpb=434633, bsz=16575.4, num_updates=37200, lr=0.000327913, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=36778 epoch 023: 102 / 1689 loss=3.982, nll_loss=2.341, ppl=5.07, wps=467322, ups=1.08, wpb=434633, bsz=16575.4, num_updates=37200, lr=0.000327913, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=36778 epoch 023: 102 / 1689 loss=3.982, nll_loss=2.341, ppl=5.07, wps=467322, ups=1.08, wpb=434633, bsz=16575.4, num_updates=37200, lr=0.000327913, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=36778 epoch 023: 102 / 1689 loss=3.982, nll_loss=2.341, ppl=5.07, wps=467322, ups=1.08, wpb=434633, bsz=16575.4, num_updates=37200, lr=0.000327913, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=36778 epoch 023: 102 / 1689 loss=3.982, nll_loss=2.341, ppl=5.07, wps=467322, ups=1.08, wpb=434633, bsz=16575.4, num_updates=37200, lr=0.000327913, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=36778 epoch 023: 102 / 1689 loss=3.982, nll_loss=2.341, ppl=5.07, wps=467322, ups=1.08, wpb=434633, bsz=16575.4, num_updates=37200, lr=0.000327913, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=36778 epoch 023: 102 / 1689 loss=3.982, nll_loss=2.341, ppl=5.07, wps=467322, ups=1.08, wpb=434633, bsz=16575.4, num_updates=37200, lr=0.000327913, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=36778 epoch 023: 102 / 1689 loss=3.982, nll_loss=2.341, ppl=5.07, wps=467322, ups=1.08, wpb=434633, bsz=16575.4, num_updates=37200, lr=0.000327913, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=36778 epoch 023: 102 / 1689 loss=3.982, nll_loss=2.341, ppl=5.07, wps=467322, ups=1.08, wpb=434633, bsz=16575.4, num_updates=37200, lr=0.000327913, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=36778 epoch 023: 102 / 1689 loss=3.982, nll_loss=2.341, ppl=5.07, wps=467322, ups=1.08, wpb=434633, bsz=16575.4, num_updates=37200, lr=0.000327913, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=36778 epoch 023: 102 / 1689 loss=3.982, nll_loss=2.341, ppl=5.07, wps=467322, ups=1.08, wpb=434633, bsz=16575.4, num_updates=37200, lr=0.000327913, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=36778 epoch 023: 102 / 1689 loss=3.982, nll_loss=2.341, ppl=5.07, wps=467322, ups=1.08, wpb=434633, bsz=16575.4, num_updates=37200, lr=0.000327913, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=36778 epoch 023: 102 / 1689 loss=3.982, nll_loss=2.341, ppl=5.07, wps=467322, ups=1.08, wpb=434633, bsz=16575.4, num_updates=37200, lr=0.000327913, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=36778 epoch 023: 102 / 1689 loss=3.982, nll_loss=2.341, ppl=5.07, wps=467322, ups=1.08, wpb=434633, bsz=16575.4, num_updates=37200, lr=0.000327913, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=36778 epoch 023: 102 / 1689 loss=3.982, nll_loss=2.341, ppl=5.07, wps=467322, ups=1.08, wpb=434633, bsz=16575.4, num_updates=37200, lr=0.000327913, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=36778 epoch 023: 102 / 1689 loss=3.982, nll_loss=2.341, ppl=5.07, wps=467322, ups=1.08, wpb=434633, bsz=16575.4, num_updates=37200, lr=0.000327913, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=36778 epoch 023: 102 / 1689 loss=3.982, nll_loss=2.341, ppl=5.07, wps=467322, ups=1.08, wpb=434633, bsz=16575.4, num_updates=37200, lr=0.000327913, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=36778 epoch 023: 102 / 1689 loss=3.982, nll_loss=2.341, ppl=5.07, wps=467322, ups=1.08, wpb=434633, bsz=16575.4, num_updates=37200, lr=0.000327913, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=36778 epoch 023: 102 / 1689 loss=3.982, nll_loss=2.341, ppl=5.07, wps=467322, ups=1.08, wpb=434633, bsz=16575.4, num_updates=37200, lr=0.000327913, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=36778 epoch 023: 102 / 1689 loss=3.982, nll_loss=2.341, ppl=5.07, wps=467322, ups=1.08, wpb=434633, bsz=16575.4, num_updates=37200, lr=0.000327913, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=36778 epoch 023: 102 / 1689 loss=3.982, nll_loss=2.341, ppl=5.07, wps=467322, ups=1.08, wpb=434633, bsz=16575.4, num_updates=37200, lr=0.000327913, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=36778 epoch 023: 102 / 1689 loss=3.982, nll_loss=2.341, ppl=5.07, wps=467322, ups=1.08, wpb=434633, bsz=16575.4, num_updates=37200, lr=0.000327913, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.9, wall=36778 epoch 023: 202 / 1689 loss=3.988, nll_loss=2.349, ppl=5.09, wps=466487, ups=1.08, wpb=433934, bsz=17056.3, num_updates=37300, lr=0.000327473, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=36871 epoch 023: 202 / 1689 loss=3.988, nll_loss=2.349, ppl=5.09, wps=466487, ups=1.08, wpb=433934, bsz=17056.3, num_updates=37300, lr=0.000327473, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=36871 epoch 023: 202 / 1689 loss=3.988, nll_loss=2.349, ppl=5.09, wps=466487, ups=1.08, wpb=433934, bsz=17056.3, num_updates=37300, lr=0.000327473, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=36871 epoch 023: 202 / 1689 loss=3.988, nll_loss=2.349, ppl=5.09, wps=466487, ups=1.08, wpb=433934, bsz=17056.3, num_updates=37300, lr=0.000327473, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=36871 epoch 023: 202 / 1689 loss=3.988, nll_loss=2.349, ppl=5.09, wps=466487, ups=1.08, wpb=433934, bsz=17056.3, num_updates=37300, lr=0.000327473, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=36871 epoch 023: 202 / 1689 loss=3.988, nll_loss=2.349, ppl=5.09, wps=466487, ups=1.08, wpb=433934, bsz=17056.3, num_updates=37300, lr=0.000327473, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=36871 epoch 023: 202 / 1689 loss=3.988, nll_loss=2.349, ppl=5.09, wps=466487, ups=1.08, wpb=433934, bsz=17056.3, num_updates=37300, lr=0.000327473, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=36871 epoch 023: 202 / 1689 loss=3.988, nll_loss=2.349, ppl=5.09, wps=466487, ups=1.08, wpb=433934, bsz=17056.3, num_updates=37300, lr=0.000327473, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=36871 epoch 023: 202 / 1689 loss=3.988, nll_loss=2.349, ppl=5.09, wps=466487, ups=1.08, wpb=433934, bsz=17056.3, num_updates=37300, lr=0.000327473, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=36871 epoch 023: 202 / 1689 loss=3.988, nll_loss=2.349, ppl=5.09, wps=466487, ups=1.08, wpb=433934, bsz=17056.3, num_updates=37300, lr=0.000327473, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=36871 epoch 023: 202 / 1689 loss=3.988, nll_loss=2.349, ppl=5.09, wps=466487, ups=1.08, wpb=433934, bsz=17056.3, num_updates=37300, lr=0.000327473, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=36871 epoch 023: 202 / 1689 loss=3.988, nll_loss=2.349, ppl=5.09, wps=466487, ups=1.08, wpb=433934, bsz=17056.3, num_updates=37300, lr=0.000327473, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=36871 epoch 023: 202 / 1689 loss=3.988, nll_loss=2.349, ppl=5.09, wps=466487, ups=1.08, wpb=433934, bsz=17056.3, num_updates=37300, lr=0.000327473, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=36871 epoch 023: 202 / 1689 loss=3.988, nll_loss=2.349, ppl=5.09, wps=466487, ups=1.08, wpb=433934, bsz=17056.3, num_updates=37300, lr=0.000327473, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=36871 epoch 023: 202 / 1689 loss=3.988, nll_loss=2.349, ppl=5.09, wps=466487, ups=1.08, wpb=433934, bsz=17056.3, num_updates=37300, lr=0.000327473, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=36871 epoch 023: 202 / 1689 loss=3.988, nll_loss=2.349, ppl=5.09, wps=466487, ups=1.08, wpb=433934, bsz=17056.3, num_updates=37300, lr=0.000327473, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=36871 epoch 023: 202 / 1689 loss=3.988, nll_loss=2.349, ppl=5.09, wps=466487, ups=1.08, wpb=433934, bsz=17056.3, num_updates=37300, lr=0.000327473, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=36871 epoch 023: 202 / 1689 loss=3.988, nll_loss=2.349, ppl=5.09, wps=466487, ups=1.08, wpb=433934, bsz=17056.3, num_updates=37300, lr=0.000327473, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=36871 epoch 023: 202 / 1689 loss=3.988, nll_loss=2.349, ppl=5.09, wps=466487, ups=1.08, wpb=433934, bsz=17056.3, num_updates=37300, lr=0.000327473, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=36871 epoch 023: 202 / 1689 loss=3.988, nll_loss=2.349, ppl=5.09, wps=466487, ups=1.08, wpb=433934, bsz=17056.3, num_updates=37300, lr=0.000327473, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=36871 epoch 023: 202 / 1689 loss=3.988, nll_loss=2.349, ppl=5.09, wps=466487, ups=1.08, wpb=433934, bsz=17056.3, num_updates=37300, lr=0.000327473, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=36871 epoch 023: 202 / 1689 loss=3.988, nll_loss=2.349, ppl=5.09, wps=466487, ups=1.08, wpb=433934, bsz=17056.3, num_updates=37300, lr=0.000327473, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=36871 epoch 023: 202 / 1689 loss=3.988, nll_loss=2.349, ppl=5.09, wps=466487, ups=1.08, wpb=433934, bsz=17056.3, num_updates=37300, lr=0.000327473, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=36871 epoch 023: 302 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=459400, ups=1.06, wpb=434860, bsz=16834.5, num_updates=37400, lr=0.000327035, gnorm=0.204, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=36966 epoch 023: 302 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=459400, ups=1.06, wpb=434860, bsz=16834.5, num_updates=37400, lr=0.000327035, gnorm=0.204, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=36966 epoch 023: 302 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=459400, ups=1.06, wpb=434860, bsz=16834.5, num_updates=37400, lr=0.000327035, gnorm=0.204, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=36966 epoch 023: 302 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=459400, ups=1.06, wpb=434860, bsz=16834.5, num_updates=37400, lr=0.000327035, gnorm=0.204, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=36966 epoch 023: 302 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=459400, ups=1.06, wpb=434860, bsz=16834.5, num_updates=37400, lr=0.000327035, gnorm=0.204, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=36966 epoch 023: 302 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=459400, ups=1.06, wpb=434860, bsz=16834.5, num_updates=37400, lr=0.000327035, gnorm=0.204, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=36966 epoch 023: 302 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=459400, ups=1.06, wpb=434860, bsz=16834.5, num_updates=37400, lr=0.000327035, gnorm=0.204, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=36966 epoch 023: 302 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=459400, ups=1.06, wpb=434860, bsz=16834.5, num_updates=37400, lr=0.000327035, gnorm=0.204, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=36966 epoch 023: 302 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=459400, ups=1.06, wpb=434860, bsz=16834.5, num_updates=37400, lr=0.000327035, gnorm=0.204, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=36966 epoch 023: 302 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=459400, ups=1.06, wpb=434860, bsz=16834.5, num_updates=37400, lr=0.000327035, gnorm=0.204, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=36966 epoch 023: 302 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=459400, ups=1.06, wpb=434860, bsz=16834.5, num_updates=37400, lr=0.000327035, gnorm=0.204, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=36966 epoch 023: 302 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=459400, ups=1.06, wpb=434860, bsz=16834.5, num_updates=37400, lr=0.000327035, gnorm=0.204, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=36966 epoch 023: 302 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=459400, ups=1.06, wpb=434860, bsz=16834.5, num_updates=37400, lr=0.000327035, gnorm=0.204, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=36966 epoch 023: 302 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=459400, ups=1.06, wpb=434860, bsz=16834.5, num_updates=37400, lr=0.000327035, gnorm=0.204, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=36966 epoch 023: 302 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=459400, ups=1.06, wpb=434860, bsz=16834.5, num_updates=37400, lr=0.000327035, gnorm=0.204, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=36966 epoch 023: 302 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=459400, ups=1.06, wpb=434860, bsz=16834.5, num_updates=37400, lr=0.000327035, gnorm=0.204, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=36966 epoch 023: 302 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=459400, ups=1.06, wpb=434860, bsz=16834.5, num_updates=37400, lr=0.000327035, gnorm=0.204, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=36966 epoch 023: 302 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=459400, ups=1.06, wpb=434860, bsz=16834.5, num_updates=37400, lr=0.000327035, gnorm=0.204, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=36966 epoch 023: 302 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=459400, ups=1.06, wpb=434860, bsz=16834.5, num_updates=37400, lr=0.000327035, gnorm=0.204, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=36966 epoch 023: 302 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=459400, ups=1.06, wpb=434860, bsz=16834.5, num_updates=37400, lr=0.000327035, gnorm=0.204, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=36966 epoch 023: 302 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=459400, ups=1.06, wpb=434860, bsz=16834.5, num_updates=37400, lr=0.000327035, gnorm=0.204, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=36966 epoch 023: 302 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=459400, ups=1.06, wpb=434860, bsz=16834.5, num_updates=37400, lr=0.000327035, gnorm=0.204, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=36966 epoch 023: 302 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=459400, ups=1.06, wpb=434860, bsz=16834.5, num_updates=37400, lr=0.000327035, gnorm=0.204, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=36966 epoch 023: 403 / 1689 loss=3.998, nll_loss=2.36, ppl=5.13, wps=460456, ups=1.06, wpb=434154, bsz=16625.8, num_updates=37500, lr=0.000326599, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=37060 epoch 023: 403 / 1689 loss=3.998, nll_loss=2.36, ppl=5.13, wps=460456, ups=1.06, wpb=434154, bsz=16625.8, num_updates=37500, lr=0.000326599, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=37060 epoch 023: 403 / 1689 loss=3.998, nll_loss=2.36, ppl=5.13, wps=460456, ups=1.06, wpb=434154, bsz=16625.8, num_updates=37500, lr=0.000326599, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=37060 epoch 023: 403 / 1689 loss=3.998, nll_loss=2.36, ppl=5.13, wps=460456, ups=1.06, wpb=434154, bsz=16625.8, num_updates=37500, lr=0.000326599, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=37060 epoch 023: 403 / 1689 loss=3.998, nll_loss=2.36, ppl=5.13, wps=460456, ups=1.06, wpb=434154, bsz=16625.8, num_updates=37500, lr=0.000326599, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=37060 epoch 023: 403 / 1689 loss=3.998, nll_loss=2.36, ppl=5.13, wps=460456, ups=1.06, wpb=434154, bsz=16625.8, num_updates=37500, lr=0.000326599, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=37060 epoch 023: 403 / 1689 loss=3.998, nll_loss=2.36, ppl=5.13, wps=460456, ups=1.06, wpb=434154, bsz=16625.8, num_updates=37500, lr=0.000326599, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=37060 epoch 023: 403 / 1689 loss=3.998, nll_loss=2.36, ppl=5.13, wps=460456, ups=1.06, wpb=434154, bsz=16625.8, num_updates=37500, lr=0.000326599, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=37060 epoch 023: 403 / 1689 loss=3.998, nll_loss=2.36, ppl=5.13, wps=460456, ups=1.06, wpb=434154, bsz=16625.8, num_updates=37500, lr=0.000326599, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=37060 epoch 023: 403 / 1689 loss=3.998, nll_loss=2.36, ppl=5.13, wps=460456, ups=1.06, wpb=434154, bsz=16625.8, num_updates=37500, lr=0.000326599, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=37060 epoch 023: 403 / 1689 loss=3.998, nll_loss=2.36, ppl=5.13, wps=460456, ups=1.06, wpb=434154, bsz=16625.8, num_updates=37500, lr=0.000326599, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=37060 epoch 023: 403 / 1689 loss=3.998, nll_loss=2.36, ppl=5.13, wps=460456, ups=1.06, wpb=434154, bsz=16625.8, num_updates=37500, lr=0.000326599, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=37060 epoch 023: 403 / 1689 loss=3.998, nll_loss=2.36, ppl=5.13, wps=460456, ups=1.06, wpb=434154, bsz=16625.8, num_updates=37500, lr=0.000326599, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=37060 epoch 023: 403 / 1689 loss=3.998, nll_loss=2.36, ppl=5.13, wps=460456, ups=1.06, wpb=434154, bsz=16625.8, num_updates=37500, lr=0.000326599, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=37060 epoch 023: 403 / 1689 loss=3.998, nll_loss=2.36, ppl=5.13, wps=460456, ups=1.06, wpb=434154, bsz=16625.8, num_updates=37500, lr=0.000326599, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=37060 epoch 023: 403 / 1689 loss=3.998, nll_loss=2.36, ppl=5.13, wps=460456, ups=1.06, wpb=434154, bsz=16625.8, num_updates=37500, lr=0.000326599, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=37060 epoch 023: 403 / 1689 loss=3.998, nll_loss=2.36, ppl=5.13, wps=460456, ups=1.06, wpb=434154, bsz=16625.8, num_updates=37500, lr=0.000326599, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=37060 epoch 023: 403 / 1689 loss=3.998, nll_loss=2.36, ppl=5.13, wps=460456, ups=1.06, wpb=434154, bsz=16625.8, num_updates=37500, lr=0.000326599, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=37060 epoch 023: 403 / 1689 loss=3.998, nll_loss=2.36, ppl=5.13, wps=460456, ups=1.06, wpb=434154, bsz=16625.8, num_updates=37500, lr=0.000326599, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=37060 epoch 023: 403 / 1689 loss=3.998, nll_loss=2.36, ppl=5.13, wps=460456, ups=1.06, wpb=434154, bsz=16625.8, num_updates=37500, lr=0.000326599, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=37060 epoch 023: 403 / 1689 loss=3.998, nll_loss=2.36, ppl=5.13, wps=460456, ups=1.06, wpb=434154, bsz=16625.8, num_updates=37500, lr=0.000326599, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=37060 epoch 023: 403 / 1689 loss=3.998, nll_loss=2.36, ppl=5.13, wps=460456, ups=1.06, wpb=434154, bsz=16625.8, num_updates=37500, lr=0.000326599, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=37060 epoch 023: 403 / 1689 loss=3.998, nll_loss=2.36, ppl=5.13, wps=460456, ups=1.06, wpb=434154, bsz=16625.8, num_updates=37500, lr=0.000326599, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=37060 epoch 023: 503 / 1689 loss=3.989, nll_loss=2.35, ppl=5.1, wps=458317, ups=1.06, wpb=433968, bsz=16748.1, num_updates=37600, lr=0.000326164, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.2, wall=37155 epoch 023: 503 / 1689 loss=3.989, nll_loss=2.35, ppl=5.1, wps=458317, ups=1.06, wpb=433968, bsz=16748.1, num_updates=37600, lr=0.000326164, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.2, wall=37155 epoch 023: 503 / 1689 loss=3.989, nll_loss=2.35, ppl=5.1, wps=458317, ups=1.06, wpb=433968, bsz=16748.1, num_updates=37600, lr=0.000326164, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.2, wall=37155 epoch 023: 503 / 1689 loss=3.989, nll_loss=2.35, ppl=5.1, wps=458317, ups=1.06, wpb=433968, bsz=16748.1, num_updates=37600, lr=0.000326164, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.2, wall=37155 epoch 023: 503 / 1689 loss=3.989, nll_loss=2.35, ppl=5.1, wps=458317, ups=1.06, wpb=433968, bsz=16748.1, num_updates=37600, lr=0.000326164, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.2, wall=37155 epoch 023: 503 / 1689 loss=3.989, nll_loss=2.35, ppl=5.1, wps=458317, ups=1.06, wpb=433968, bsz=16748.1, num_updates=37600, lr=0.000326164, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.2, wall=37155 epoch 023: 503 / 1689 loss=3.989, nll_loss=2.35, ppl=5.1, wps=458317, ups=1.06, wpb=433968, bsz=16748.1, num_updates=37600, lr=0.000326164, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.2, wall=37155 epoch 023: 503 / 1689 loss=3.989, nll_loss=2.35, ppl=5.1, wps=458317, ups=1.06, wpb=433968, bsz=16748.1, num_updates=37600, lr=0.000326164, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.2, wall=37155 epoch 023: 503 / 1689 loss=3.989, nll_loss=2.35, ppl=5.1, wps=458317, ups=1.06, wpb=433968, bsz=16748.1, num_updates=37600, lr=0.000326164, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.2, wall=37155 epoch 023: 503 / 1689 loss=3.989, nll_loss=2.35, ppl=5.1, wps=458317, ups=1.06, wpb=433968, bsz=16748.1, num_updates=37600, lr=0.000326164, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.2, wall=37155 epoch 023: 503 / 1689 loss=3.989, nll_loss=2.35, ppl=5.1, wps=458317, ups=1.06, wpb=433968, bsz=16748.1, num_updates=37600, lr=0.000326164, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.2, wall=37155 epoch 023: 503 / 1689 loss=3.989, nll_loss=2.35, ppl=5.1, wps=458317, ups=1.06, wpb=433968, bsz=16748.1, num_updates=37600, lr=0.000326164, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.2, wall=37155 epoch 023: 503 / 1689 loss=3.989, nll_loss=2.35, ppl=5.1, wps=458317, ups=1.06, wpb=433968, bsz=16748.1, num_updates=37600, lr=0.000326164, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.2, wall=37155 epoch 023: 503 / 1689 loss=3.989, nll_loss=2.35, ppl=5.1, wps=458317, ups=1.06, wpb=433968, bsz=16748.1, num_updates=37600, lr=0.000326164, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.2, wall=37155 epoch 023: 503 / 1689 loss=3.989, nll_loss=2.35, ppl=5.1, wps=458317, ups=1.06, wpb=433968, bsz=16748.1, num_updates=37600, lr=0.000326164, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.2, wall=37155 epoch 023: 503 / 1689 loss=3.989, nll_loss=2.35, ppl=5.1, wps=458317, ups=1.06, wpb=433968, bsz=16748.1, num_updates=37600, lr=0.000326164, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.2, wall=37155 epoch 023: 503 / 1689 loss=3.989, nll_loss=2.35, ppl=5.1, wps=458317, ups=1.06, wpb=433968, bsz=16748.1, num_updates=37600, lr=0.000326164, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.2, wall=37155 epoch 023: 503 / 1689 loss=3.989, nll_loss=2.35, ppl=5.1, wps=458317, ups=1.06, wpb=433968, bsz=16748.1, num_updates=37600, lr=0.000326164, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.2, wall=37155 epoch 023: 503 / 1689 loss=3.989, nll_loss=2.35, ppl=5.1, wps=458317, ups=1.06, wpb=433968, bsz=16748.1, num_updates=37600, lr=0.000326164, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.2, wall=37155 epoch 023: 503 / 1689 loss=3.989, nll_loss=2.35, ppl=5.1, wps=458317, ups=1.06, wpb=433968, bsz=16748.1, num_updates=37600, lr=0.000326164, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.2, wall=37155 epoch 023: 503 / 1689 loss=3.989, nll_loss=2.35, ppl=5.1, wps=458317, ups=1.06, wpb=433968, bsz=16748.1, num_updates=37600, lr=0.000326164, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.2, wall=37155 epoch 023: 503 / 1689 loss=3.989, nll_loss=2.35, ppl=5.1, wps=458317, ups=1.06, wpb=433968, bsz=16748.1, num_updates=37600, lr=0.000326164, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.2, wall=37155 epoch 023: 503 / 1689 loss=3.989, nll_loss=2.35, ppl=5.1, wps=458317, ups=1.06, wpb=433968, bsz=16748.1, num_updates=37600, lr=0.000326164, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.2, wall=37155 epoch 023: 603 / 1689 loss=3.992, nll_loss=2.353, ppl=5.11, wps=459651, ups=1.06, wpb=433853, bsz=16191.4, num_updates=37700, lr=0.000325731, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=37249 epoch 023: 603 / 1689 loss=3.992, nll_loss=2.353, ppl=5.11, wps=459651, ups=1.06, wpb=433853, bsz=16191.4, num_updates=37700, lr=0.000325731, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=37249 epoch 023: 603 / 1689 loss=3.992, nll_loss=2.353, ppl=5.11, wps=459651, ups=1.06, wpb=433853, bsz=16191.4, num_updates=37700, lr=0.000325731, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=37249 epoch 023: 603 / 1689 loss=3.992, nll_loss=2.353, ppl=5.11, wps=459651, ups=1.06, wpb=433853, bsz=16191.4, num_updates=37700, lr=0.000325731, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=37249 epoch 023: 603 / 1689 loss=3.992, nll_loss=2.353, ppl=5.11, wps=459651, ups=1.06, wpb=433853, bsz=16191.4, num_updates=37700, lr=0.000325731, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=37249 epoch 023: 603 / 1689 loss=3.992, nll_loss=2.353, ppl=5.11, wps=459651, ups=1.06, wpb=433853, bsz=16191.4, num_updates=37700, lr=0.000325731, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=37249 epoch 023: 603 / 1689 loss=3.992, nll_loss=2.353, ppl=5.11, wps=459651, ups=1.06, wpb=433853, bsz=16191.4, num_updates=37700, lr=0.000325731, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=37249 epoch 023: 603 / 1689 loss=3.992, nll_loss=2.353, ppl=5.11, wps=459651, ups=1.06, wpb=433853, bsz=16191.4, num_updates=37700, lr=0.000325731, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=37249 epoch 023: 603 / 1689 loss=3.992, nll_loss=2.353, ppl=5.11, wps=459651, ups=1.06, wpb=433853, bsz=16191.4, num_updates=37700, lr=0.000325731, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=37249 epoch 023: 603 / 1689 loss=3.992, nll_loss=2.353, ppl=5.11, wps=459651, ups=1.06, wpb=433853, bsz=16191.4, num_updates=37700, lr=0.000325731, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=37249 epoch 023: 603 / 1689 loss=3.992, nll_loss=2.353, ppl=5.11, wps=459651, ups=1.06, wpb=433853, bsz=16191.4, num_updates=37700, lr=0.000325731, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=37249 epoch 023: 603 / 1689 loss=3.992, nll_loss=2.353, ppl=5.11, wps=459651, ups=1.06, wpb=433853, bsz=16191.4, num_updates=37700, lr=0.000325731, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=37249 epoch 023: 603 / 1689 loss=3.992, nll_loss=2.353, ppl=5.11, wps=459651, ups=1.06, wpb=433853, bsz=16191.4, num_updates=37700, lr=0.000325731, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=37249 epoch 023: 603 / 1689 loss=3.992, nll_loss=2.353, ppl=5.11, wps=459651, ups=1.06, wpb=433853, bsz=16191.4, num_updates=37700, lr=0.000325731, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=37249 epoch 023: 603 / 1689 loss=3.992, nll_loss=2.353, ppl=5.11, wps=459651, ups=1.06, wpb=433853, bsz=16191.4, num_updates=37700, lr=0.000325731, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=37249 epoch 023: 603 / 1689 loss=3.992, nll_loss=2.353, ppl=5.11, wps=459651, ups=1.06, wpb=433853, bsz=16191.4, num_updates=37700, lr=0.000325731, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=37249 epoch 023: 603 / 1689 loss=3.992, nll_loss=2.353, ppl=5.11, wps=459651, ups=1.06, wpb=433853, bsz=16191.4, num_updates=37700, lr=0.000325731, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=37249 epoch 023: 603 / 1689 loss=3.992, nll_loss=2.353, ppl=5.11, wps=459651, ups=1.06, wpb=433853, bsz=16191.4, num_updates=37700, lr=0.000325731, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=37249 epoch 023: 603 / 1689 loss=3.992, nll_loss=2.353, ppl=5.11, wps=459651, ups=1.06, wpb=433853, bsz=16191.4, num_updates=37700, lr=0.000325731, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=37249 epoch 023: 603 / 1689 loss=3.992, nll_loss=2.353, ppl=5.11, wps=459651, ups=1.06, wpb=433853, bsz=16191.4, num_updates=37700, lr=0.000325731, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=37249 epoch 023: 603 / 1689 loss=3.992, nll_loss=2.353, ppl=5.11, wps=459651, ups=1.06, wpb=433853, bsz=16191.4, num_updates=37700, lr=0.000325731, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=37249 epoch 023: 603 / 1689 loss=3.992, nll_loss=2.353, ppl=5.11, wps=459651, ups=1.06, wpb=433853, bsz=16191.4, num_updates=37700, lr=0.000325731, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=37249 epoch 023: 603 / 1689 loss=3.992, nll_loss=2.353, ppl=5.11, wps=459651, ups=1.06, wpb=433853, bsz=16191.4, num_updates=37700, lr=0.000325731, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=37249 epoch 023: 703 / 1689 loss=3.999, nll_loss=2.362, ppl=5.14, wps=458800, ups=1.06, wpb=431927, bsz=16564.2, num_updates=37800, lr=0.0003253, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=37343 epoch 023: 703 / 1689 loss=3.999, nll_loss=2.362, ppl=5.14, wps=458800, ups=1.06, wpb=431927, bsz=16564.2, num_updates=37800, lr=0.0003253, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=37343 epoch 023: 703 / 1689 loss=3.999, nll_loss=2.362, ppl=5.14, wps=458800, ups=1.06, wpb=431927, bsz=16564.2, num_updates=37800, lr=0.0003253, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=37343 epoch 023: 703 / 1689 loss=3.999, nll_loss=2.362, ppl=5.14, wps=458800, ups=1.06, wpb=431927, bsz=16564.2, num_updates=37800, lr=0.0003253, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=37343 epoch 023: 703 / 1689 loss=3.999, nll_loss=2.362, ppl=5.14, wps=458800, ups=1.06, wpb=431927, bsz=16564.2, num_updates=37800, lr=0.0003253, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=37343 epoch 023: 703 / 1689 loss=3.999, nll_loss=2.362, ppl=5.14, wps=458800, ups=1.06, wpb=431927, bsz=16564.2, num_updates=37800, lr=0.0003253, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=37343 epoch 023: 703 / 1689 loss=3.999, nll_loss=2.362, ppl=5.14, wps=458800, ups=1.06, wpb=431927, bsz=16564.2, num_updates=37800, lr=0.0003253, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=37343 epoch 023: 703 / 1689 loss=3.999, nll_loss=2.362, ppl=5.14, wps=458800, ups=1.06, wpb=431927, bsz=16564.2, num_updates=37800, lr=0.0003253, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=37343 epoch 023: 703 / 1689 loss=3.999, nll_loss=2.362, ppl=5.14, wps=458800, ups=1.06, wpb=431927, bsz=16564.2, num_updates=37800, lr=0.0003253, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=37343 epoch 023: 703 / 1689 loss=3.999, nll_loss=2.362, ppl=5.14, wps=458800, ups=1.06, wpb=431927, bsz=16564.2, num_updates=37800, lr=0.0003253, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=37343 epoch 023: 703 / 1689 loss=3.999, nll_loss=2.362, ppl=5.14, wps=458800, ups=1.06, wpb=431927, bsz=16564.2, num_updates=37800, lr=0.0003253, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=37343 epoch 023: 703 / 1689 loss=3.999, nll_loss=2.362, ppl=5.14, wps=458800, ups=1.06, wpb=431927, bsz=16564.2, num_updates=37800, lr=0.0003253, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=37343 epoch 023: 703 / 1689 loss=3.999, nll_loss=2.362, ppl=5.14, wps=458800, ups=1.06, wpb=431927, bsz=16564.2, num_updates=37800, lr=0.0003253, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=37343 epoch 023: 703 / 1689 loss=3.999, nll_loss=2.362, ppl=5.14, wps=458800, ups=1.06, wpb=431927, bsz=16564.2, num_updates=37800, lr=0.0003253, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=37343 epoch 023: 703 / 1689 loss=3.999, nll_loss=2.362, ppl=5.14, wps=458800, ups=1.06, wpb=431927, bsz=16564.2, num_updates=37800, lr=0.0003253, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=37343 epoch 023: 703 / 1689 loss=3.999, nll_loss=2.362, ppl=5.14, wps=458800, ups=1.06, wpb=431927, bsz=16564.2, num_updates=37800, lr=0.0003253, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=37343 epoch 023: 703 / 1689 loss=3.999, nll_loss=2.362, ppl=5.14, wps=458800, ups=1.06, wpb=431927, bsz=16564.2, num_updates=37800, lr=0.0003253, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=37343 epoch 023: 703 / 1689 loss=3.999, nll_loss=2.362, ppl=5.14, wps=458800, ups=1.06, wpb=431927, bsz=16564.2, num_updates=37800, lr=0.0003253, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=37343 epoch 023: 703 / 1689 loss=3.999, nll_loss=2.362, ppl=5.14, wps=458800, ups=1.06, wpb=431927, bsz=16564.2, num_updates=37800, lr=0.0003253, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=37343 epoch 023: 703 / 1689 loss=3.999, nll_loss=2.362, ppl=5.14, wps=458800, ups=1.06, wpb=431927, bsz=16564.2, num_updates=37800, lr=0.0003253, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=37343 epoch 023: 703 / 1689 loss=3.999, nll_loss=2.362, ppl=5.14, wps=458800, ups=1.06, wpb=431927, bsz=16564.2, num_updates=37800, lr=0.0003253, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=37343 epoch 023: 703 / 1689 loss=3.999, nll_loss=2.362, ppl=5.14, wps=458800, ups=1.06, wpb=431927, bsz=16564.2, num_updates=37800, lr=0.0003253, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=37343 epoch 023: 703 / 1689 loss=3.999, nll_loss=2.362, ppl=5.14, wps=458800, ups=1.06, wpb=431927, bsz=16564.2, num_updates=37800, lr=0.0003253, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=37343 epoch 023: 803 / 1689 loss=3.993, nll_loss=2.355, ppl=5.12, wps=459652, ups=1.06, wpb=431966, bsz=16549.8, num_updates=37900, lr=0.000324871, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=37437 epoch 023: 803 / 1689 loss=3.993, nll_loss=2.355, ppl=5.12, wps=459652, ups=1.06, wpb=431966, bsz=16549.8, num_updates=37900, lr=0.000324871, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=37437 epoch 023: 803 / 1689 loss=3.993, nll_loss=2.355, ppl=5.12, wps=459652, ups=1.06, wpb=431966, bsz=16549.8, num_updates=37900, lr=0.000324871, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=37437 epoch 023: 803 / 1689 loss=3.993, nll_loss=2.355, ppl=5.12, wps=459652, ups=1.06, wpb=431966, bsz=16549.8, num_updates=37900, lr=0.000324871, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=37437 epoch 023: 803 / 1689 loss=3.993, nll_loss=2.355, ppl=5.12, wps=459652, ups=1.06, wpb=431966, bsz=16549.8, num_updates=37900, lr=0.000324871, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=37437 epoch 023: 803 / 1689 loss=3.993, nll_loss=2.355, ppl=5.12, wps=459652, ups=1.06, wpb=431966, bsz=16549.8, num_updates=37900, lr=0.000324871, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=37437 epoch 023: 803 / 1689 loss=3.993, nll_loss=2.355, ppl=5.12, wps=459652, ups=1.06, wpb=431966, bsz=16549.8, num_updates=37900, lr=0.000324871, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=37437 epoch 023: 803 / 1689 loss=3.993, nll_loss=2.355, ppl=5.12, wps=459652, ups=1.06, wpb=431966, bsz=16549.8, num_updates=37900, lr=0.000324871, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=37437 epoch 023: 803 / 1689 loss=3.993, nll_loss=2.355, ppl=5.12, wps=459652, ups=1.06, wpb=431966, bsz=16549.8, num_updates=37900, lr=0.000324871, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=37437 epoch 023: 803 / 1689 loss=3.993, nll_loss=2.355, ppl=5.12, wps=459652, ups=1.06, wpb=431966, bsz=16549.8, num_updates=37900, lr=0.000324871, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=37437 epoch 023: 803 / 1689 loss=3.993, nll_loss=2.355, ppl=5.12, wps=459652, ups=1.06, wpb=431966, bsz=16549.8, num_updates=37900, lr=0.000324871, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=37437 epoch 023: 803 / 1689 loss=3.993, nll_loss=2.355, ppl=5.12, wps=459652, ups=1.06, wpb=431966, bsz=16549.8, num_updates=37900, lr=0.000324871, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=37437 epoch 023: 803 / 1689 loss=3.993, nll_loss=2.355, ppl=5.12, wps=459652, ups=1.06, wpb=431966, bsz=16549.8, num_updates=37900, lr=0.000324871, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=37437 epoch 023: 803 / 1689 loss=3.993, nll_loss=2.355, ppl=5.12, wps=459652, ups=1.06, wpb=431966, bsz=16549.8, num_updates=37900, lr=0.000324871, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=37437 epoch 023: 803 / 1689 loss=3.993, nll_loss=2.355, ppl=5.12, wps=459652, ups=1.06, wpb=431966, bsz=16549.8, num_updates=37900, lr=0.000324871, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=37437 epoch 023: 803 / 1689 loss=3.993, nll_loss=2.355, ppl=5.12, wps=459652, ups=1.06, wpb=431966, bsz=16549.8, num_updates=37900, lr=0.000324871, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=37437 epoch 023: 803 / 1689 loss=3.993, nll_loss=2.355, ppl=5.12, wps=459652, ups=1.06, wpb=431966, bsz=16549.8, num_updates=37900, lr=0.000324871, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=37437 epoch 023: 803 / 1689 loss=3.993, nll_loss=2.355, ppl=5.12, wps=459652, ups=1.06, wpb=431966, bsz=16549.8, num_updates=37900, lr=0.000324871, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=37437 epoch 023: 803 / 1689 loss=3.993, nll_loss=2.355, ppl=5.12, wps=459652, ups=1.06, wpb=431966, bsz=16549.8, num_updates=37900, lr=0.000324871, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=37437 epoch 023: 803 / 1689 loss=3.993, nll_loss=2.355, ppl=5.12, wps=459652, ups=1.06, wpb=431966, bsz=16549.8, num_updates=37900, lr=0.000324871, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=37437 epoch 023: 803 / 1689 loss=3.993, nll_loss=2.355, ppl=5.12, wps=459652, ups=1.06, wpb=431966, bsz=16549.8, num_updates=37900, lr=0.000324871, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=37437 epoch 023: 803 / 1689 loss=3.993, nll_loss=2.355, ppl=5.12, wps=459652, ups=1.06, wpb=431966, bsz=16549.8, num_updates=37900, lr=0.000324871, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=37437 epoch 023: 803 / 1689 loss=3.993, nll_loss=2.355, ppl=5.12, wps=459652, ups=1.06, wpb=431966, bsz=16549.8, num_updates=37900, lr=0.000324871, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=37437 epoch 023: 903 / 1689 loss=4.011, nll_loss=2.376, ppl=5.19, wps=459940, ups=1.06, wpb=435044, bsz=16751.1, num_updates=38000, lr=0.000324443, gnorm=0.191, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=37532 epoch 023: 903 / 1689 loss=4.011, nll_loss=2.376, ppl=5.19, wps=459940, ups=1.06, wpb=435044, bsz=16751.1, num_updates=38000, lr=0.000324443, gnorm=0.191, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=37532 epoch 023: 903 / 1689 loss=4.011, nll_loss=2.376, ppl=5.19, wps=459940, ups=1.06, wpb=435044, bsz=16751.1, num_updates=38000, lr=0.000324443, gnorm=0.191, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=37532 epoch 023: 903 / 1689 loss=4.011, nll_loss=2.376, ppl=5.19, wps=459940, ups=1.06, wpb=435044, bsz=16751.1, num_updates=38000, lr=0.000324443, gnorm=0.191, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=37532 epoch 023: 903 / 1689 loss=4.011, nll_loss=2.376, ppl=5.19, wps=459940, ups=1.06, wpb=435044, bsz=16751.1, num_updates=38000, lr=0.000324443, gnorm=0.191, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=37532 epoch 023: 903 / 1689 loss=4.011, nll_loss=2.376, ppl=5.19, wps=459940, ups=1.06, wpb=435044, bsz=16751.1, num_updates=38000, lr=0.000324443, gnorm=0.191, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=37532 epoch 023: 903 / 1689 loss=4.011, nll_loss=2.376, ppl=5.19, wps=459940, ups=1.06, wpb=435044, bsz=16751.1, num_updates=38000, lr=0.000324443, gnorm=0.191, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=37532 epoch 023: 903 / 1689 loss=4.011, nll_loss=2.376, ppl=5.19, wps=459940, ups=1.06, wpb=435044, bsz=16751.1, num_updates=38000, lr=0.000324443, gnorm=0.191, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=37532 epoch 023: 903 / 1689 loss=4.011, nll_loss=2.376, ppl=5.19, wps=459940, ups=1.06, wpb=435044, bsz=16751.1, num_updates=38000, lr=0.000324443, gnorm=0.191, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=37532 epoch 023: 903 / 1689 loss=4.011, nll_loss=2.376, ppl=5.19, wps=459940, ups=1.06, wpb=435044, bsz=16751.1, num_updates=38000, lr=0.000324443, gnorm=0.191, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=37532 epoch 023: 903 / 1689 loss=4.011, nll_loss=2.376, ppl=5.19, wps=459940, ups=1.06, wpb=435044, bsz=16751.1, num_updates=38000, lr=0.000324443, gnorm=0.191, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=37532 epoch 023: 903 / 1689 loss=4.011, nll_loss=2.376, ppl=5.19, wps=459940, ups=1.06, wpb=435044, bsz=16751.1, num_updates=38000, lr=0.000324443, gnorm=0.191, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=37532 epoch 023: 903 / 1689 loss=4.011, nll_loss=2.376, ppl=5.19, wps=459940, ups=1.06, wpb=435044, bsz=16751.1, num_updates=38000, lr=0.000324443, gnorm=0.191, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=37532 epoch 023: 903 / 1689 loss=4.011, nll_loss=2.376, ppl=5.19, wps=459940, ups=1.06, wpb=435044, bsz=16751.1, num_updates=38000, lr=0.000324443, gnorm=0.191, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=37532 epoch 023: 903 / 1689 loss=4.011, nll_loss=2.376, ppl=5.19, wps=459940, ups=1.06, wpb=435044, bsz=16751.1, num_updates=38000, lr=0.000324443, gnorm=0.191, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=37532 epoch 023: 903 / 1689 loss=4.011, nll_loss=2.376, ppl=5.19, wps=459940, ups=1.06, wpb=435044, bsz=16751.1, num_updates=38000, lr=0.000324443, gnorm=0.191, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=37532 epoch 023: 903 / 1689 loss=4.011, nll_loss=2.376, ppl=5.19, wps=459940, ups=1.06, wpb=435044, bsz=16751.1, num_updates=38000, lr=0.000324443, gnorm=0.191, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=37532 epoch 023: 903 / 1689 loss=4.011, nll_loss=2.376, ppl=5.19, wps=459940, ups=1.06, wpb=435044, bsz=16751.1, num_updates=38000, lr=0.000324443, gnorm=0.191, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=37532 epoch 023: 903 / 1689 loss=4.011, nll_loss=2.376, ppl=5.19, wps=459940, ups=1.06, wpb=435044, bsz=16751.1, num_updates=38000, lr=0.000324443, gnorm=0.191, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=37532 epoch 023: 903 / 1689 loss=4.011, nll_loss=2.376, ppl=5.19, wps=459940, ups=1.06, wpb=435044, bsz=16751.1, num_updates=38000, lr=0.000324443, gnorm=0.191, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=37532 epoch 023: 903 / 1689 loss=4.011, nll_loss=2.376, ppl=5.19, wps=459940, ups=1.06, wpb=435044, bsz=16751.1, num_updates=38000, lr=0.000324443, gnorm=0.191, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=37532 epoch 023: 903 / 1689 loss=4.011, nll_loss=2.376, ppl=5.19, wps=459940, ups=1.06, wpb=435044, bsz=16751.1, num_updates=38000, lr=0.000324443, gnorm=0.191, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=37532 epoch 023: 903 / 1689 loss=4.011, nll_loss=2.376, ppl=5.19, wps=459940, ups=1.06, wpb=435044, bsz=16751.1, num_updates=38000, lr=0.000324443, gnorm=0.191, clip=0, loss_scale=1, train_wall=93, gb_free=20.3, wall=37532 begin validation on "valid" subset epoch 023 | valid on 'valid' subset | loss 4.258 | nll_loss 2.634 | ppl 6.21 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.249 epoch 023 | valid on 'valid' subset | loss 4.258 | nll_loss 2.634 | ppl 6.21 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.249 epoch 023 | valid on 'valid' subset | loss 4.258 | nll_loss 2.634 | ppl 6.21 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.249 epoch 023 | valid on 'valid' subset | loss 4.258 | nll_loss 2.634 | ppl 6.21 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.249 epoch 023 | valid on 'valid' subset | loss 4.258 | nll_loss 2.634 | ppl 6.21 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.249 epoch 023 | valid on 'valid' subset | loss 4.258 | nll_loss 2.634 | ppl 6.21 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.249 epoch 023 | valid on 'valid' subset | loss 4.258 | nll_loss 2.634 | ppl 6.21 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.249 epoch 023 | valid on 'valid' subset | loss 4.258 | nll_loss 2.634 | ppl 6.21 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.249 epoch 023 | valid on 'valid' subset | loss 4.258 | nll_loss 2.634 | ppl 6.21 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.249 epoch 023 | valid on 'valid' subset | loss 4.258 | nll_loss 2.634 | ppl 6.21 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.249 epoch 023 | valid on 'valid' subset | loss 4.258 | nll_loss 2.634 | ppl 6.21 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.249 epoch 023 | valid on 'valid' subset | loss 4.258 | nll_loss 2.634 | ppl 6.21 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.249 epoch 023 | valid on 'valid' subset | loss 4.258 | nll_loss 2.634 | ppl 6.21 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.249 epoch 023 | valid on 'valid' subset | loss 4.258 | nll_loss 2.634 | ppl 6.21 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.249 epoch 023 | valid on 'valid' subset | loss 4.258 | nll_loss 2.634 | ppl 6.21 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.249 epoch 023 | valid on 'valid' subset | loss 4.258 | nll_loss 2.634 | ppl 6.21 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.249 epoch 023 | valid on 'valid' subset | loss 4.258 | nll_loss 2.634 | ppl 6.21 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.249 epoch 023 | valid on 'valid' subset | loss 4.258 | nll_loss 2.634 | ppl 6.21 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.249 epoch 023 | valid on 'valid' subset | loss 4.258 | nll_loss 2.634 | ppl 6.21 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.249 epoch 023 | valid on 'valid' subset | loss 4.258 | nll_loss 2.634 | ppl 6.21 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.249 epoch 023 | valid on 'valid' subset | loss 4.258 | nll_loss 2.634 | ppl 6.21 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.249 epoch 023 | valid on 'valid' subset | loss 4.258 | nll_loss 2.634 | ppl 6.21 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.249 epoch 023 | valid on 'valid' subset | loss 4.258 | nll_loss 2.634 | ppl 6.21 | wps 0 | wpb 42662 | bsz 2032 | num_updates 38000 | best_loss 4.249 epoch 023: 1003 / 1689 loss=4.004, nll_loss=2.367, ppl=5.16, wps=407215, ups=0.94, wpb=431678, bsz=16163.9, num_updates=38100, lr=0.000324017, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=37638 epoch 023: 1003 / 1689 loss=4.004, nll_loss=2.367, ppl=5.16, wps=407215, ups=0.94, wpb=431678, bsz=16163.9, num_updates=38100, lr=0.000324017, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=37638 epoch 023: 1003 / 1689 loss=4.004, nll_loss=2.367, ppl=5.16, wps=407215, ups=0.94, wpb=431678, bsz=16163.9, num_updates=38100, lr=0.000324017, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=37638 epoch 023: 1003 / 1689 loss=4.004, nll_loss=2.367, ppl=5.16, wps=407215, ups=0.94, wpb=431678, bsz=16163.9, num_updates=38100, lr=0.000324017, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=37638 epoch 023: 1003 / 1689 loss=4.004, nll_loss=2.367, ppl=5.16, wps=407215, ups=0.94, wpb=431678, bsz=16163.9, num_updates=38100, lr=0.000324017, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=37638 epoch 023: 1003 / 1689 loss=4.004, nll_loss=2.367, ppl=5.16, wps=407215, ups=0.94, wpb=431678, bsz=16163.9, num_updates=38100, lr=0.000324017, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=37638 epoch 023: 1003 / 1689 loss=4.004, nll_loss=2.367, ppl=5.16, wps=407215, ups=0.94, wpb=431678, bsz=16163.9, num_updates=38100, lr=0.000324017, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=37638 epoch 023: 1003 / 1689 loss=4.004, nll_loss=2.367, ppl=5.16, wps=407215, ups=0.94, wpb=431678, bsz=16163.9, num_updates=38100, lr=0.000324017, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=37638 epoch 023: 1003 / 1689 loss=4.004, nll_loss=2.367, ppl=5.16, wps=407215, ups=0.94, wpb=431678, bsz=16163.9, num_updates=38100, lr=0.000324017, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=37638 epoch 023: 1003 / 1689 loss=4.004, nll_loss=2.367, ppl=5.16, wps=407215, ups=0.94, wpb=431678, bsz=16163.9, num_updates=38100, lr=0.000324017, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=37638 epoch 023: 1003 / 1689 loss=4.004, nll_loss=2.367, ppl=5.16, wps=407215, ups=0.94, wpb=431678, bsz=16163.9, num_updates=38100, lr=0.000324017, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=37638 epoch 023: 1003 / 1689 loss=4.004, nll_loss=2.367, ppl=5.16, wps=407215, ups=0.94, wpb=431678, bsz=16163.9, num_updates=38100, lr=0.000324017, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=37638 epoch 023: 1003 / 1689 loss=4.004, nll_loss=2.367, ppl=5.16, wps=407215, ups=0.94, wpb=431678, bsz=16163.9, num_updates=38100, lr=0.000324017, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=37638 epoch 023: 1003 / 1689 loss=4.004, nll_loss=2.367, ppl=5.16, wps=407215, ups=0.94, wpb=431678, bsz=16163.9, num_updates=38100, lr=0.000324017, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=37638 epoch 023: 1003 / 1689 loss=4.004, nll_loss=2.367, ppl=5.16, wps=407215, ups=0.94, wpb=431678, bsz=16163.9, num_updates=38100, lr=0.000324017, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=37638 epoch 023: 1003 / 1689 loss=4.004, nll_loss=2.367, ppl=5.16, wps=407215, ups=0.94, wpb=431678, bsz=16163.9, num_updates=38100, lr=0.000324017, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=37638 epoch 023: 1003 / 1689 loss=4.004, nll_loss=2.367, ppl=5.16, wps=407215, ups=0.94, wpb=431678, bsz=16163.9, num_updates=38100, lr=0.000324017, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=37638 epoch 023: 1003 / 1689 loss=4.004, nll_loss=2.367, ppl=5.16, wps=407215, ups=0.94, wpb=431678, bsz=16163.9, num_updates=38100, lr=0.000324017, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=37638 epoch 023: 1003 / 1689 loss=4.004, nll_loss=2.367, ppl=5.16, wps=407215, ups=0.94, wpb=431678, bsz=16163.9, num_updates=38100, lr=0.000324017, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=37638 epoch 023: 1003 / 1689 loss=4.004, nll_loss=2.367, ppl=5.16, wps=407215, ups=0.94, wpb=431678, bsz=16163.9, num_updates=38100, lr=0.000324017, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=37638 epoch 023: 1003 / 1689 loss=4.004, nll_loss=2.367, ppl=5.16, wps=407215, ups=0.94, wpb=431678, bsz=16163.9, num_updates=38100, lr=0.000324017, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=37638 epoch 023: 1003 / 1689 loss=4.004, nll_loss=2.367, ppl=5.16, wps=407215, ups=0.94, wpb=431678, bsz=16163.9, num_updates=38100, lr=0.000324017, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=37638 epoch 023: 1003 / 1689 loss=4.004, nll_loss=2.367, ppl=5.16, wps=407215, ups=0.94, wpb=431678, bsz=16163.9, num_updates=38100, lr=0.000324017, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=20.4, wall=37638 epoch 023: 1103 / 1689 loss=4.017, nll_loss=2.382, ppl=5.21, wps=462275, ups=1.06, wpb=434876, bsz=16295.8, num_updates=38200, lr=0.000323592, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=37732 epoch 023: 1103 / 1689 loss=4.017, nll_loss=2.382, ppl=5.21, wps=462275, ups=1.06, wpb=434876, bsz=16295.8, num_updates=38200, lr=0.000323592, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=37732 epoch 023: 1103 / 1689 loss=4.017, nll_loss=2.382, ppl=5.21, wps=462275, ups=1.06, wpb=434876, bsz=16295.8, num_updates=38200, lr=0.000323592, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=37732 epoch 023: 1103 / 1689 loss=4.017, nll_loss=2.382, ppl=5.21, wps=462275, ups=1.06, wpb=434876, bsz=16295.8, num_updates=38200, lr=0.000323592, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=37732 epoch 023: 1103 / 1689 loss=4.017, nll_loss=2.382, ppl=5.21, wps=462275, ups=1.06, wpb=434876, bsz=16295.8, num_updates=38200, lr=0.000323592, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=37732 epoch 023: 1103 / 1689 loss=4.017, nll_loss=2.382, ppl=5.21, wps=462275, ups=1.06, wpb=434876, bsz=16295.8, num_updates=38200, lr=0.000323592, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=37732 epoch 023: 1103 / 1689 loss=4.017, nll_loss=2.382, ppl=5.21, wps=462275, ups=1.06, wpb=434876, bsz=16295.8, num_updates=38200, lr=0.000323592, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=37732 epoch 023: 1103 / 1689 loss=4.017, nll_loss=2.382, ppl=5.21, wps=462275, ups=1.06, wpb=434876, bsz=16295.8, num_updates=38200, lr=0.000323592, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=37732 epoch 023: 1103 / 1689 loss=4.017, nll_loss=2.382, ppl=5.21, wps=462275, ups=1.06, wpb=434876, bsz=16295.8, num_updates=38200, lr=0.000323592, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=37732 epoch 023: 1103 / 1689 loss=4.017, nll_loss=2.382, ppl=5.21, wps=462275, ups=1.06, wpb=434876, bsz=16295.8, num_updates=38200, lr=0.000323592, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=37732 epoch 023: 1103 / 1689 loss=4.017, nll_loss=2.382, ppl=5.21, wps=462275, ups=1.06, wpb=434876, bsz=16295.8, num_updates=38200, lr=0.000323592, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=37732 epoch 023: 1103 / 1689 loss=4.017, nll_loss=2.382, ppl=5.21, wps=462275, ups=1.06, wpb=434876, bsz=16295.8, num_updates=38200, lr=0.000323592, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=37732 epoch 023: 1103 / 1689 loss=4.017, nll_loss=2.382, ppl=5.21, wps=462275, ups=1.06, wpb=434876, bsz=16295.8, num_updates=38200, lr=0.000323592, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=37732 epoch 023: 1103 / 1689 loss=4.017, nll_loss=2.382, ppl=5.21, wps=462275, ups=1.06, wpb=434876, bsz=16295.8, num_updates=38200, lr=0.000323592, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=37732 epoch 023: 1103 / 1689 loss=4.017, nll_loss=2.382, ppl=5.21, wps=462275, ups=1.06, wpb=434876, bsz=16295.8, num_updates=38200, lr=0.000323592, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=37732 epoch 023: 1103 / 1689 loss=4.017, nll_loss=2.382, ppl=5.21, wps=462275, ups=1.06, wpb=434876, bsz=16295.8, num_updates=38200, lr=0.000323592, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=37732 epoch 023: 1103 / 1689 loss=4.017, nll_loss=2.382, ppl=5.21, wps=462275, ups=1.06, wpb=434876, bsz=16295.8, num_updates=38200, lr=0.000323592, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=37732 epoch 023: 1103 / 1689 loss=4.017, nll_loss=2.382, ppl=5.21, wps=462275, ups=1.06, wpb=434876, bsz=16295.8, num_updates=38200, lr=0.000323592, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=37732 epoch 023: 1103 / 1689 loss=4.017, nll_loss=2.382, ppl=5.21, wps=462275, ups=1.06, wpb=434876, bsz=16295.8, num_updates=38200, lr=0.000323592, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=37732 epoch 023: 1103 / 1689 loss=4.017, nll_loss=2.382, ppl=5.21, wps=462275, ups=1.06, wpb=434876, bsz=16295.8, num_updates=38200, lr=0.000323592, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=37732 epoch 023: 1103 / 1689 loss=4.017, nll_loss=2.382, ppl=5.21, wps=462275, ups=1.06, wpb=434876, bsz=16295.8, num_updates=38200, lr=0.000323592, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=37732 epoch 023: 1103 / 1689 loss=4.017, nll_loss=2.382, ppl=5.21, wps=462275, ups=1.06, wpb=434876, bsz=16295.8, num_updates=38200, lr=0.000323592, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=37732 epoch 023: 1103 / 1689 loss=4.017, nll_loss=2.382, ppl=5.21, wps=462275, ups=1.06, wpb=434876, bsz=16295.8, num_updates=38200, lr=0.000323592, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=37732 epoch 023: 1204 / 1689 loss=4.005, nll_loss=2.369, ppl=5.16, wps=458882, ups=1.06, wpb=433740, bsz=16203.5, num_updates=38300, lr=0.00032317, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=37826 epoch 023: 1204 / 1689 loss=4.005, nll_loss=2.369, ppl=5.16, wps=458882, ups=1.06, wpb=433740, bsz=16203.5, num_updates=38300, lr=0.00032317, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=37826 epoch 023: 1204 / 1689 loss=4.005, nll_loss=2.369, ppl=5.16, wps=458882, ups=1.06, wpb=433740, bsz=16203.5, num_updates=38300, lr=0.00032317, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=37826 epoch 023: 1204 / 1689 loss=4.005, nll_loss=2.369, ppl=5.16, wps=458882, ups=1.06, wpb=433740, bsz=16203.5, num_updates=38300, lr=0.00032317, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=37826 epoch 023: 1204 / 1689 loss=4.005, nll_loss=2.369, ppl=5.16, wps=458882, ups=1.06, wpb=433740, bsz=16203.5, num_updates=38300, lr=0.00032317, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=37826 epoch 023: 1204 / 1689 loss=4.005, nll_loss=2.369, ppl=5.16, wps=458882, ups=1.06, wpb=433740, bsz=16203.5, num_updates=38300, lr=0.00032317, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=37826 epoch 023: 1204 / 1689 loss=4.005, nll_loss=2.369, ppl=5.16, wps=458882, ups=1.06, wpb=433740, bsz=16203.5, num_updates=38300, lr=0.00032317, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=37826 epoch 023: 1204 / 1689 loss=4.005, nll_loss=2.369, ppl=5.16, wps=458882, ups=1.06, wpb=433740, bsz=16203.5, num_updates=38300, lr=0.00032317, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=37826 epoch 023: 1204 / 1689 loss=4.005, nll_loss=2.369, ppl=5.16, wps=458882, ups=1.06, wpb=433740, bsz=16203.5, num_updates=38300, lr=0.00032317, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=37826 epoch 023: 1204 / 1689 loss=4.005, nll_loss=2.369, ppl=5.16, wps=458882, ups=1.06, wpb=433740, bsz=16203.5, num_updates=38300, lr=0.00032317, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=37826 epoch 023: 1204 / 1689 loss=4.005, nll_loss=2.369, ppl=5.16, wps=458882, ups=1.06, wpb=433740, bsz=16203.5, num_updates=38300, lr=0.00032317, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=37826 epoch 023: 1204 / 1689 loss=4.005, nll_loss=2.369, ppl=5.16, wps=458882, ups=1.06, wpb=433740, bsz=16203.5, num_updates=38300, lr=0.00032317, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=37826 epoch 023: 1204 / 1689 loss=4.005, nll_loss=2.369, ppl=5.16, wps=458882, ups=1.06, wpb=433740, bsz=16203.5, num_updates=38300, lr=0.00032317, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=37826 epoch 023: 1204 / 1689 loss=4.005, nll_loss=2.369, ppl=5.16, wps=458882, ups=1.06, wpb=433740, bsz=16203.5, num_updates=38300, lr=0.00032317, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=37826 epoch 023: 1204 / 1689 loss=4.005, nll_loss=2.369, ppl=5.16, wps=458882, ups=1.06, wpb=433740, bsz=16203.5, num_updates=38300, lr=0.00032317, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=37826 epoch 023: 1204 / 1689 loss=4.005, nll_loss=2.369, ppl=5.16, wps=458882, ups=1.06, wpb=433740, bsz=16203.5, num_updates=38300, lr=0.00032317, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=37826 epoch 023: 1204 / 1689 loss=4.005, nll_loss=2.369, ppl=5.16, wps=458882, ups=1.06, wpb=433740, bsz=16203.5, num_updates=38300, lr=0.00032317, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=37826 epoch 023: 1204 / 1689 loss=4.005, nll_loss=2.369, ppl=5.16, wps=458882, ups=1.06, wpb=433740, bsz=16203.5, num_updates=38300, lr=0.00032317, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=37826 epoch 023: 1204 / 1689 loss=4.005, nll_loss=2.369, ppl=5.16, wps=458882, ups=1.06, wpb=433740, bsz=16203.5, num_updates=38300, lr=0.00032317, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=37826 epoch 023: 1204 / 1689 loss=4.005, nll_loss=2.369, ppl=5.16, wps=458882, ups=1.06, wpb=433740, bsz=16203.5, num_updates=38300, lr=0.00032317, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=37826 epoch 023: 1204 / 1689 loss=4.005, nll_loss=2.369, ppl=5.16, wps=458882, ups=1.06, wpb=433740, bsz=16203.5, num_updates=38300, lr=0.00032317, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=37826 epoch 023: 1204 / 1689 loss=4.005, nll_loss=2.369, ppl=5.16, wps=458882, ups=1.06, wpb=433740, bsz=16203.5, num_updates=38300, lr=0.00032317, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=37826 epoch 023: 1204 / 1689 loss=4.005, nll_loss=2.369, ppl=5.16, wps=458882, ups=1.06, wpb=433740, bsz=16203.5, num_updates=38300, lr=0.00032317, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=37826 epoch 023: 1304 / 1689 loss=4.014, nll_loss=2.379, ppl=5.2, wps=466542, ups=1.07, wpb=435365, bsz=16491, num_updates=38400, lr=0.000322749, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=37920 epoch 023: 1304 / 1689 loss=4.014, nll_loss=2.379, ppl=5.2, wps=466542, ups=1.07, wpb=435365, bsz=16491, num_updates=38400, lr=0.000322749, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=37920 epoch 023: 1304 / 1689 loss=4.014, nll_loss=2.379, ppl=5.2, wps=466542, ups=1.07, wpb=435365, bsz=16491, num_updates=38400, lr=0.000322749, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=37920 epoch 023: 1304 / 1689 loss=4.014, nll_loss=2.379, ppl=5.2, wps=466542, ups=1.07, wpb=435365, bsz=16491, num_updates=38400, lr=0.000322749, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=37920 epoch 023: 1304 / 1689 loss=4.014, nll_loss=2.379, ppl=5.2, wps=466542, ups=1.07, wpb=435365, bsz=16491, num_updates=38400, lr=0.000322749, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=37920 epoch 023: 1304 / 1689 loss=4.014, nll_loss=2.379, ppl=5.2, wps=466542, ups=1.07, wpb=435365, bsz=16491, num_updates=38400, lr=0.000322749, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=37920 epoch 023: 1304 / 1689 loss=4.014, nll_loss=2.379, ppl=5.2, wps=466542, ups=1.07, wpb=435365, bsz=16491, num_updates=38400, lr=0.000322749, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=37920 epoch 023: 1304 / 1689 loss=4.014, nll_loss=2.379, ppl=5.2, wps=466542, ups=1.07, wpb=435365, bsz=16491, num_updates=38400, lr=0.000322749, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=37920 epoch 023: 1304 / 1689 loss=4.014, nll_loss=2.379, ppl=5.2, wps=466542, ups=1.07, wpb=435365, bsz=16491, num_updates=38400, lr=0.000322749, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=37920 epoch 023: 1304 / 1689 loss=4.014, nll_loss=2.379, ppl=5.2, wps=466542, ups=1.07, wpb=435365, bsz=16491, num_updates=38400, lr=0.000322749, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=37920 epoch 023: 1304 / 1689 loss=4.014, nll_loss=2.379, ppl=5.2, wps=466542, ups=1.07, wpb=435365, bsz=16491, num_updates=38400, lr=0.000322749, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=37920 epoch 023: 1304 / 1689 loss=4.014, nll_loss=2.379, ppl=5.2, wps=466542, ups=1.07, wpb=435365, bsz=16491, num_updates=38400, lr=0.000322749, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=37920 epoch 023: 1304 / 1689 loss=4.014, nll_loss=2.379, ppl=5.2, wps=466542, ups=1.07, wpb=435365, bsz=16491, num_updates=38400, lr=0.000322749, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=37920 epoch 023: 1304 / 1689 loss=4.014, nll_loss=2.379, ppl=5.2, wps=466542, ups=1.07, wpb=435365, bsz=16491, num_updates=38400, lr=0.000322749, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=37920 epoch 023: 1304 / 1689 loss=4.014, nll_loss=2.379, ppl=5.2, wps=466542, ups=1.07, wpb=435365, bsz=16491, num_updates=38400, lr=0.000322749, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=37920 epoch 023: 1304 / 1689 loss=4.014, nll_loss=2.379, ppl=5.2, wps=466542, ups=1.07, wpb=435365, bsz=16491, num_updates=38400, lr=0.000322749, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=37920 epoch 023: 1304 / 1689 loss=4.014, nll_loss=2.379, ppl=5.2, wps=466542, ups=1.07, wpb=435365, bsz=16491, num_updates=38400, lr=0.000322749, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=37920 epoch 023: 1304 / 1689 loss=4.014, nll_loss=2.379, ppl=5.2, wps=466542, ups=1.07, wpb=435365, bsz=16491, num_updates=38400, lr=0.000322749, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=37920 epoch 023: 1304 / 1689 loss=4.014, nll_loss=2.379, ppl=5.2, wps=466542, ups=1.07, wpb=435365, bsz=16491, num_updates=38400, lr=0.000322749, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=37920 epoch 023: 1304 / 1689 loss=4.014, nll_loss=2.379, ppl=5.2, wps=466542, ups=1.07, wpb=435365, bsz=16491, num_updates=38400, lr=0.000322749, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=37920 epoch 023: 1304 / 1689 loss=4.014, nll_loss=2.379, ppl=5.2, wps=466542, ups=1.07, wpb=435365, bsz=16491, num_updates=38400, lr=0.000322749, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=37920 epoch 023: 1304 / 1689 loss=4.014, nll_loss=2.379, ppl=5.2, wps=466542, ups=1.07, wpb=435365, bsz=16491, num_updates=38400, lr=0.000322749, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=37920 epoch 023: 1304 / 1689 loss=4.014, nll_loss=2.379, ppl=5.2, wps=466542, ups=1.07, wpb=435365, bsz=16491, num_updates=38400, lr=0.000322749, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=37920 epoch 023: 1404 / 1689 loss=4.008, nll_loss=2.372, ppl=5.18, wps=459012, ups=1.06, wpb=433501, bsz=16549, num_updates=38500, lr=0.000322329, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=38014 epoch 023: 1404 / 1689 loss=4.008, nll_loss=2.372, ppl=5.18, wps=459012, ups=1.06, wpb=433501, bsz=16549, num_updates=38500, lr=0.000322329, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=38014 epoch 023: 1404 / 1689 loss=4.008, nll_loss=2.372, ppl=5.18, wps=459012, ups=1.06, wpb=433501, bsz=16549, num_updates=38500, lr=0.000322329, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=38014 epoch 023: 1404 / 1689 loss=4.008, nll_loss=2.372, ppl=5.18, wps=459012, ups=1.06, wpb=433501, bsz=16549, num_updates=38500, lr=0.000322329, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=38014 epoch 023: 1404 / 1689 loss=4.008, nll_loss=2.372, ppl=5.18, wps=459012, ups=1.06, wpb=433501, bsz=16549, num_updates=38500, lr=0.000322329, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=38014 epoch 023: 1404 / 1689 loss=4.008, nll_loss=2.372, ppl=5.18, wps=459012, ups=1.06, wpb=433501, bsz=16549, num_updates=38500, lr=0.000322329, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=38014 epoch 023: 1404 / 1689 loss=4.008, nll_loss=2.372, ppl=5.18, wps=459012, ups=1.06, wpb=433501, bsz=16549, num_updates=38500, lr=0.000322329, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=38014 epoch 023: 1404 / 1689 loss=4.008, nll_loss=2.372, ppl=5.18, wps=459012, ups=1.06, wpb=433501, bsz=16549, num_updates=38500, lr=0.000322329, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=38014 epoch 023: 1404 / 1689 loss=4.008, nll_loss=2.372, ppl=5.18, wps=459012, ups=1.06, wpb=433501, bsz=16549, num_updates=38500, lr=0.000322329, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=38014 epoch 023: 1404 / 1689 loss=4.008, nll_loss=2.372, ppl=5.18, wps=459012, ups=1.06, wpb=433501, bsz=16549, num_updates=38500, lr=0.000322329, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=38014 epoch 023: 1404 / 1689 loss=4.008, nll_loss=2.372, ppl=5.18, wps=459012, ups=1.06, wpb=433501, bsz=16549, num_updates=38500, lr=0.000322329, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=38014 epoch 023: 1404 / 1689 loss=4.008, nll_loss=2.372, ppl=5.18, wps=459012, ups=1.06, wpb=433501, bsz=16549, num_updates=38500, lr=0.000322329, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=38014 epoch 023: 1404 / 1689 loss=4.008, nll_loss=2.372, ppl=5.18, wps=459012, ups=1.06, wpb=433501, bsz=16549, num_updates=38500, lr=0.000322329, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=38014 epoch 023: 1404 / 1689 loss=4.008, nll_loss=2.372, ppl=5.18, wps=459012, ups=1.06, wpb=433501, bsz=16549, num_updates=38500, lr=0.000322329, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=38014 epoch 023: 1404 / 1689 loss=4.008, nll_loss=2.372, ppl=5.18, wps=459012, ups=1.06, wpb=433501, bsz=16549, num_updates=38500, lr=0.000322329, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=38014 epoch 023: 1404 / 1689 loss=4.008, nll_loss=2.372, ppl=5.18, wps=459012, ups=1.06, wpb=433501, bsz=16549, num_updates=38500, lr=0.000322329, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=38014 epoch 023: 1404 / 1689 loss=4.008, nll_loss=2.372, ppl=5.18, wps=459012, ups=1.06, wpb=433501, bsz=16549, num_updates=38500, lr=0.000322329, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=38014 epoch 023: 1404 / 1689 loss=4.008, nll_loss=2.372, ppl=5.18, wps=459012, ups=1.06, wpb=433501, bsz=16549, num_updates=38500, lr=0.000322329, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=38014 epoch 023: 1404 / 1689 loss=4.008, nll_loss=2.372, ppl=5.18, wps=459012, ups=1.06, wpb=433501, bsz=16549, num_updates=38500, lr=0.000322329, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=38014 epoch 023: 1404 / 1689 loss=4.008, nll_loss=2.372, ppl=5.18, wps=459012, ups=1.06, wpb=433501, bsz=16549, num_updates=38500, lr=0.000322329, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=38014 epoch 023: 1404 / 1689 loss=4.008, nll_loss=2.372, ppl=5.18, wps=459012, ups=1.06, wpb=433501, bsz=16549, num_updates=38500, lr=0.000322329, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=38014 epoch 023: 1404 / 1689 loss=4.008, nll_loss=2.372, ppl=5.18, wps=459012, ups=1.06, wpb=433501, bsz=16549, num_updates=38500, lr=0.000322329, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=38014 epoch 023: 1404 / 1689 loss=4.008, nll_loss=2.372, ppl=5.18, wps=459012, ups=1.06, wpb=433501, bsz=16549, num_updates=38500, lr=0.000322329, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=38014 epoch 023: 1504 / 1689 loss=4.009, nll_loss=2.373, ppl=5.18, wps=459350, ups=1.06, wpb=433494, bsz=16502.1, num_updates=38600, lr=0.000321911, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=38108 epoch 023: 1504 / 1689 loss=4.009, nll_loss=2.373, ppl=5.18, wps=459350, ups=1.06, wpb=433494, bsz=16502.1, num_updates=38600, lr=0.000321911, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=38108 epoch 023: 1504 / 1689 loss=4.009, nll_loss=2.373, ppl=5.18, wps=459350, ups=1.06, wpb=433494, bsz=16502.1, num_updates=38600, lr=0.000321911, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=38108 epoch 023: 1504 / 1689 loss=4.009, nll_loss=2.373, ppl=5.18, wps=459350, ups=1.06, wpb=433494, bsz=16502.1, num_updates=38600, lr=0.000321911, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=38108 epoch 023: 1504 / 1689 loss=4.009, nll_loss=2.373, ppl=5.18, wps=459350, ups=1.06, wpb=433494, bsz=16502.1, num_updates=38600, lr=0.000321911, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=38108 epoch 023: 1504 / 1689 loss=4.009, nll_loss=2.373, ppl=5.18, wps=459350, ups=1.06, wpb=433494, bsz=16502.1, num_updates=38600, lr=0.000321911, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=38108 epoch 023: 1504 / 1689 loss=4.009, nll_loss=2.373, ppl=5.18, wps=459350, ups=1.06, wpb=433494, bsz=16502.1, num_updates=38600, lr=0.000321911, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=38108 epoch 023: 1504 / 1689 loss=4.009, nll_loss=2.373, ppl=5.18, wps=459350, ups=1.06, wpb=433494, bsz=16502.1, num_updates=38600, lr=0.000321911, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=38108 epoch 023: 1504 / 1689 loss=4.009, nll_loss=2.373, ppl=5.18, wps=459350, ups=1.06, wpb=433494, bsz=16502.1, num_updates=38600, lr=0.000321911, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=38108 epoch 023: 1504 / 1689 loss=4.009, nll_loss=2.373, ppl=5.18, wps=459350, ups=1.06, wpb=433494, bsz=16502.1, num_updates=38600, lr=0.000321911, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=38108 epoch 023: 1504 / 1689 loss=4.009, nll_loss=2.373, ppl=5.18, wps=459350, ups=1.06, wpb=433494, bsz=16502.1, num_updates=38600, lr=0.000321911, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=38108 epoch 023: 1504 / 1689 loss=4.009, nll_loss=2.373, ppl=5.18, wps=459350, ups=1.06, wpb=433494, bsz=16502.1, num_updates=38600, lr=0.000321911, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=38108 epoch 023: 1504 / 1689 loss=4.009, nll_loss=2.373, ppl=5.18, wps=459350, ups=1.06, wpb=433494, bsz=16502.1, num_updates=38600, lr=0.000321911, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=38108 epoch 023: 1504 / 1689 loss=4.009, nll_loss=2.373, ppl=5.18, wps=459350, ups=1.06, wpb=433494, bsz=16502.1, num_updates=38600, lr=0.000321911, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=38108 epoch 023: 1504 / 1689 loss=4.009, nll_loss=2.373, ppl=5.18, wps=459350, ups=1.06, wpb=433494, bsz=16502.1, num_updates=38600, lr=0.000321911, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=38108 epoch 023: 1504 / 1689 loss=4.009, nll_loss=2.373, ppl=5.18, wps=459350, ups=1.06, wpb=433494, bsz=16502.1, num_updates=38600, lr=0.000321911, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=38108 epoch 023: 1504 / 1689 loss=4.009, nll_loss=2.373, ppl=5.18, wps=459350, ups=1.06, wpb=433494, bsz=16502.1, num_updates=38600, lr=0.000321911, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=38108 epoch 023: 1504 / 1689 loss=4.009, nll_loss=2.373, ppl=5.18, wps=459350, ups=1.06, wpb=433494, bsz=16502.1, num_updates=38600, lr=0.000321911, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=38108 epoch 023: 1504 / 1689 loss=4.009, nll_loss=2.373, ppl=5.18, wps=459350, ups=1.06, wpb=433494, bsz=16502.1, num_updates=38600, lr=0.000321911, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=38108 epoch 023: 1504 / 1689 loss=4.009, nll_loss=2.373, ppl=5.18, wps=459350, ups=1.06, wpb=433494, bsz=16502.1, num_updates=38600, lr=0.000321911, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=38108 epoch 023: 1504 / 1689 loss=4.009, nll_loss=2.373, ppl=5.18, wps=459350, ups=1.06, wpb=433494, bsz=16502.1, num_updates=38600, lr=0.000321911, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=38108 epoch 023: 1504 / 1689 loss=4.009, nll_loss=2.373, ppl=5.18, wps=459350, ups=1.06, wpb=433494, bsz=16502.1, num_updates=38600, lr=0.000321911, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=38108 epoch 023: 1504 / 1689 loss=4.009, nll_loss=2.373, ppl=5.18, wps=459350, ups=1.06, wpb=433494, bsz=16502.1, num_updates=38600, lr=0.000321911, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=38108 epoch 023: 1604 / 1689 loss=4.009, nll_loss=2.373, ppl=5.18, wps=458501, ups=1.06, wpb=432691, bsz=16383.8, num_updates=38700, lr=0.000321495, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.4, wall=38203 epoch 023: 1604 / 1689 loss=4.009, nll_loss=2.373, ppl=5.18, wps=458501, ups=1.06, wpb=432691, bsz=16383.8, num_updates=38700, lr=0.000321495, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.4, wall=38203 epoch 023: 1604 / 1689 loss=4.009, nll_loss=2.373, ppl=5.18, wps=458501, ups=1.06, wpb=432691, bsz=16383.8, num_updates=38700, lr=0.000321495, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.4, wall=38203 epoch 023: 1604 / 1689 loss=4.009, nll_loss=2.373, ppl=5.18, wps=458501, ups=1.06, wpb=432691, bsz=16383.8, num_updates=38700, lr=0.000321495, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.4, wall=38203 epoch 023: 1604 / 1689 loss=4.009, nll_loss=2.373, ppl=5.18, wps=458501, ups=1.06, wpb=432691, bsz=16383.8, num_updates=38700, lr=0.000321495, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.4, wall=38203 epoch 023: 1604 / 1689 loss=4.009, nll_loss=2.373, ppl=5.18, wps=458501, ups=1.06, wpb=432691, bsz=16383.8, num_updates=38700, lr=0.000321495, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.4, wall=38203 epoch 023: 1604 / 1689 loss=4.009, nll_loss=2.373, ppl=5.18, wps=458501, ups=1.06, wpb=432691, bsz=16383.8, num_updates=38700, lr=0.000321495, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.4, wall=38203 epoch 023: 1604 / 1689 loss=4.009, nll_loss=2.373, ppl=5.18, wps=458501, ups=1.06, wpb=432691, bsz=16383.8, num_updates=38700, lr=0.000321495, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.4, wall=38203 epoch 023: 1604 / 1689 loss=4.009, nll_loss=2.373, ppl=5.18, wps=458501, ups=1.06, wpb=432691, bsz=16383.8, num_updates=38700, lr=0.000321495, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.4, wall=38203 epoch 023: 1604 / 1689 loss=4.009, nll_loss=2.373, ppl=5.18, wps=458501, ups=1.06, wpb=432691, bsz=16383.8, num_updates=38700, lr=0.000321495, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.4, wall=38203 epoch 023: 1604 / 1689 loss=4.009, nll_loss=2.373, ppl=5.18, wps=458501, ups=1.06, wpb=432691, bsz=16383.8, num_updates=38700, lr=0.000321495, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.4, wall=38203 epoch 023: 1604 / 1689 loss=4.009, nll_loss=2.373, ppl=5.18, wps=458501, ups=1.06, wpb=432691, bsz=16383.8, num_updates=38700, lr=0.000321495, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.4, wall=38203 epoch 023: 1604 / 1689 loss=4.009, nll_loss=2.373, ppl=5.18, wps=458501, ups=1.06, wpb=432691, bsz=16383.8, num_updates=38700, lr=0.000321495, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.4, wall=38203 epoch 023: 1604 / 1689 loss=4.009, nll_loss=2.373, ppl=5.18, wps=458501, ups=1.06, wpb=432691, bsz=16383.8, num_updates=38700, lr=0.000321495, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.4, wall=38203 epoch 023: 1604 / 1689 loss=4.009, nll_loss=2.373, ppl=5.18, wps=458501, ups=1.06, wpb=432691, bsz=16383.8, num_updates=38700, lr=0.000321495, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.4, wall=38203 epoch 023: 1604 / 1689 loss=4.009, nll_loss=2.373, ppl=5.18, wps=458501, ups=1.06, wpb=432691, bsz=16383.8, num_updates=38700, lr=0.000321495, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.4, wall=38203 epoch 023: 1604 / 1689 loss=4.009, nll_loss=2.373, ppl=5.18, wps=458501, ups=1.06, wpb=432691, bsz=16383.8, num_updates=38700, lr=0.000321495, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.4, wall=38203 epoch 023: 1604 / 1689 loss=4.009, nll_loss=2.373, ppl=5.18, wps=458501, ups=1.06, wpb=432691, bsz=16383.8, num_updates=38700, lr=0.000321495, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.4, wall=38203 epoch 023: 1604 / 1689 loss=4.009, nll_loss=2.373, ppl=5.18, wps=458501, ups=1.06, wpb=432691, bsz=16383.8, num_updates=38700, lr=0.000321495, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.4, wall=38203 epoch 023: 1604 / 1689 loss=4.009, nll_loss=2.373, ppl=5.18, wps=458501, ups=1.06, wpb=432691, bsz=16383.8, num_updates=38700, lr=0.000321495, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.4, wall=38203 epoch 023: 1604 / 1689 loss=4.009, nll_loss=2.373, ppl=5.18, wps=458501, ups=1.06, wpb=432691, bsz=16383.8, num_updates=38700, lr=0.000321495, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.4, wall=38203 epoch 023: 1604 / 1689 loss=4.009, nll_loss=2.373, ppl=5.18, wps=458501, ups=1.06, wpb=432691, bsz=16383.8, num_updates=38700, lr=0.000321495, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.4, wall=38203 epoch 023: 1604 / 1689 loss=4.009, nll_loss=2.373, ppl=5.18, wps=458501, ups=1.06, wpb=432691, bsz=16383.8, num_updates=38700, lr=0.000321495, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.4, wall=38203 end of epoch 23 (average epoch stats below) epoch 023 | loss 4.001 | nll_loss 2.364 | ppl 5.15 | wps 456713 | ups 1.05 | wpb 433515 | bsz 16506.6 | num_updates 38785 | lr 0.000321143 | gnorm 0.212 | clip 0 | loss_scale 1 | train_wall 1563 | gb_free 22.2 | wall 38282 epoch 023 | loss 4.001 | nll_loss 2.364 | ppl 5.15 | wps 456713 | ups 1.05 | wpb 433515 | bsz 16506.6 | num_updates 38785 | lr 0.000321143 | gnorm 0.212 | clip 0 | loss_scale 1 | train_wall 1563 | gb_free 22.2 | wall 38282 epoch 023 | loss 4.001 | nll_loss 2.364 | ppl 5.15 | wps 456713 | ups 1.05 | wpb 433515 | bsz 16506.6 | num_updates 38785 | lr 0.000321143 | gnorm 0.212 | clip 0 | loss_scale 1 | train_wall 1563 | gb_free 22.2 | wall 38282 epoch 023 | loss 4.001 | nll_loss 2.364 | ppl 5.15 | wps 456713 | ups 1.05 | wpb 433515 | bsz 16506.6 | num_updates 38785 | lr 0.000321143 | gnorm 0.212 | clip 0 | loss_scale 1 | train_wall 1563 | gb_free 22.2 | wall 38282 epoch 023 | loss 4.001 | nll_loss 2.364 | ppl 5.15 | wps 456713 | ups 1.05 | wpb 433515 | bsz 16506.6 | num_updates 38785 | lr 0.000321143 | gnorm 0.212 | clip 0 | loss_scale 1 | train_wall 1563 | gb_free 22.2 | wall 38282 epoch 023 | loss 4.001 | nll_loss 2.364 | ppl 5.15 | wps 456713 | ups 1.05 | wpb 433515 | bsz 16506.6 | num_updates 38785 | lr 0.000321143 | gnorm 0.212 | clip 0 | loss_scale 1 | train_wall 1563 | gb_free 22.2 | wall 38282 epoch 023 | loss 4.001 | nll_loss 2.364 | ppl 5.15 | wps 456713 | ups 1.05 | wpb 433515 | bsz 16506.6 | num_updates 38785 | lr 0.000321143 | gnorm 0.212 | clip 0 | loss_scale 1 | train_wall 1563 | gb_free 22.2 | wall 38282 epoch 023 | loss 4.001 | nll_loss 2.364 | ppl 5.15 | wps 456713 | ups 1.05 | wpb 433515 | bsz 16506.6 | num_updates 38785 | lr 0.000321143 | gnorm 0.212 | clip 0 | loss_scale 1 | train_wall 1563 | gb_free 22.2 | wall 38282 epoch 023 | loss 4.001 | nll_loss 2.364 | ppl 5.15 | wps 456713 | ups 1.05 | wpb 433515 | bsz 16506.6 | num_updates 38785 | lr 0.000321143 | gnorm 0.212 | clip 0 | loss_scale 1 | train_wall 1563 | gb_free 22.2 | wall 38282 epoch 023 | loss 4.001 | nll_loss 2.364 | ppl 5.15 | wps 456713 | ups 1.05 | wpb 433515 | bsz 16506.6 | num_updates 38785 | lr 0.000321143 | gnorm 0.212 | clip 0 | loss_scale 1 | train_wall 1563 | gb_free 22.2 | wall 38282 epoch 023 | loss 4.001 | nll_loss 2.364 | ppl 5.15 | wps 456713 | ups 1.05 | wpb 433515 | bsz 16506.6 | num_updates 38785 | lr 0.000321143 | gnorm 0.212 | clip 0 | loss_scale 1 | train_wall 1563 | gb_free 22.2 | wall 38282 epoch 023 | loss 4.001 | nll_loss 2.364 | ppl 5.15 | wps 456713 | ups 1.05 | wpb 433515 | bsz 16506.6 | num_updates 38785 | lr 0.000321143 | gnorm 0.212 | clip 0 | loss_scale 1 | train_wall 1563 | gb_free 22.2 | wall 38282 epoch 023 | loss 4.001 | nll_loss 2.364 | ppl 5.15 | wps 456713 | ups 1.05 | wpb 433515 | bsz 16506.6 | num_updates 38785 | lr 0.000321143 | gnorm 0.212 | clip 0 | loss_scale 1 | train_wall 1563 | gb_free 22.2 | wall 38282 epoch 023 | loss 4.001 | nll_loss 2.364 | ppl 5.15 | wps 456713 | ups 1.05 | wpb 433515 | bsz 16506.6 | num_updates 38785 | lr 0.000321143 | gnorm 0.212 | clip 0 | loss_scale 1 | train_wall 1563 | gb_free 22.2 | wall 38282 epoch 023 | loss 4.001 | nll_loss 2.364 | ppl 5.15 | wps 456713 | ups 1.05 | wpb 433515 | bsz 16506.6 | num_updates 38785 | lr 0.000321143 | gnorm 0.212 | clip 0 | loss_scale 1 | train_wall 1563 | gb_free 22.2 | wall 38282 epoch 023 | loss 4.001 | nll_loss 2.364 | ppl 5.15 | wps 456713 | ups 1.05 | wpb 433515 | bsz 16506.6 | num_updates 38785 | lr 0.000321143 | gnorm 0.212 | clip 0 | loss_scale 1 | train_wall 1563 | gb_free 22.2 | wall 38282 epoch 023 | loss 4.001 | nll_loss 2.364 | ppl 5.15 | wps 456713 | ups 1.05 | wpb 433515 | bsz 16506.6 | num_updates 38785 | lr 0.000321143 | gnorm 0.212 | clip 0 | loss_scale 1 | train_wall 1563 | gb_free 22.2 | wall 38282 epoch 023 | loss 4.001 | nll_loss 2.364 | ppl 5.15 | wps 456713 | ups 1.05 | wpb 433515 | bsz 16506.6 | num_updates 38785 | lr 0.000321143 | gnorm 0.212 | clip 0 | loss_scale 1 | train_wall 1563 | gb_free 22.2 | wall 38282 epoch 023 | loss 4.001 | nll_loss 2.364 | ppl 5.15 | wps 456713 | ups 1.05 | wpb 433515 | bsz 16506.6 | num_updates 38785 | lr 0.000321143 | gnorm 0.212 | clip 0 | loss_scale 1 | train_wall 1563 | gb_free 22.2 | wall 38282 epoch 023 | loss 4.001 | nll_loss 2.364 | ppl 5.15 | wps 456713 | ups 1.05 | wpb 433515 | bsz 16506.6 | num_updates 38785 | lr 0.000321143 | gnorm 0.212 | clip 0 | loss_scale 1 | train_wall 1563 | gb_free 22.2 | wall 38282 epoch 023 | loss 4.001 | nll_loss 2.364 | ppl 5.15 | wps 456713 | ups 1.05 | wpb 433515 | bsz 16506.6 | num_updates 38785 | lr 0.000321143 | gnorm 0.212 | clip 0 | loss_scale 1 | train_wall 1563 | gb_free 22.2 | wall 38282 epoch 023 | loss 4.001 | nll_loss 2.364 | ppl 5.15 | wps 456713 | ups 1.05 | wpb 433515 | bsz 16506.6 | num_updates 38785 | lr 0.000321143 | gnorm 0.212 | clip 0 | loss_scale 1 | train_wall 1563 | gb_free 22.2 | wall 38282 epoch 023 | loss 4.001 | nll_loss 2.364 | ppl 5.15 | wps 456713 | ups 1.05 | wpb 433515 | bsz 16506.6 | num_updates 38785 | lr 0.000321143 | gnorm 0.212 | clip 0 | loss_scale 1 | train_wall 1563 | gb_free 22.2 | wall 38282 Start iterating over samples epoch 024: 16 / 1689 loss=4.011, nll_loss=2.375, ppl=5.19, wps=451000, ups=1.05, wpb=430209, bsz=16100.6, num_updates=38800, lr=0.000321081, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=20, wall=38298 epoch 024: 16 / 1689 loss=4.011, nll_loss=2.375, ppl=5.19, wps=451000, ups=1.05, wpb=430209, bsz=16100.6, num_updates=38800, lr=0.000321081, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=20, wall=38298 epoch 024: 16 / 1689 loss=4.011, nll_loss=2.375, ppl=5.19, wps=451000, ups=1.05, wpb=430209, bsz=16100.6, num_updates=38800, lr=0.000321081, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=20, wall=38298 epoch 024: 16 / 1689 loss=4.011, nll_loss=2.375, ppl=5.19, wps=451000, ups=1.05, wpb=430209, bsz=16100.6, num_updates=38800, lr=0.000321081, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=20, wall=38298 epoch 024: 16 / 1689 loss=4.011, nll_loss=2.375, ppl=5.19, wps=451000, ups=1.05, wpb=430209, bsz=16100.6, num_updates=38800, lr=0.000321081, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=20, wall=38298 epoch 024: 16 / 1689 loss=4.011, nll_loss=2.375, ppl=5.19, wps=451000, ups=1.05, wpb=430209, bsz=16100.6, num_updates=38800, lr=0.000321081, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=20, wall=38298 epoch 024: 16 / 1689 loss=4.011, nll_loss=2.375, ppl=5.19, wps=451000, ups=1.05, wpb=430209, bsz=16100.6, num_updates=38800, lr=0.000321081, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=20, wall=38298 epoch 024: 16 / 1689 loss=4.011, nll_loss=2.375, ppl=5.19, wps=451000, ups=1.05, wpb=430209, bsz=16100.6, num_updates=38800, lr=0.000321081, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=20, wall=38298 epoch 024: 16 / 1689 loss=4.011, nll_loss=2.375, ppl=5.19, wps=451000, ups=1.05, wpb=430209, bsz=16100.6, num_updates=38800, lr=0.000321081, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=20, wall=38298 epoch 024: 16 / 1689 loss=4.011, nll_loss=2.375, ppl=5.19, wps=451000, ups=1.05, wpb=430209, bsz=16100.6, num_updates=38800, lr=0.000321081, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=20, wall=38298 epoch 024: 16 / 1689 loss=4.011, nll_loss=2.375, ppl=5.19, wps=451000, ups=1.05, wpb=430209, bsz=16100.6, num_updates=38800, lr=0.000321081, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=20, wall=38298 epoch 024: 16 / 1689 loss=4.011, nll_loss=2.375, ppl=5.19, wps=451000, ups=1.05, wpb=430209, bsz=16100.6, num_updates=38800, lr=0.000321081, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=20, wall=38298 epoch 024: 16 / 1689 loss=4.011, nll_loss=2.375, ppl=5.19, wps=451000, ups=1.05, wpb=430209, bsz=16100.6, num_updates=38800, lr=0.000321081, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=20, wall=38298 epoch 024: 16 / 1689 loss=4.011, nll_loss=2.375, ppl=5.19, wps=451000, ups=1.05, wpb=430209, bsz=16100.6, num_updates=38800, lr=0.000321081, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=20, wall=38298 epoch 024: 16 / 1689 loss=4.011, nll_loss=2.375, ppl=5.19, wps=451000, ups=1.05, wpb=430209, bsz=16100.6, num_updates=38800, lr=0.000321081, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=20, wall=38298 epoch 024: 16 / 1689 loss=4.011, nll_loss=2.375, ppl=5.19, wps=451000, ups=1.05, wpb=430209, bsz=16100.6, num_updates=38800, lr=0.000321081, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=20, wall=38298 epoch 024: 16 / 1689 loss=4.011, nll_loss=2.375, ppl=5.19, wps=451000, ups=1.05, wpb=430209, bsz=16100.6, num_updates=38800, lr=0.000321081, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=20, wall=38298 epoch 024: 16 / 1689 loss=4.011, nll_loss=2.375, ppl=5.19, wps=451000, ups=1.05, wpb=430209, bsz=16100.6, num_updates=38800, lr=0.000321081, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=20, wall=38298 epoch 024: 16 / 1689 loss=4.011, nll_loss=2.375, ppl=5.19, wps=451000, ups=1.05, wpb=430209, bsz=16100.6, num_updates=38800, lr=0.000321081, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=20, wall=38298 epoch 024: 16 / 1689 loss=4.011, nll_loss=2.375, ppl=5.19, wps=451000, ups=1.05, wpb=430209, bsz=16100.6, num_updates=38800, lr=0.000321081, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=20, wall=38298 epoch 024: 16 / 1689 loss=4.011, nll_loss=2.375, ppl=5.19, wps=451000, ups=1.05, wpb=430209, bsz=16100.6, num_updates=38800, lr=0.000321081, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=20, wall=38298 epoch 024: 16 / 1689 loss=4.011, nll_loss=2.375, ppl=5.19, wps=451000, ups=1.05, wpb=430209, bsz=16100.6, num_updates=38800, lr=0.000321081, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=20, wall=38298 epoch 024: 16 / 1689 loss=4.011, nll_loss=2.375, ppl=5.19, wps=451000, ups=1.05, wpb=430209, bsz=16100.6, num_updates=38800, lr=0.000321081, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=20, wall=38298 epoch 024: 16 / 1689 loss=4.011, nll_loss=2.375, ppl=5.19, wps=451000, ups=1.05, wpb=430209, bsz=16100.6, num_updates=38800, lr=0.000321081, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=20, wall=38298 epoch 024: 116 / 1689 loss=3.977, nll_loss=2.336, ppl=5.05, wps=456872, ups=1.05, wpb=433578, bsz=16765.2, num_updates=38900, lr=0.000320668, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=38393 epoch 024: 116 / 1689 loss=3.977, nll_loss=2.336, ppl=5.05, wps=456872, ups=1.05, wpb=433578, bsz=16765.2, num_updates=38900, lr=0.000320668, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=38393 epoch 024: 116 / 1689 loss=3.977, nll_loss=2.336, ppl=5.05, wps=456872, ups=1.05, wpb=433578, bsz=16765.2, num_updates=38900, lr=0.000320668, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=38393 epoch 024: 116 / 1689 loss=3.977, nll_loss=2.336, ppl=5.05, wps=456872, ups=1.05, wpb=433578, bsz=16765.2, num_updates=38900, lr=0.000320668, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=38393 epoch 024: 116 / 1689 loss=3.977, nll_loss=2.336, ppl=5.05, wps=456872, ups=1.05, wpb=433578, bsz=16765.2, num_updates=38900, lr=0.000320668, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=38393 epoch 024: 116 / 1689 loss=3.977, nll_loss=2.336, ppl=5.05, wps=456872, ups=1.05, wpb=433578, bsz=16765.2, num_updates=38900, lr=0.000320668, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=38393 epoch 024: 116 / 1689 loss=3.977, nll_loss=2.336, ppl=5.05, wps=456872, ups=1.05, wpb=433578, bsz=16765.2, num_updates=38900, lr=0.000320668, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=38393 epoch 024: 116 / 1689 loss=3.977, nll_loss=2.336, ppl=5.05, wps=456872, ups=1.05, wpb=433578, bsz=16765.2, num_updates=38900, lr=0.000320668, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=38393 epoch 024: 116 / 1689 loss=3.977, nll_loss=2.336, ppl=5.05, wps=456872, ups=1.05, wpb=433578, bsz=16765.2, num_updates=38900, lr=0.000320668, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=38393 epoch 024: 116 / 1689 loss=3.977, nll_loss=2.336, ppl=5.05, wps=456872, ups=1.05, wpb=433578, bsz=16765.2, num_updates=38900, lr=0.000320668, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=38393 epoch 024: 116 / 1689 loss=3.977, nll_loss=2.336, ppl=5.05, wps=456872, ups=1.05, wpb=433578, bsz=16765.2, num_updates=38900, lr=0.000320668, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=38393 epoch 024: 116 / 1689 loss=3.977, nll_loss=2.336, ppl=5.05, wps=456872, ups=1.05, wpb=433578, bsz=16765.2, num_updates=38900, lr=0.000320668, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=38393 epoch 024: 116 / 1689 loss=3.977, nll_loss=2.336, ppl=5.05, wps=456872, ups=1.05, wpb=433578, bsz=16765.2, num_updates=38900, lr=0.000320668, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=38393 epoch 024: 116 / 1689 loss=3.977, nll_loss=2.336, ppl=5.05, wps=456872, ups=1.05, wpb=433578, bsz=16765.2, num_updates=38900, lr=0.000320668, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=38393 epoch 024: 116 / 1689 loss=3.977, nll_loss=2.336, ppl=5.05, wps=456872, ups=1.05, wpb=433578, bsz=16765.2, num_updates=38900, lr=0.000320668, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=38393 epoch 024: 116 / 1689 loss=3.977, nll_loss=2.336, ppl=5.05, wps=456872, ups=1.05, wpb=433578, bsz=16765.2, num_updates=38900, lr=0.000320668, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=38393 epoch 024: 116 / 1689 loss=3.977, nll_loss=2.336, ppl=5.05, wps=456872, ups=1.05, wpb=433578, bsz=16765.2, num_updates=38900, lr=0.000320668, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=38393 epoch 024: 116 / 1689 loss=3.977, nll_loss=2.336, ppl=5.05, wps=456872, ups=1.05, wpb=433578, bsz=16765.2, num_updates=38900, lr=0.000320668, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=38393 epoch 024: 116 / 1689 loss=3.977, nll_loss=2.336, ppl=5.05, wps=456872, ups=1.05, wpb=433578, bsz=16765.2, num_updates=38900, lr=0.000320668, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=38393 epoch 024: 116 / 1689 loss=3.977, nll_loss=2.336, ppl=5.05, wps=456872, ups=1.05, wpb=433578, bsz=16765.2, num_updates=38900, lr=0.000320668, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=38393 epoch 024: 116 / 1689 loss=3.977, nll_loss=2.336, ppl=5.05, wps=456872, ups=1.05, wpb=433578, bsz=16765.2, num_updates=38900, lr=0.000320668, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=38393 epoch 024: 116 / 1689 loss=3.977, nll_loss=2.336, ppl=5.05, wps=456872, ups=1.05, wpb=433578, bsz=16765.2, num_updates=38900, lr=0.000320668, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=38393 epoch 024: 116 / 1689 loss=3.977, nll_loss=2.336, ppl=5.05, wps=456872, ups=1.05, wpb=433578, bsz=16765.2, num_updates=38900, lr=0.000320668, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=38393 epoch 024: 116 / 1689 loss=3.977, nll_loss=2.336, ppl=5.05, wps=456872, ups=1.05, wpb=433578, bsz=16765.2, num_updates=38900, lr=0.000320668, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=38393 epoch 024: 216 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=455619, ups=1.05, wpb=431905, bsz=16745.8, num_updates=39000, lr=0.000320256, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=38488 epoch 024: 216 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=455619, ups=1.05, wpb=431905, bsz=16745.8, num_updates=39000, lr=0.000320256, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=38488 epoch 024: 216 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=455619, ups=1.05, wpb=431905, bsz=16745.8, num_updates=39000, lr=0.000320256, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=38488 epoch 024: 216 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=455619, ups=1.05, wpb=431905, bsz=16745.8, num_updates=39000, lr=0.000320256, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=38488 epoch 024: 216 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=455619, ups=1.05, wpb=431905, bsz=16745.8, num_updates=39000, lr=0.000320256, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=38488 epoch 024: 216 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=455619, ups=1.05, wpb=431905, bsz=16745.8, num_updates=39000, lr=0.000320256, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=38488 epoch 024: 216 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=455619, ups=1.05, wpb=431905, bsz=16745.8, num_updates=39000, lr=0.000320256, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=38488 epoch 024: 216 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=455619, ups=1.05, wpb=431905, bsz=16745.8, num_updates=39000, lr=0.000320256, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=38488 epoch 024: 216 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=455619, ups=1.05, wpb=431905, bsz=16745.8, num_updates=39000, lr=0.000320256, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=38488 epoch 024: 216 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=455619, ups=1.05, wpb=431905, bsz=16745.8, num_updates=39000, lr=0.000320256, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=38488 epoch 024: 216 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=455619, ups=1.05, wpb=431905, bsz=16745.8, num_updates=39000, lr=0.000320256, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=38488 epoch 024: 216 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=455619, ups=1.05, wpb=431905, bsz=16745.8, num_updates=39000, lr=0.000320256, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=38488 epoch 024: 216 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=455619, ups=1.05, wpb=431905, bsz=16745.8, num_updates=39000, lr=0.000320256, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=38488 epoch 024: 216 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=455619, ups=1.05, wpb=431905, bsz=16745.8, num_updates=39000, lr=0.000320256, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=38488 epoch 024: 216 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=455619, ups=1.05, wpb=431905, bsz=16745.8, num_updates=39000, lr=0.000320256, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=38488 epoch 024: 216 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=455619, ups=1.05, wpb=431905, bsz=16745.8, num_updates=39000, lr=0.000320256, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=38488 epoch 024: 216 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=455619, ups=1.05, wpb=431905, bsz=16745.8, num_updates=39000, lr=0.000320256, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=38488 epoch 024: 216 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=455619, ups=1.05, wpb=431905, bsz=16745.8, num_updates=39000, lr=0.000320256, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=38488 epoch 024: 216 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=455619, ups=1.05, wpb=431905, bsz=16745.8, num_updates=39000, lr=0.000320256, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=38488 epoch 024: 216 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=455619, ups=1.05, wpb=431905, bsz=16745.8, num_updates=39000, lr=0.000320256, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=38488 epoch 024: 216 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=455619, ups=1.05, wpb=431905, bsz=16745.8, num_updates=39000, lr=0.000320256, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=38488 epoch 024: 216 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=455619, ups=1.05, wpb=431905, bsz=16745.8, num_updates=39000, lr=0.000320256, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=38488 epoch 024: 216 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=455619, ups=1.05, wpb=431905, bsz=16745.8, num_updates=39000, lr=0.000320256, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=38488 epoch 024: 216 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=455619, ups=1.05, wpb=431905, bsz=16745.8, num_updates=39000, lr=0.000320256, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=38488 begin validation on "valid" subset epoch 024 | valid on 'valid' subset | loss 4.252 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.249 epoch 024 | valid on 'valid' subset | loss 4.252 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.249 epoch 024 | valid on 'valid' subset | loss 4.252 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.249 epoch 024 | valid on 'valid' subset | loss 4.252 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.249 epoch 024 | valid on 'valid' subset | loss 4.252 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.249 epoch 024 | valid on 'valid' subset | loss 4.252 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.249 epoch 024 | valid on 'valid' subset | loss 4.252 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.249 epoch 024 | valid on 'valid' subset | loss 4.252 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.249 epoch 024 | valid on 'valid' subset | loss 4.252 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.249 epoch 024 | valid on 'valid' subset | loss 4.252 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.249 epoch 024 | valid on 'valid' subset | loss 4.252 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.249 epoch 024 | valid on 'valid' subset | loss 4.252 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.249 epoch 024 | valid on 'valid' subset | loss 4.252 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.249 epoch 024 | valid on 'valid' subset | loss 4.252 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.249 epoch 024 | valid on 'valid' subset | loss 4.252 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.249 epoch 024 | valid on 'valid' subset | loss 4.252 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.249 epoch 024 | valid on 'valid' subset | loss 4.252 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.249 epoch 024 | valid on 'valid' subset | loss 4.252 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.249 epoch 024 | valid on 'valid' subset | loss 4.252 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.249 epoch 024 | valid on 'valid' subset | loss 4.252 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.249 epoch 024 | valid on 'valid' subset | loss 4.252 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.249 epoch 024 | valid on 'valid' subset | loss 4.252 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.249 epoch 024 | valid on 'valid' subset | loss 4.252 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.249 epoch 024 | valid on 'valid' subset | loss 4.252 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 39000 | best_loss 4.249 epoch 024: 316 / 1689 loss=3.983, nll_loss=2.342, ppl=5.07, wps=310348, ups=0.72, wpb=433589, bsz=16203.8, num_updates=39100, lr=0.000319847, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=118, gb_free=19.1, wall=38628 epoch 024: 316 / 1689 loss=3.983, nll_loss=2.342, ppl=5.07, wps=310348, ups=0.72, wpb=433589, bsz=16203.8, num_updates=39100, lr=0.000319847, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=118, gb_free=19.1, wall=38628 epoch 024: 316 / 1689 loss=3.983, nll_loss=2.342, ppl=5.07, wps=310348, ups=0.72, wpb=433589, bsz=16203.8, num_updates=39100, lr=0.000319847, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=118, gb_free=19.1, wall=38628 epoch 024: 316 / 1689 loss=3.983, nll_loss=2.342, ppl=5.07, wps=310348, ups=0.72, wpb=433589, bsz=16203.8, num_updates=39100, lr=0.000319847, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=118, gb_free=19.1, wall=38628 epoch 024: 316 / 1689 loss=3.983, nll_loss=2.342, ppl=5.07, wps=310348, ups=0.72, wpb=433589, bsz=16203.8, num_updates=39100, lr=0.000319847, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=118, gb_free=19.1, wall=38628 epoch 024: 316 / 1689 loss=3.983, nll_loss=2.342, ppl=5.07, wps=310348, ups=0.72, wpb=433589, bsz=16203.8, num_updates=39100, lr=0.000319847, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=118, gb_free=19.1, wall=38628 epoch 024: 316 / 1689 loss=3.983, nll_loss=2.342, ppl=5.07, wps=310348, ups=0.72, wpb=433589, bsz=16203.8, num_updates=39100, lr=0.000319847, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=118, gb_free=19.1, wall=38628 epoch 024: 316 / 1689 loss=3.983, nll_loss=2.342, ppl=5.07, wps=310348, ups=0.72, wpb=433589, bsz=16203.8, num_updates=39100, lr=0.000319847, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=118, gb_free=19.1, wall=38628 epoch 024: 316 / 1689 loss=3.983, nll_loss=2.342, ppl=5.07, wps=310348, ups=0.72, wpb=433589, bsz=16203.8, num_updates=39100, lr=0.000319847, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=118, gb_free=19.1, wall=38628 epoch 024: 316 / 1689 loss=3.983, nll_loss=2.342, ppl=5.07, wps=310348, ups=0.72, wpb=433589, bsz=16203.8, num_updates=39100, lr=0.000319847, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=118, gb_free=19.1, wall=38628 epoch 024: 316 / 1689 loss=3.983, nll_loss=2.342, ppl=5.07, wps=310348, ups=0.72, wpb=433589, bsz=16203.8, num_updates=39100, lr=0.000319847, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=118, gb_free=19.1, wall=38628 epoch 024: 316 / 1689 loss=3.983, nll_loss=2.342, ppl=5.07, wps=310348, ups=0.72, wpb=433589, bsz=16203.8, num_updates=39100, lr=0.000319847, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=118, gb_free=19.1, wall=38628 epoch 024: 316 / 1689 loss=3.983, nll_loss=2.342, ppl=5.07, wps=310348, ups=0.72, wpb=433589, bsz=16203.8, num_updates=39100, lr=0.000319847, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=118, gb_free=19.1, wall=38628 epoch 024: 316 / 1689 loss=3.983, nll_loss=2.342, ppl=5.07, wps=310348, ups=0.72, wpb=433589, bsz=16203.8, num_updates=39100, lr=0.000319847, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=118, gb_free=19.1, wall=38628 epoch 024: 316 / 1689 loss=3.983, nll_loss=2.342, ppl=5.07, wps=310348, ups=0.72, wpb=433589, bsz=16203.8, num_updates=39100, lr=0.000319847, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=118, gb_free=19.1, wall=38628 epoch 024: 316 / 1689 loss=3.983, nll_loss=2.342, ppl=5.07, wps=310348, ups=0.72, wpb=433589, bsz=16203.8, num_updates=39100, lr=0.000319847, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=118, gb_free=19.1, wall=38628 epoch 024: 316 / 1689 loss=3.983, nll_loss=2.342, ppl=5.07, wps=310348, ups=0.72, wpb=433589, bsz=16203.8, num_updates=39100, lr=0.000319847, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=118, gb_free=19.1, wall=38628 epoch 024: 316 / 1689 loss=3.983, nll_loss=2.342, ppl=5.07, wps=310348, ups=0.72, wpb=433589, bsz=16203.8, num_updates=39100, lr=0.000319847, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=118, gb_free=19.1, wall=38628 epoch 024: 316 / 1689 loss=3.983, nll_loss=2.342, ppl=5.07, wps=310348, ups=0.72, wpb=433589, bsz=16203.8, num_updates=39100, lr=0.000319847, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=118, gb_free=19.1, wall=38628 epoch 024: 316 / 1689 loss=3.983, nll_loss=2.342, ppl=5.07, wps=310348, ups=0.72, wpb=433589, bsz=16203.8, num_updates=39100, lr=0.000319847, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=118, gb_free=19.1, wall=38628 epoch 024: 316 / 1689 loss=3.983, nll_loss=2.342, ppl=5.07, wps=310348, ups=0.72, wpb=433589, bsz=16203.8, num_updates=39100, lr=0.000319847, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=118, gb_free=19.1, wall=38628 epoch 024: 316 / 1689 loss=3.983, nll_loss=2.342, ppl=5.07, wps=310348, ups=0.72, wpb=433589, bsz=16203.8, num_updates=39100, lr=0.000319847, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=118, gb_free=19.1, wall=38628 epoch 024: 316 / 1689 loss=3.983, nll_loss=2.342, ppl=5.07, wps=310348, ups=0.72, wpb=433589, bsz=16203.8, num_updates=39100, lr=0.000319847, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=118, gb_free=19.1, wall=38628 epoch 024: 316 / 1689 loss=3.983, nll_loss=2.342, ppl=5.07, wps=310348, ups=0.72, wpb=433589, bsz=16203.8, num_updates=39100, lr=0.000319847, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=118, gb_free=19.1, wall=38628 epoch 024: 416 / 1689 loss=3.989, nll_loss=2.349, ppl=5.1, wps=459746, ups=1.06, wpb=435590, bsz=16486.4, num_updates=39200, lr=0.000319438, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.7, wall=38722 epoch 024: 416 / 1689 loss=3.989, nll_loss=2.349, ppl=5.1, wps=459746, ups=1.06, wpb=435590, bsz=16486.4, num_updates=39200, lr=0.000319438, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.7, wall=38722 epoch 024: 416 / 1689 loss=3.989, nll_loss=2.349, ppl=5.1, wps=459746, ups=1.06, wpb=435590, bsz=16486.4, num_updates=39200, lr=0.000319438, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.7, wall=38722 epoch 024: 416 / 1689 loss=3.989, nll_loss=2.349, ppl=5.1, wps=459746, ups=1.06, wpb=435590, bsz=16486.4, num_updates=39200, lr=0.000319438, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.7, wall=38722 epoch 024: 416 / 1689 loss=3.989, nll_loss=2.349, ppl=5.1, wps=459746, ups=1.06, wpb=435590, bsz=16486.4, num_updates=39200, lr=0.000319438, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.7, wall=38722 epoch 024: 416 / 1689 loss=3.989, nll_loss=2.349, ppl=5.1, wps=459746, ups=1.06, wpb=435590, bsz=16486.4, num_updates=39200, lr=0.000319438, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.7, wall=38722 epoch 024: 416 / 1689 loss=3.989, nll_loss=2.349, ppl=5.1, wps=459746, ups=1.06, wpb=435590, bsz=16486.4, num_updates=39200, lr=0.000319438, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.7, wall=38722 epoch 024: 416 / 1689 loss=3.989, nll_loss=2.349, ppl=5.1, wps=459746, ups=1.06, wpb=435590, bsz=16486.4, num_updates=39200, lr=0.000319438, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.7, wall=38722 epoch 024: 416 / 1689 loss=3.989, nll_loss=2.349, ppl=5.1, wps=459746, ups=1.06, wpb=435590, bsz=16486.4, num_updates=39200, lr=0.000319438, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.7, wall=38722 epoch 024: 416 / 1689 loss=3.989, nll_loss=2.349, ppl=5.1, wps=459746, ups=1.06, wpb=435590, bsz=16486.4, num_updates=39200, lr=0.000319438, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.7, wall=38722 epoch 024: 416 / 1689 loss=3.989, nll_loss=2.349, ppl=5.1, wps=459746, ups=1.06, wpb=435590, bsz=16486.4, num_updates=39200, lr=0.000319438, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.7, wall=38722 epoch 024: 416 / 1689 loss=3.989, nll_loss=2.349, ppl=5.1, wps=459746, ups=1.06, wpb=435590, bsz=16486.4, num_updates=39200, lr=0.000319438, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.7, wall=38722 epoch 024: 416 / 1689 loss=3.989, nll_loss=2.349, ppl=5.1, wps=459746, ups=1.06, wpb=435590, bsz=16486.4, num_updates=39200, lr=0.000319438, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.7, wall=38722 epoch 024: 416 / 1689 loss=3.989, nll_loss=2.349, ppl=5.1, wps=459746, ups=1.06, wpb=435590, bsz=16486.4, num_updates=39200, lr=0.000319438, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.7, wall=38722 epoch 024: 416 / 1689 loss=3.989, nll_loss=2.349, ppl=5.1, wps=459746, ups=1.06, wpb=435590, bsz=16486.4, num_updates=39200, lr=0.000319438, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.7, wall=38722 epoch 024: 416 / 1689 loss=3.989, nll_loss=2.349, ppl=5.1, wps=459746, ups=1.06, wpb=435590, bsz=16486.4, num_updates=39200, lr=0.000319438, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.7, wall=38722 epoch 024: 416 / 1689 loss=3.989, nll_loss=2.349, ppl=5.1, wps=459746, ups=1.06, wpb=435590, bsz=16486.4, num_updates=39200, lr=0.000319438, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.7, wall=38722 epoch 024: 416 / 1689 loss=3.989, nll_loss=2.349, ppl=5.1, wps=459746, ups=1.06, wpb=435590, bsz=16486.4, num_updates=39200, lr=0.000319438, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.7, wall=38722 epoch 024: 416 / 1689 loss=3.989, nll_loss=2.349, ppl=5.1, wps=459746, ups=1.06, wpb=435590, bsz=16486.4, num_updates=39200, lr=0.000319438, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.7, wall=38722 epoch 024: 416 / 1689 loss=3.989, nll_loss=2.349, ppl=5.1, wps=459746, ups=1.06, wpb=435590, bsz=16486.4, num_updates=39200, lr=0.000319438, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.7, wall=38722 epoch 024: 416 / 1689 loss=3.989, nll_loss=2.349, ppl=5.1, wps=459746, ups=1.06, wpb=435590, bsz=16486.4, num_updates=39200, lr=0.000319438, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.7, wall=38722 epoch 024: 416 / 1689 loss=3.989, nll_loss=2.349, ppl=5.1, wps=459746, ups=1.06, wpb=435590, bsz=16486.4, num_updates=39200, lr=0.000319438, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.7, wall=38722 epoch 024: 416 / 1689 loss=3.989, nll_loss=2.349, ppl=5.1, wps=459746, ups=1.06, wpb=435590, bsz=16486.4, num_updates=39200, lr=0.000319438, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.7, wall=38722 epoch 024: 416 / 1689 loss=3.989, nll_loss=2.349, ppl=5.1, wps=459746, ups=1.06, wpb=435590, bsz=16486.4, num_updates=39200, lr=0.000319438, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.7, wall=38722 epoch 024: 516 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=462932, ups=1.07, wpb=433726, bsz=16666.2, num_updates=39300, lr=0.000319032, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=38816 epoch 024: 516 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=462932, ups=1.07, wpb=433726, bsz=16666.2, num_updates=39300, lr=0.000319032, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=38816 epoch 024: 516 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=462932, ups=1.07, wpb=433726, bsz=16666.2, num_updates=39300, lr=0.000319032, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=38816 epoch 024: 516 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=462932, ups=1.07, wpb=433726, bsz=16666.2, num_updates=39300, lr=0.000319032, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=38816 epoch 024: 516 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=462932, ups=1.07, wpb=433726, bsz=16666.2, num_updates=39300, lr=0.000319032, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=38816 epoch 024: 516 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=462932, ups=1.07, wpb=433726, bsz=16666.2, num_updates=39300, lr=0.000319032, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=38816 epoch 024: 516 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=462932, ups=1.07, wpb=433726, bsz=16666.2, num_updates=39300, lr=0.000319032, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=38816 epoch 024: 516 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=462932, ups=1.07, wpb=433726, bsz=16666.2, num_updates=39300, lr=0.000319032, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=38816 epoch 024: 516 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=462932, ups=1.07, wpb=433726, bsz=16666.2, num_updates=39300, lr=0.000319032, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=38816 epoch 024: 516 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=462932, ups=1.07, wpb=433726, bsz=16666.2, num_updates=39300, lr=0.000319032, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=38816 epoch 024: 516 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=462932, ups=1.07, wpb=433726, bsz=16666.2, num_updates=39300, lr=0.000319032, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=38816 epoch 024: 516 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=462932, ups=1.07, wpb=433726, bsz=16666.2, num_updates=39300, lr=0.000319032, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=38816 epoch 024: 516 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=462932, ups=1.07, wpb=433726, bsz=16666.2, num_updates=39300, lr=0.000319032, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=38816 epoch 024: 516 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=462932, ups=1.07, wpb=433726, bsz=16666.2, num_updates=39300, lr=0.000319032, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=38816 epoch 024: 516 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=462932, ups=1.07, wpb=433726, bsz=16666.2, num_updates=39300, lr=0.000319032, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=38816 epoch 024: 516 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=462932, ups=1.07, wpb=433726, bsz=16666.2, num_updates=39300, lr=0.000319032, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=38816 epoch 024: 516 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=462932, ups=1.07, wpb=433726, bsz=16666.2, num_updates=39300, lr=0.000319032, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=38816 epoch 024: 516 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=462932, ups=1.07, wpb=433726, bsz=16666.2, num_updates=39300, lr=0.000319032, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=38816 epoch 024: 516 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=462932, ups=1.07, wpb=433726, bsz=16666.2, num_updates=39300, lr=0.000319032, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=38816 epoch 024: 516 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=462932, ups=1.07, wpb=433726, bsz=16666.2, num_updates=39300, lr=0.000319032, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=38816 epoch 024: 516 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=462932, ups=1.07, wpb=433726, bsz=16666.2, num_updates=39300, lr=0.000319032, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=38816 epoch 024: 516 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=462932, ups=1.07, wpb=433726, bsz=16666.2, num_updates=39300, lr=0.000319032, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=38816 epoch 024: 516 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=462932, ups=1.07, wpb=433726, bsz=16666.2, num_updates=39300, lr=0.000319032, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=38816 epoch 024: 516 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=462932, ups=1.07, wpb=433726, bsz=16666.2, num_updates=39300, lr=0.000319032, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=38816 epoch 024: 617 / 1689 loss=3.993, nll_loss=2.355, ppl=5.12, wps=455917, ups=1.05, wpb=435090, bsz=16346.5, num_updates=39400, lr=0.000318626, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=38911 epoch 024: 617 / 1689 loss=3.993, nll_loss=2.355, ppl=5.12, wps=455917, ups=1.05, wpb=435090, bsz=16346.5, num_updates=39400, lr=0.000318626, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=38911 epoch 024: 617 / 1689 loss=3.993, nll_loss=2.355, ppl=5.12, wps=455917, ups=1.05, wpb=435090, bsz=16346.5, num_updates=39400, lr=0.000318626, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=38911 epoch 024: 617 / 1689 loss=3.993, nll_loss=2.355, ppl=5.12, wps=455917, ups=1.05, wpb=435090, bsz=16346.5, num_updates=39400, lr=0.000318626, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=38911 epoch 024: 617 / 1689 loss=3.993, nll_loss=2.355, ppl=5.12, wps=455917, ups=1.05, wpb=435090, bsz=16346.5, num_updates=39400, lr=0.000318626, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=38911 epoch 024: 617 / 1689 loss=3.993, nll_loss=2.355, ppl=5.12, wps=455917, ups=1.05, wpb=435090, bsz=16346.5, num_updates=39400, lr=0.000318626, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=38911 epoch 024: 617 / 1689 loss=3.993, nll_loss=2.355, ppl=5.12, wps=455917, ups=1.05, wpb=435090, bsz=16346.5, num_updates=39400, lr=0.000318626, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=38911 epoch 024: 617 / 1689 loss=3.993, nll_loss=2.355, ppl=5.12, wps=455917, ups=1.05, wpb=435090, bsz=16346.5, num_updates=39400, lr=0.000318626, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=38911 epoch 024: 617 / 1689 loss=3.993, nll_loss=2.355, ppl=5.12, wps=455917, ups=1.05, wpb=435090, bsz=16346.5, num_updates=39400, lr=0.000318626, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=38911 epoch 024: 617 / 1689 loss=3.993, nll_loss=2.355, ppl=5.12, wps=455917, ups=1.05, wpb=435090, bsz=16346.5, num_updates=39400, lr=0.000318626, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=38911 epoch 024: 617 / 1689 loss=3.993, nll_loss=2.355, ppl=5.12, wps=455917, ups=1.05, wpb=435090, bsz=16346.5, num_updates=39400, lr=0.000318626, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=38911 epoch 024: 617 / 1689 loss=3.993, nll_loss=2.355, ppl=5.12, wps=455917, ups=1.05, wpb=435090, bsz=16346.5, num_updates=39400, lr=0.000318626, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=38911 epoch 024: 617 / 1689 loss=3.993, nll_loss=2.355, ppl=5.12, wps=455917, ups=1.05, wpb=435090, bsz=16346.5, num_updates=39400, lr=0.000318626, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=38911 epoch 024: 617 / 1689 loss=3.993, nll_loss=2.355, ppl=5.12, wps=455917, ups=1.05, wpb=435090, bsz=16346.5, num_updates=39400, lr=0.000318626, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=38911 epoch 024: 617 / 1689 loss=3.993, nll_loss=2.355, ppl=5.12, wps=455917, ups=1.05, wpb=435090, bsz=16346.5, num_updates=39400, lr=0.000318626, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=38911 epoch 024: 617 / 1689 loss=3.993, nll_loss=2.355, ppl=5.12, wps=455917, ups=1.05, wpb=435090, bsz=16346.5, num_updates=39400, lr=0.000318626, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=38911 epoch 024: 617 / 1689 loss=3.993, nll_loss=2.355, ppl=5.12, wps=455917, ups=1.05, wpb=435090, bsz=16346.5, num_updates=39400, lr=0.000318626, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=38911 epoch 024: 617 / 1689 loss=3.993, nll_loss=2.355, ppl=5.12, wps=455917, ups=1.05, wpb=435090, bsz=16346.5, num_updates=39400, lr=0.000318626, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=38911 epoch 024: 617 / 1689 loss=3.993, nll_loss=2.355, ppl=5.12, wps=455917, ups=1.05, wpb=435090, bsz=16346.5, num_updates=39400, lr=0.000318626, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=38911 epoch 024: 617 / 1689 loss=3.993, nll_loss=2.355, ppl=5.12, wps=455917, ups=1.05, wpb=435090, bsz=16346.5, num_updates=39400, lr=0.000318626, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=38911 epoch 024: 617 / 1689 loss=3.993, nll_loss=2.355, ppl=5.12, wps=455917, ups=1.05, wpb=435090, bsz=16346.5, num_updates=39400, lr=0.000318626, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=38911 epoch 024: 617 / 1689 loss=3.993, nll_loss=2.355, ppl=5.12, wps=455917, ups=1.05, wpb=435090, bsz=16346.5, num_updates=39400, lr=0.000318626, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=38911 epoch 024: 617 / 1689 loss=3.993, nll_loss=2.355, ppl=5.12, wps=455917, ups=1.05, wpb=435090, bsz=16346.5, num_updates=39400, lr=0.000318626, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=38911 epoch 024: 617 / 1689 loss=3.993, nll_loss=2.355, ppl=5.12, wps=455917, ups=1.05, wpb=435090, bsz=16346.5, num_updates=39400, lr=0.000318626, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=38911 epoch 024: 717 / 1689 loss=3.981, nll_loss=2.341, ppl=5.07, wps=457460, ups=1.05, wpb=434885, bsz=16185.6, num_updates=39500, lr=0.000318223, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=39006 epoch 024: 717 / 1689 loss=3.981, nll_loss=2.341, ppl=5.07, wps=457460, ups=1.05, wpb=434885, bsz=16185.6, num_updates=39500, lr=0.000318223, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=39006 epoch 024: 717 / 1689 loss=3.981, nll_loss=2.341, ppl=5.07, wps=457460, ups=1.05, wpb=434885, bsz=16185.6, num_updates=39500, lr=0.000318223, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=39006 epoch 024: 717 / 1689 loss=3.981, nll_loss=2.341, ppl=5.07, wps=457460, ups=1.05, wpb=434885, bsz=16185.6, num_updates=39500, lr=0.000318223, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=39006 epoch 024: 717 / 1689 loss=3.981, nll_loss=2.341, ppl=5.07, wps=457460, ups=1.05, wpb=434885, bsz=16185.6, num_updates=39500, lr=0.000318223, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=39006 epoch 024: 717 / 1689 loss=3.981, nll_loss=2.341, ppl=5.07, wps=457460, ups=1.05, wpb=434885, bsz=16185.6, num_updates=39500, lr=0.000318223, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=39006 epoch 024: 717 / 1689 loss=3.981, nll_loss=2.341, ppl=5.07, wps=457460, ups=1.05, wpb=434885, bsz=16185.6, num_updates=39500, lr=0.000318223, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=39006 epoch 024: 717 / 1689 loss=3.981, nll_loss=2.341, ppl=5.07, wps=457460, ups=1.05, wpb=434885, bsz=16185.6, num_updates=39500, lr=0.000318223, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=39006 epoch 024: 717 / 1689 loss=3.981, nll_loss=2.341, ppl=5.07, wps=457460, ups=1.05, wpb=434885, bsz=16185.6, num_updates=39500, lr=0.000318223, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=39006 epoch 024: 717 / 1689 loss=3.981, nll_loss=2.341, ppl=5.07, wps=457460, ups=1.05, wpb=434885, bsz=16185.6, num_updates=39500, lr=0.000318223, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=39006 epoch 024: 717 / 1689 loss=3.981, nll_loss=2.341, ppl=5.07, wps=457460, ups=1.05, wpb=434885, bsz=16185.6, num_updates=39500, lr=0.000318223, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=39006 epoch 024: 717 / 1689 loss=3.981, nll_loss=2.341, ppl=5.07, wps=457460, ups=1.05, wpb=434885, bsz=16185.6, num_updates=39500, lr=0.000318223, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=39006 epoch 024: 717 / 1689 loss=3.981, nll_loss=2.341, ppl=5.07, wps=457460, ups=1.05, wpb=434885, bsz=16185.6, num_updates=39500, lr=0.000318223, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=39006 epoch 024: 717 / 1689 loss=3.981, nll_loss=2.341, ppl=5.07, wps=457460, ups=1.05, wpb=434885, bsz=16185.6, num_updates=39500, lr=0.000318223, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=39006 epoch 024: 717 / 1689 loss=3.981, nll_loss=2.341, ppl=5.07, wps=457460, ups=1.05, wpb=434885, bsz=16185.6, num_updates=39500, lr=0.000318223, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=39006 epoch 024: 717 / 1689 loss=3.981, nll_loss=2.341, ppl=5.07, wps=457460, ups=1.05, wpb=434885, bsz=16185.6, num_updates=39500, lr=0.000318223, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=39006 epoch 024: 717 / 1689 loss=3.981, nll_loss=2.341, ppl=5.07, wps=457460, ups=1.05, wpb=434885, bsz=16185.6, num_updates=39500, lr=0.000318223, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=39006 epoch 024: 717 / 1689 loss=3.981, nll_loss=2.341, ppl=5.07, wps=457460, ups=1.05, wpb=434885, bsz=16185.6, num_updates=39500, lr=0.000318223, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=39006 epoch 024: 717 / 1689 loss=3.981, nll_loss=2.341, ppl=5.07, wps=457460, ups=1.05, wpb=434885, bsz=16185.6, num_updates=39500, lr=0.000318223, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=39006 epoch 024: 717 / 1689 loss=3.981, nll_loss=2.341, ppl=5.07, wps=457460, ups=1.05, wpb=434885, bsz=16185.6, num_updates=39500, lr=0.000318223, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=39006 epoch 024: 717 / 1689 loss=3.981, nll_loss=2.341, ppl=5.07, wps=457460, ups=1.05, wpb=434885, bsz=16185.6, num_updates=39500, lr=0.000318223, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=39006 epoch 024: 717 / 1689 loss=3.981, nll_loss=2.341, ppl=5.07, wps=457460, ups=1.05, wpb=434885, bsz=16185.6, num_updates=39500, lr=0.000318223, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=39006 epoch 024: 717 / 1689 loss=3.981, nll_loss=2.341, ppl=5.07, wps=457460, ups=1.05, wpb=434885, bsz=16185.6, num_updates=39500, lr=0.000318223, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=39006 epoch 024: 717 / 1689 loss=3.981, nll_loss=2.341, ppl=5.07, wps=457460, ups=1.05, wpb=434885, bsz=16185.6, num_updates=39500, lr=0.000318223, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=39006 epoch 024: 817 / 1689 loss=3.994, nll_loss=2.356, ppl=5.12, wps=454238, ups=1.05, wpb=432352, bsz=16304.4, num_updates=39600, lr=0.000317821, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=39102 epoch 024: 817 / 1689 loss=3.994, nll_loss=2.356, ppl=5.12, wps=454238, ups=1.05, wpb=432352, bsz=16304.4, num_updates=39600, lr=0.000317821, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=39102 epoch 024: 817 / 1689 loss=3.994, nll_loss=2.356, ppl=5.12, wps=454238, ups=1.05, wpb=432352, bsz=16304.4, num_updates=39600, lr=0.000317821, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=39102 epoch 024: 817 / 1689 loss=3.994, nll_loss=2.356, ppl=5.12, wps=454238, ups=1.05, wpb=432352, bsz=16304.4, num_updates=39600, lr=0.000317821, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=39102 epoch 024: 817 / 1689 loss=3.994, nll_loss=2.356, ppl=5.12, wps=454238, ups=1.05, wpb=432352, bsz=16304.4, num_updates=39600, lr=0.000317821, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=39102 epoch 024: 817 / 1689 loss=3.994, nll_loss=2.356, ppl=5.12, wps=454238, ups=1.05, wpb=432352, bsz=16304.4, num_updates=39600, lr=0.000317821, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=39102 epoch 024: 817 / 1689 loss=3.994, nll_loss=2.356, ppl=5.12, wps=454238, ups=1.05, wpb=432352, bsz=16304.4, num_updates=39600, lr=0.000317821, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=39102 epoch 024: 817 / 1689 loss=3.994, nll_loss=2.356, ppl=5.12, wps=454238, ups=1.05, wpb=432352, bsz=16304.4, num_updates=39600, lr=0.000317821, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=39102 epoch 024: 817 / 1689 loss=3.994, nll_loss=2.356, ppl=5.12, wps=454238, ups=1.05, wpb=432352, bsz=16304.4, num_updates=39600, lr=0.000317821, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=39102 epoch 024: 817 / 1689 loss=3.994, nll_loss=2.356, ppl=5.12, wps=454238, ups=1.05, wpb=432352, bsz=16304.4, num_updates=39600, lr=0.000317821, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=39102 epoch 024: 817 / 1689 loss=3.994, nll_loss=2.356, ppl=5.12, wps=454238, ups=1.05, wpb=432352, bsz=16304.4, num_updates=39600, lr=0.000317821, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=39102 epoch 024: 817 / 1689 loss=3.994, nll_loss=2.356, ppl=5.12, wps=454238, ups=1.05, wpb=432352, bsz=16304.4, num_updates=39600, lr=0.000317821, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=39102 epoch 024: 817 / 1689 loss=3.994, nll_loss=2.356, ppl=5.12, wps=454238, ups=1.05, wpb=432352, bsz=16304.4, num_updates=39600, lr=0.000317821, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=39102 epoch 024: 817 / 1689 loss=3.994, nll_loss=2.356, ppl=5.12, wps=454238, ups=1.05, wpb=432352, bsz=16304.4, num_updates=39600, lr=0.000317821, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=39102 epoch 024: 817 / 1689 loss=3.994, nll_loss=2.356, ppl=5.12, wps=454238, ups=1.05, wpb=432352, bsz=16304.4, num_updates=39600, lr=0.000317821, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=39102 epoch 024: 817 / 1689 loss=3.994, nll_loss=2.356, ppl=5.12, wps=454238, ups=1.05, wpb=432352, bsz=16304.4, num_updates=39600, lr=0.000317821, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=39102 epoch 024: 817 / 1689 loss=3.994, nll_loss=2.356, ppl=5.12, wps=454238, ups=1.05, wpb=432352, bsz=16304.4, num_updates=39600, lr=0.000317821, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=39102 epoch 024: 817 / 1689 loss=3.994, nll_loss=2.356, ppl=5.12, wps=454238, ups=1.05, wpb=432352, bsz=16304.4, num_updates=39600, lr=0.000317821, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=39102 epoch 024: 817 / 1689 loss=3.994, nll_loss=2.356, ppl=5.12, wps=454238, ups=1.05, wpb=432352, bsz=16304.4, num_updates=39600, lr=0.000317821, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=39102 epoch 024: 817 / 1689 loss=3.994, nll_loss=2.356, ppl=5.12, wps=454238, ups=1.05, wpb=432352, bsz=16304.4, num_updates=39600, lr=0.000317821, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=39102 epoch 024: 817 / 1689 loss=3.994, nll_loss=2.356, ppl=5.12, wps=454238, ups=1.05, wpb=432352, bsz=16304.4, num_updates=39600, lr=0.000317821, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=39102 epoch 024: 817 / 1689 loss=3.994, nll_loss=2.356, ppl=5.12, wps=454238, ups=1.05, wpb=432352, bsz=16304.4, num_updates=39600, lr=0.000317821, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=39102 epoch 024: 817 / 1689 loss=3.994, nll_loss=2.356, ppl=5.12, wps=454238, ups=1.05, wpb=432352, bsz=16304.4, num_updates=39600, lr=0.000317821, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=39102 epoch 024: 817 / 1689 loss=3.994, nll_loss=2.356, ppl=5.12, wps=454238, ups=1.05, wpb=432352, bsz=16304.4, num_updates=39600, lr=0.000317821, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=39102 epoch 024: 917 / 1689 loss=4.004, nll_loss=2.367, ppl=5.16, wps=456837, ups=1.05, wpb=434313, bsz=16599.7, num_updates=39700, lr=0.00031742, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.4, wall=39197 epoch 024: 917 / 1689 loss=4.004, nll_loss=2.367, ppl=5.16, wps=456837, ups=1.05, wpb=434313, bsz=16599.7, num_updates=39700, lr=0.00031742, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.4, wall=39197 epoch 024: 917 / 1689 loss=4.004, nll_loss=2.367, ppl=5.16, wps=456837, ups=1.05, wpb=434313, bsz=16599.7, num_updates=39700, lr=0.00031742, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.4, wall=39197 epoch 024: 917 / 1689 loss=4.004, nll_loss=2.367, ppl=5.16, wps=456837, ups=1.05, wpb=434313, bsz=16599.7, num_updates=39700, lr=0.00031742, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.4, wall=39197 epoch 024: 917 / 1689 loss=4.004, nll_loss=2.367, ppl=5.16, wps=456837, ups=1.05, wpb=434313, bsz=16599.7, num_updates=39700, lr=0.00031742, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.4, wall=39197 epoch 024: 917 / 1689 loss=4.004, nll_loss=2.367, ppl=5.16, wps=456837, ups=1.05, wpb=434313, bsz=16599.7, num_updates=39700, lr=0.00031742, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.4, wall=39197 epoch 024: 917 / 1689 loss=4.004, nll_loss=2.367, ppl=5.16, wps=456837, ups=1.05, wpb=434313, bsz=16599.7, num_updates=39700, lr=0.00031742, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.4, wall=39197 epoch 024: 917 / 1689 loss=4.004, nll_loss=2.367, ppl=5.16, wps=456837, ups=1.05, wpb=434313, bsz=16599.7, num_updates=39700, lr=0.00031742, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.4, wall=39197 epoch 024: 917 / 1689 loss=4.004, nll_loss=2.367, ppl=5.16, wps=456837, ups=1.05, wpb=434313, bsz=16599.7, num_updates=39700, lr=0.00031742, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.4, wall=39197 epoch 024: 917 / 1689 loss=4.004, nll_loss=2.367, ppl=5.16, wps=456837, ups=1.05, wpb=434313, bsz=16599.7, num_updates=39700, lr=0.00031742, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.4, wall=39197 epoch 024: 917 / 1689 loss=4.004, nll_loss=2.367, ppl=5.16, wps=456837, ups=1.05, wpb=434313, bsz=16599.7, num_updates=39700, lr=0.00031742, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.4, wall=39197 epoch 024: 917 / 1689 loss=4.004, nll_loss=2.367, ppl=5.16, wps=456837, ups=1.05, wpb=434313, bsz=16599.7, num_updates=39700, lr=0.00031742, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.4, wall=39197 epoch 024: 917 / 1689 loss=4.004, nll_loss=2.367, ppl=5.16, wps=456837, ups=1.05, wpb=434313, bsz=16599.7, num_updates=39700, lr=0.00031742, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.4, wall=39197 epoch 024: 917 / 1689 loss=4.004, nll_loss=2.367, ppl=5.16, wps=456837, ups=1.05, wpb=434313, bsz=16599.7, num_updates=39700, lr=0.00031742, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.4, wall=39197 epoch 024: 917 / 1689 loss=4.004, nll_loss=2.367, ppl=5.16, wps=456837, ups=1.05, wpb=434313, bsz=16599.7, num_updates=39700, lr=0.00031742, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.4, wall=39197 epoch 024: 917 / 1689 loss=4.004, nll_loss=2.367, ppl=5.16, wps=456837, ups=1.05, wpb=434313, bsz=16599.7, num_updates=39700, lr=0.00031742, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.4, wall=39197 epoch 024: 917 / 1689 loss=4.004, nll_loss=2.367, ppl=5.16, wps=456837, ups=1.05, wpb=434313, bsz=16599.7, num_updates=39700, lr=0.00031742, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.4, wall=39197 epoch 024: 917 / 1689 loss=4.004, nll_loss=2.367, ppl=5.16, wps=456837, ups=1.05, wpb=434313, bsz=16599.7, num_updates=39700, lr=0.00031742, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.4, wall=39197 epoch 024: 917 / 1689 loss=4.004, nll_loss=2.367, ppl=5.16, wps=456837, ups=1.05, wpb=434313, bsz=16599.7, num_updates=39700, lr=0.00031742, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.4, wall=39197 epoch 024: 917 / 1689 loss=4.004, nll_loss=2.367, ppl=5.16, wps=456837, ups=1.05, wpb=434313, bsz=16599.7, num_updates=39700, lr=0.00031742, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.4, wall=39197 epoch 024: 917 / 1689 loss=4.004, nll_loss=2.367, ppl=5.16, wps=456837, ups=1.05, wpb=434313, bsz=16599.7, num_updates=39700, lr=0.00031742, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.4, wall=39197 epoch 024: 917 / 1689 loss=4.004, nll_loss=2.367, ppl=5.16, wps=456837, ups=1.05, wpb=434313, bsz=16599.7, num_updates=39700, lr=0.00031742, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.4, wall=39197 epoch 024: 917 / 1689 loss=4.004, nll_loss=2.367, ppl=5.16, wps=456837, ups=1.05, wpb=434313, bsz=16599.7, num_updates=39700, lr=0.00031742, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.4, wall=39197 epoch 024: 917 / 1689 loss=4.004, nll_loss=2.367, ppl=5.16, wps=456837, ups=1.05, wpb=434313, bsz=16599.7, num_updates=39700, lr=0.00031742, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.4, wall=39197 epoch 024: 1017 / 1689 loss=4.001, nll_loss=2.364, ppl=5.15, wps=455483, ups=1.05, wpb=433057, bsz=16566.6, num_updates=39800, lr=0.000317021, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=39292 epoch 024: 1017 / 1689 loss=4.001, nll_loss=2.364, ppl=5.15, wps=455483, ups=1.05, wpb=433057, bsz=16566.6, num_updates=39800, lr=0.000317021, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=39292 epoch 024: 1017 / 1689 loss=4.001, nll_loss=2.364, ppl=5.15, wps=455483, ups=1.05, wpb=433057, bsz=16566.6, num_updates=39800, lr=0.000317021, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=39292 epoch 024: 1017 / 1689 loss=4.001, nll_loss=2.364, ppl=5.15, wps=455483, ups=1.05, wpb=433057, bsz=16566.6, num_updates=39800, lr=0.000317021, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=39292 epoch 024: 1017 / 1689 loss=4.001, nll_loss=2.364, ppl=5.15, wps=455483, ups=1.05, wpb=433057, bsz=16566.6, num_updates=39800, lr=0.000317021, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=39292 epoch 024: 1017 / 1689 loss=4.001, nll_loss=2.364, ppl=5.15, wps=455483, ups=1.05, wpb=433057, bsz=16566.6, num_updates=39800, lr=0.000317021, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=39292 epoch 024: 1017 / 1689 loss=4.001, nll_loss=2.364, ppl=5.15, wps=455483, ups=1.05, wpb=433057, bsz=16566.6, num_updates=39800, lr=0.000317021, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=39292 epoch 024: 1017 / 1689 loss=4.001, nll_loss=2.364, ppl=5.15, wps=455483, ups=1.05, wpb=433057, bsz=16566.6, num_updates=39800, lr=0.000317021, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=39292 epoch 024: 1017 / 1689 loss=4.001, nll_loss=2.364, ppl=5.15, wps=455483, ups=1.05, wpb=433057, bsz=16566.6, num_updates=39800, lr=0.000317021, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=39292 epoch 024: 1017 / 1689 loss=4.001, nll_loss=2.364, ppl=5.15, wps=455483, ups=1.05, wpb=433057, bsz=16566.6, num_updates=39800, lr=0.000317021, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=39292 epoch 024: 1017 / 1689 loss=4.001, nll_loss=2.364, ppl=5.15, wps=455483, ups=1.05, wpb=433057, bsz=16566.6, num_updates=39800, lr=0.000317021, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=39292 epoch 024: 1017 / 1689 loss=4.001, nll_loss=2.364, ppl=5.15, wps=455483, ups=1.05, wpb=433057, bsz=16566.6, num_updates=39800, lr=0.000317021, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=39292 epoch 024: 1017 / 1689 loss=4.001, nll_loss=2.364, ppl=5.15, wps=455483, ups=1.05, wpb=433057, bsz=16566.6, num_updates=39800, lr=0.000317021, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=39292 epoch 024: 1017 / 1689 loss=4.001, nll_loss=2.364, ppl=5.15, wps=455483, ups=1.05, wpb=433057, bsz=16566.6, num_updates=39800, lr=0.000317021, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=39292 epoch 024: 1017 / 1689 loss=4.001, nll_loss=2.364, ppl=5.15, wps=455483, ups=1.05, wpb=433057, bsz=16566.6, num_updates=39800, lr=0.000317021, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=39292 epoch 024: 1017 / 1689 loss=4.001, nll_loss=2.364, ppl=5.15, wps=455483, ups=1.05, wpb=433057, bsz=16566.6, num_updates=39800, lr=0.000317021, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=39292 epoch 024: 1017 / 1689 loss=4.001, nll_loss=2.364, ppl=5.15, wps=455483, ups=1.05, wpb=433057, bsz=16566.6, num_updates=39800, lr=0.000317021, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=39292 epoch 024: 1017 / 1689 loss=4.001, nll_loss=2.364, ppl=5.15, wps=455483, ups=1.05, wpb=433057, bsz=16566.6, num_updates=39800, lr=0.000317021, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=39292 epoch 024: 1017 / 1689 loss=4.001, nll_loss=2.364, ppl=5.15, wps=455483, ups=1.05, wpb=433057, bsz=16566.6, num_updates=39800, lr=0.000317021, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=39292 epoch 024: 1017 / 1689 loss=4.001, nll_loss=2.364, ppl=5.15, wps=455483, ups=1.05, wpb=433057, bsz=16566.6, num_updates=39800, lr=0.000317021, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=39292 epoch 024: 1017 / 1689 loss=4.001, nll_loss=2.364, ppl=5.15, wps=455483, ups=1.05, wpb=433057, bsz=16566.6, num_updates=39800, lr=0.000317021, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=39292 epoch 024: 1017 / 1689 loss=4.001, nll_loss=2.364, ppl=5.15, wps=455483, ups=1.05, wpb=433057, bsz=16566.6, num_updates=39800, lr=0.000317021, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=39292 epoch 024: 1017 / 1689 loss=4.001, nll_loss=2.364, ppl=5.15, wps=455483, ups=1.05, wpb=433057, bsz=16566.6, num_updates=39800, lr=0.000317021, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=39292 epoch 024: 1017 / 1689 loss=4.001, nll_loss=2.364, ppl=5.15, wps=455483, ups=1.05, wpb=433057, bsz=16566.6, num_updates=39800, lr=0.000317021, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=39292 epoch 024: 1117 / 1689 loss=4.001, nll_loss=2.365, ppl=5.15, wps=455426, ups=1.05, wpb=434021, bsz=17097.4, num_updates=39900, lr=0.000316624, gnorm=0.219, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=39387 epoch 024: 1117 / 1689 loss=4.001, nll_loss=2.365, ppl=5.15, wps=455426, ups=1.05, wpb=434021, bsz=17097.4, num_updates=39900, lr=0.000316624, gnorm=0.219, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=39387 epoch 024: 1117 / 1689 loss=4.001, nll_loss=2.365, ppl=5.15, wps=455426, ups=1.05, wpb=434021, bsz=17097.4, num_updates=39900, lr=0.000316624, gnorm=0.219, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=39387 epoch 024: 1117 / 1689 loss=4.001, nll_loss=2.365, ppl=5.15, wps=455426, ups=1.05, wpb=434021, bsz=17097.4, num_updates=39900, lr=0.000316624, gnorm=0.219, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=39387 epoch 024: 1117 / 1689 loss=4.001, nll_loss=2.365, ppl=5.15, wps=455426, ups=1.05, wpb=434021, bsz=17097.4, num_updates=39900, lr=0.000316624, gnorm=0.219, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=39387 epoch 024: 1117 / 1689 loss=4.001, nll_loss=2.365, ppl=5.15, wps=455426, ups=1.05, wpb=434021, bsz=17097.4, num_updates=39900, lr=0.000316624, gnorm=0.219, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=39387 epoch 024: 1117 / 1689 loss=4.001, nll_loss=2.365, ppl=5.15, wps=455426, ups=1.05, wpb=434021, bsz=17097.4, num_updates=39900, lr=0.000316624, gnorm=0.219, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=39387 epoch 024: 1117 / 1689 loss=4.001, nll_loss=2.365, ppl=5.15, wps=455426, ups=1.05, wpb=434021, bsz=17097.4, num_updates=39900, lr=0.000316624, gnorm=0.219, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=39387 epoch 024: 1117 / 1689 loss=4.001, nll_loss=2.365, ppl=5.15, wps=455426, ups=1.05, wpb=434021, bsz=17097.4, num_updates=39900, lr=0.000316624, gnorm=0.219, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=39387 epoch 024: 1117 / 1689 loss=4.001, nll_loss=2.365, ppl=5.15, wps=455426, ups=1.05, wpb=434021, bsz=17097.4, num_updates=39900, lr=0.000316624, gnorm=0.219, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=39387 epoch 024: 1117 / 1689 loss=4.001, nll_loss=2.365, ppl=5.15, wps=455426, ups=1.05, wpb=434021, bsz=17097.4, num_updates=39900, lr=0.000316624, gnorm=0.219, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=39387 epoch 024: 1117 / 1689 loss=4.001, nll_loss=2.365, ppl=5.15, wps=455426, ups=1.05, wpb=434021, bsz=17097.4, num_updates=39900, lr=0.000316624, gnorm=0.219, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=39387 epoch 024: 1117 / 1689 loss=4.001, nll_loss=2.365, ppl=5.15, wps=455426, ups=1.05, wpb=434021, bsz=17097.4, num_updates=39900, lr=0.000316624, gnorm=0.219, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=39387 epoch 024: 1117 / 1689 loss=4.001, nll_loss=2.365, ppl=5.15, wps=455426, ups=1.05, wpb=434021, bsz=17097.4, num_updates=39900, lr=0.000316624, gnorm=0.219, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=39387 epoch 024: 1117 / 1689 loss=4.001, nll_loss=2.365, ppl=5.15, wps=455426, ups=1.05, wpb=434021, bsz=17097.4, num_updates=39900, lr=0.000316624, gnorm=0.219, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=39387 epoch 024: 1117 / 1689 loss=4.001, nll_loss=2.365, ppl=5.15, wps=455426, ups=1.05, wpb=434021, bsz=17097.4, num_updates=39900, lr=0.000316624, gnorm=0.219, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=39387 epoch 024: 1117 / 1689 loss=4.001, nll_loss=2.365, ppl=5.15, wps=455426, ups=1.05, wpb=434021, bsz=17097.4, num_updates=39900, lr=0.000316624, gnorm=0.219, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=39387 epoch 024: 1117 / 1689 loss=4.001, nll_loss=2.365, ppl=5.15, wps=455426, ups=1.05, wpb=434021, bsz=17097.4, num_updates=39900, lr=0.000316624, gnorm=0.219, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=39387 epoch 024: 1117 / 1689 loss=4.001, nll_loss=2.365, ppl=5.15, wps=455426, ups=1.05, wpb=434021, bsz=17097.4, num_updates=39900, lr=0.000316624, gnorm=0.219, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=39387 epoch 024: 1117 / 1689 loss=4.001, nll_loss=2.365, ppl=5.15, wps=455426, ups=1.05, wpb=434021, bsz=17097.4, num_updates=39900, lr=0.000316624, gnorm=0.219, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=39387 epoch 024: 1117 / 1689 loss=4.001, nll_loss=2.365, ppl=5.15, wps=455426, ups=1.05, wpb=434021, bsz=17097.4, num_updates=39900, lr=0.000316624, gnorm=0.219, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=39387 epoch 024: 1117 / 1689 loss=4.001, nll_loss=2.365, ppl=5.15, wps=455426, ups=1.05, wpb=434021, bsz=17097.4, num_updates=39900, lr=0.000316624, gnorm=0.219, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=39387 epoch 024: 1117 / 1689 loss=4.001, nll_loss=2.365, ppl=5.15, wps=455426, ups=1.05, wpb=434021, bsz=17097.4, num_updates=39900, lr=0.000316624, gnorm=0.219, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=39387 epoch 024: 1117 / 1689 loss=4.001, nll_loss=2.365, ppl=5.15, wps=455426, ups=1.05, wpb=434021, bsz=17097.4, num_updates=39900, lr=0.000316624, gnorm=0.219, clip=0, loss_scale=1, train_wall=94, gb_free=18.8, wall=39387 epoch 024: 1218 / 1689 loss=4.001, nll_loss=2.365, ppl=5.15, wps=458087, ups=1.05, wpb=435244, bsz=16607.6, num_updates=40000, lr=0.000316228, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.7, wall=39482 epoch 024: 1218 / 1689 loss=4.001, nll_loss=2.365, ppl=5.15, wps=458087, ups=1.05, wpb=435244, bsz=16607.6, num_updates=40000, lr=0.000316228, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.7, wall=39482 epoch 024: 1218 / 1689 loss=4.001, nll_loss=2.365, ppl=5.15, wps=458087, ups=1.05, wpb=435244, bsz=16607.6, num_updates=40000, lr=0.000316228, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.7, wall=39482 epoch 024: 1218 / 1689 loss=4.001, nll_loss=2.365, ppl=5.15, wps=458087, ups=1.05, wpb=435244, bsz=16607.6, num_updates=40000, lr=0.000316228, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.7, wall=39482 epoch 024: 1218 / 1689 loss=4.001, nll_loss=2.365, ppl=5.15, wps=458087, ups=1.05, wpb=435244, bsz=16607.6, num_updates=40000, lr=0.000316228, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.7, wall=39482 epoch 024: 1218 / 1689 loss=4.001, nll_loss=2.365, ppl=5.15, wps=458087, ups=1.05, wpb=435244, bsz=16607.6, num_updates=40000, lr=0.000316228, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.7, wall=39482 epoch 024: 1218 / 1689 loss=4.001, nll_loss=2.365, ppl=5.15, wps=458087, ups=1.05, wpb=435244, bsz=16607.6, num_updates=40000, lr=0.000316228, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.7, wall=39482 epoch 024: 1218 / 1689 loss=4.001, nll_loss=2.365, ppl=5.15, wps=458087, ups=1.05, wpb=435244, bsz=16607.6, num_updates=40000, lr=0.000316228, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.7, wall=39482 epoch 024: 1218 / 1689 loss=4.001, nll_loss=2.365, ppl=5.15, wps=458087, ups=1.05, wpb=435244, bsz=16607.6, num_updates=40000, lr=0.000316228, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.7, wall=39482 epoch 024: 1218 / 1689 loss=4.001, nll_loss=2.365, ppl=5.15, wps=458087, ups=1.05, wpb=435244, bsz=16607.6, num_updates=40000, lr=0.000316228, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.7, wall=39482 epoch 024: 1218 / 1689 loss=4.001, nll_loss=2.365, ppl=5.15, wps=458087, ups=1.05, wpb=435244, bsz=16607.6, num_updates=40000, lr=0.000316228, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.7, wall=39482 epoch 024: 1218 / 1689 loss=4.001, nll_loss=2.365, ppl=5.15, wps=458087, ups=1.05, wpb=435244, bsz=16607.6, num_updates=40000, lr=0.000316228, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.7, wall=39482 epoch 024: 1218 / 1689 loss=4.001, nll_loss=2.365, ppl=5.15, wps=458087, ups=1.05, wpb=435244, bsz=16607.6, num_updates=40000, lr=0.000316228, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.7, wall=39482 epoch 024: 1218 / 1689 loss=4.001, nll_loss=2.365, ppl=5.15, wps=458087, ups=1.05, wpb=435244, bsz=16607.6, num_updates=40000, lr=0.000316228, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.7, wall=39482 epoch 024: 1218 / 1689 loss=4.001, nll_loss=2.365, ppl=5.15, wps=458087, ups=1.05, wpb=435244, bsz=16607.6, num_updates=40000, lr=0.000316228, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.7, wall=39482 epoch 024: 1218 / 1689 loss=4.001, nll_loss=2.365, ppl=5.15, wps=458087, ups=1.05, wpb=435244, bsz=16607.6, num_updates=40000, lr=0.000316228, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.7, wall=39482 epoch 024: 1218 / 1689 loss=4.001, nll_loss=2.365, ppl=5.15, wps=458087, ups=1.05, wpb=435244, bsz=16607.6, num_updates=40000, lr=0.000316228, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.7, wall=39482 epoch 024: 1218 / 1689 loss=4.001, nll_loss=2.365, ppl=5.15, wps=458087, ups=1.05, wpb=435244, bsz=16607.6, num_updates=40000, lr=0.000316228, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.7, wall=39482 epoch 024: 1218 / 1689 loss=4.001, nll_loss=2.365, ppl=5.15, wps=458087, ups=1.05, wpb=435244, bsz=16607.6, num_updates=40000, lr=0.000316228, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.7, wall=39482 epoch 024: 1218 / 1689 loss=4.001, nll_loss=2.365, ppl=5.15, wps=458087, ups=1.05, wpb=435244, bsz=16607.6, num_updates=40000, lr=0.000316228, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.7, wall=39482 epoch 024: 1218 / 1689 loss=4.001, nll_loss=2.365, ppl=5.15, wps=458087, ups=1.05, wpb=435244, bsz=16607.6, num_updates=40000, lr=0.000316228, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.7, wall=39482 epoch 024: 1218 / 1689 loss=4.001, nll_loss=2.365, ppl=5.15, wps=458087, ups=1.05, wpb=435244, bsz=16607.6, num_updates=40000, lr=0.000316228, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.7, wall=39482 epoch 024: 1218 / 1689 loss=4.001, nll_loss=2.365, ppl=5.15, wps=458087, ups=1.05, wpb=435244, bsz=16607.6, num_updates=40000, lr=0.000316228, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.7, wall=39482 epoch 024: 1218 / 1689 loss=4.001, nll_loss=2.365, ppl=5.15, wps=458087, ups=1.05, wpb=435244, bsz=16607.6, num_updates=40000, lr=0.000316228, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.7, wall=39482 begin validation on "valid" subset epoch 024 | valid on 'valid' subset | loss 4.248 | nll_loss 2.626 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.248 epoch 024 | valid on 'valid' subset | loss 4.248 | nll_loss 2.626 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.248 epoch 024 | valid on 'valid' subset | loss 4.248 | nll_loss 2.626 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.248 epoch 024 | valid on 'valid' subset | loss 4.248 | nll_loss 2.626 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.248 epoch 024 | valid on 'valid' subset | loss 4.248 | nll_loss 2.626 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.248 epoch 024 | valid on 'valid' subset | loss 4.248 | nll_loss 2.626 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.248 epoch 024 | valid on 'valid' subset | loss 4.248 | nll_loss 2.626 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.248 epoch 024 | valid on 'valid' subset | loss 4.248 | nll_loss 2.626 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.248 epoch 024 | valid on 'valid' subset | loss 4.248 | nll_loss 2.626 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.248 epoch 024 | valid on 'valid' subset | loss 4.248 | nll_loss 2.626 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.248 epoch 024 | valid on 'valid' subset | loss 4.248 | nll_loss 2.626 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.248 epoch 024 | valid on 'valid' subset | loss 4.248 | nll_loss 2.626 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.248 epoch 024 | valid on 'valid' subset | loss 4.248 | nll_loss 2.626 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.248 epoch 024 | valid on 'valid' subset | loss 4.248 | nll_loss 2.626 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.248 epoch 024 | valid on 'valid' subset | loss 4.248 | nll_loss 2.626 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.248 epoch 024 | valid on 'valid' subset | loss 4.248 | nll_loss 2.626 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.248 epoch 024 | valid on 'valid' subset | loss 4.248 | nll_loss 2.626 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.248 epoch 024 | valid on 'valid' subset | loss 4.248 | nll_loss 2.626 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.248 epoch 024 | valid on 'valid' subset | loss 4.248 | nll_loss 2.626 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.248 epoch 024 | valid on 'valid' subset | loss 4.248 | nll_loss 2.626 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.248 epoch 024 | valid on 'valid' subset | loss 4.248 | nll_loss 2.626 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.248 epoch 024 | valid on 'valid' subset | loss 4.248 | nll_loss 2.626 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.248 epoch 024 | valid on 'valid' subset | loss 4.248 | nll_loss 2.626 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.248 epoch 024 | valid on 'valid' subset | loss 4.248 | nll_loss 2.626 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 40000 | best_loss 4.248 epoch 024: 1318 / 1689 loss=3.991, nll_loss=2.354, ppl=5.11, wps=384569, ups=0.89, wpb=431513, bsz=16468.6, num_updates=40100, lr=0.000315833, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=39594 epoch 024: 1318 / 1689 loss=3.991, nll_loss=2.354, ppl=5.11, wps=384569, ups=0.89, wpb=431513, bsz=16468.6, num_updates=40100, lr=0.000315833, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=39594 epoch 024: 1318 / 1689 loss=3.991, nll_loss=2.354, ppl=5.11, wps=384569, ups=0.89, wpb=431513, bsz=16468.6, num_updates=40100, lr=0.000315833, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=39594 epoch 024: 1318 / 1689 loss=3.991, nll_loss=2.354, ppl=5.11, wps=384569, ups=0.89, wpb=431513, bsz=16468.6, num_updates=40100, lr=0.000315833, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=39594 epoch 024: 1318 / 1689 loss=3.991, nll_loss=2.354, ppl=5.11, wps=384569, ups=0.89, wpb=431513, bsz=16468.6, num_updates=40100, lr=0.000315833, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=39594 epoch 024: 1318 / 1689 loss=3.991, nll_loss=2.354, ppl=5.11, wps=384569, ups=0.89, wpb=431513, bsz=16468.6, num_updates=40100, lr=0.000315833, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=39594 epoch 024: 1318 / 1689 loss=3.991, nll_loss=2.354, ppl=5.11, wps=384569, ups=0.89, wpb=431513, bsz=16468.6, num_updates=40100, lr=0.000315833, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=39594 epoch 024: 1318 / 1689 loss=3.991, nll_loss=2.354, ppl=5.11, wps=384569, ups=0.89, wpb=431513, bsz=16468.6, num_updates=40100, lr=0.000315833, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=39594 epoch 024: 1318 / 1689 loss=3.991, nll_loss=2.354, ppl=5.11, wps=384569, ups=0.89, wpb=431513, bsz=16468.6, num_updates=40100, lr=0.000315833, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=39594 epoch 024: 1318 / 1689 loss=3.991, nll_loss=2.354, ppl=5.11, wps=384569, ups=0.89, wpb=431513, bsz=16468.6, num_updates=40100, lr=0.000315833, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=39594 epoch 024: 1318 / 1689 loss=3.991, nll_loss=2.354, ppl=5.11, wps=384569, ups=0.89, wpb=431513, bsz=16468.6, num_updates=40100, lr=0.000315833, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=39594 epoch 024: 1318 / 1689 loss=3.991, nll_loss=2.354, ppl=5.11, wps=384569, ups=0.89, wpb=431513, bsz=16468.6, num_updates=40100, lr=0.000315833, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=39594 epoch 024: 1318 / 1689 loss=3.991, nll_loss=2.354, ppl=5.11, wps=384569, ups=0.89, wpb=431513, bsz=16468.6, num_updates=40100, lr=0.000315833, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=39594 epoch 024: 1318 / 1689 loss=3.991, nll_loss=2.354, ppl=5.11, wps=384569, ups=0.89, wpb=431513, bsz=16468.6, num_updates=40100, lr=0.000315833, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=39594 epoch 024: 1318 / 1689 loss=3.991, nll_loss=2.354, ppl=5.11, wps=384569, ups=0.89, wpb=431513, bsz=16468.6, num_updates=40100, lr=0.000315833, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=39594 epoch 024: 1318 / 1689 loss=3.991, nll_loss=2.354, ppl=5.11, wps=384569, ups=0.89, wpb=431513, bsz=16468.6, num_updates=40100, lr=0.000315833, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=39594 epoch 024: 1318 / 1689 loss=3.991, nll_loss=2.354, ppl=5.11, wps=384569, ups=0.89, wpb=431513, bsz=16468.6, num_updates=40100, lr=0.000315833, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=39594 epoch 024: 1318 / 1689 loss=3.991, nll_loss=2.354, ppl=5.11, wps=384569, ups=0.89, wpb=431513, bsz=16468.6, num_updates=40100, lr=0.000315833, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=39594 epoch 024: 1318 / 1689 loss=3.991, nll_loss=2.354, ppl=5.11, wps=384569, ups=0.89, wpb=431513, bsz=16468.6, num_updates=40100, lr=0.000315833, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=39594 epoch 024: 1318 / 1689 loss=3.991, nll_loss=2.354, ppl=5.11, wps=384569, ups=0.89, wpb=431513, bsz=16468.6, num_updates=40100, lr=0.000315833, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=39594 epoch 024: 1318 / 1689 loss=3.991, nll_loss=2.354, ppl=5.11, wps=384569, ups=0.89, wpb=431513, bsz=16468.6, num_updates=40100, lr=0.000315833, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=39594 epoch 024: 1318 / 1689 loss=3.991, nll_loss=2.354, ppl=5.11, wps=384569, ups=0.89, wpb=431513, bsz=16468.6, num_updates=40100, lr=0.000315833, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=39594 epoch 024: 1318 / 1689 loss=3.991, nll_loss=2.354, ppl=5.11, wps=384569, ups=0.89, wpb=431513, bsz=16468.6, num_updates=40100, lr=0.000315833, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=39594 epoch 024: 1318 / 1689 loss=3.991, nll_loss=2.354, ppl=5.11, wps=384569, ups=0.89, wpb=431513, bsz=16468.6, num_updates=40100, lr=0.000315833, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=39594 epoch 024: 1418 / 1689 loss=4.002, nll_loss=2.365, ppl=5.15, wps=461436, ups=1.06, wpb=433910, bsz=16254.6, num_updates=40200, lr=0.00031544, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=39688 epoch 024: 1418 / 1689 loss=4.002, nll_loss=2.365, ppl=5.15, wps=461436, ups=1.06, wpb=433910, bsz=16254.6, num_updates=40200, lr=0.00031544, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=39688 epoch 024: 1418 / 1689 loss=4.002, nll_loss=2.365, ppl=5.15, wps=461436, ups=1.06, wpb=433910, bsz=16254.6, num_updates=40200, lr=0.00031544, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=39688 epoch 024: 1418 / 1689 loss=4.002, nll_loss=2.365, ppl=5.15, wps=461436, ups=1.06, wpb=433910, bsz=16254.6, num_updates=40200, lr=0.00031544, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=39688 epoch 024: 1418 / 1689 loss=4.002, nll_loss=2.365, ppl=5.15, wps=461436, ups=1.06, wpb=433910, bsz=16254.6, num_updates=40200, lr=0.00031544, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=39688 epoch 024: 1418 / 1689 loss=4.002, nll_loss=2.365, ppl=5.15, wps=461436, ups=1.06, wpb=433910, bsz=16254.6, num_updates=40200, lr=0.00031544, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=39688 epoch 024: 1418 / 1689 loss=4.002, nll_loss=2.365, ppl=5.15, wps=461436, ups=1.06, wpb=433910, bsz=16254.6, num_updates=40200, lr=0.00031544, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=39688 epoch 024: 1418 / 1689 loss=4.002, nll_loss=2.365, ppl=5.15, wps=461436, ups=1.06, wpb=433910, bsz=16254.6, num_updates=40200, lr=0.00031544, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=39688 epoch 024: 1418 / 1689 loss=4.002, nll_loss=2.365, ppl=5.15, wps=461436, ups=1.06, wpb=433910, bsz=16254.6, num_updates=40200, lr=0.00031544, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=39688 epoch 024: 1418 / 1689 loss=4.002, nll_loss=2.365, ppl=5.15, wps=461436, ups=1.06, wpb=433910, bsz=16254.6, num_updates=40200, lr=0.00031544, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=39688 epoch 024: 1418 / 1689 loss=4.002, nll_loss=2.365, ppl=5.15, wps=461436, ups=1.06, wpb=433910, bsz=16254.6, num_updates=40200, lr=0.00031544, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=39688 epoch 024: 1418 / 1689 loss=4.002, nll_loss=2.365, ppl=5.15, wps=461436, ups=1.06, wpb=433910, bsz=16254.6, num_updates=40200, lr=0.00031544, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=39688 epoch 024: 1418 / 1689 loss=4.002, nll_loss=2.365, ppl=5.15, wps=461436, ups=1.06, wpb=433910, bsz=16254.6, num_updates=40200, lr=0.00031544, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=39688 epoch 024: 1418 / 1689 loss=4.002, nll_loss=2.365, ppl=5.15, wps=461436, ups=1.06, wpb=433910, bsz=16254.6, num_updates=40200, lr=0.00031544, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=39688 epoch 024: 1418 / 1689 loss=4.002, nll_loss=2.365, ppl=5.15, wps=461436, ups=1.06, wpb=433910, bsz=16254.6, num_updates=40200, lr=0.00031544, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=39688 epoch 024: 1418 / 1689 loss=4.002, nll_loss=2.365, ppl=5.15, wps=461436, ups=1.06, wpb=433910, bsz=16254.6, num_updates=40200, lr=0.00031544, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=39688 epoch 024: 1418 / 1689 loss=4.002, nll_loss=2.365, ppl=5.15, wps=461436, ups=1.06, wpb=433910, bsz=16254.6, num_updates=40200, lr=0.00031544, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=39688 epoch 024: 1418 / 1689 loss=4.002, nll_loss=2.365, ppl=5.15, wps=461436, ups=1.06, wpb=433910, bsz=16254.6, num_updates=40200, lr=0.00031544, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=39688 epoch 024: 1418 / 1689 loss=4.002, nll_loss=2.365, ppl=5.15, wps=461436, ups=1.06, wpb=433910, bsz=16254.6, num_updates=40200, lr=0.00031544, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=39688 epoch 024: 1418 / 1689 loss=4.002, nll_loss=2.365, ppl=5.15, wps=461436, ups=1.06, wpb=433910, bsz=16254.6, num_updates=40200, lr=0.00031544, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=39688 epoch 024: 1418 / 1689 loss=4.002, nll_loss=2.365, ppl=5.15, wps=461436, ups=1.06, wpb=433910, bsz=16254.6, num_updates=40200, lr=0.00031544, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=39688 epoch 024: 1418 / 1689 loss=4.002, nll_loss=2.365, ppl=5.15, wps=461436, ups=1.06, wpb=433910, bsz=16254.6, num_updates=40200, lr=0.00031544, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=39688 epoch 024: 1418 / 1689 loss=4.002, nll_loss=2.365, ppl=5.15, wps=461436, ups=1.06, wpb=433910, bsz=16254.6, num_updates=40200, lr=0.00031544, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=39688 epoch 024: 1418 / 1689 loss=4.002, nll_loss=2.365, ppl=5.15, wps=461436, ups=1.06, wpb=433910, bsz=16254.6, num_updates=40200, lr=0.00031544, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=39688 epoch 024: 1518 / 1689 loss=4.01, nll_loss=2.374, ppl=5.18, wps=459054, ups=1.06, wpb=433106, bsz=16367.9, num_updates=40300, lr=0.000315049, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=39783 epoch 024: 1518 / 1689 loss=4.01, nll_loss=2.374, ppl=5.18, wps=459054, ups=1.06, wpb=433106, bsz=16367.9, num_updates=40300, lr=0.000315049, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=39783 epoch 024: 1518 / 1689 loss=4.01, nll_loss=2.374, ppl=5.18, wps=459054, ups=1.06, wpb=433106, bsz=16367.9, num_updates=40300, lr=0.000315049, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=39783 epoch 024: 1518 / 1689 loss=4.01, nll_loss=2.374, ppl=5.18, wps=459054, ups=1.06, wpb=433106, bsz=16367.9, num_updates=40300, lr=0.000315049, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=39783 epoch 024: 1518 / 1689 loss=4.01, nll_loss=2.374, ppl=5.18, wps=459054, ups=1.06, wpb=433106, bsz=16367.9, num_updates=40300, lr=0.000315049, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=39783 epoch 024: 1518 / 1689 loss=4.01, nll_loss=2.374, ppl=5.18, wps=459054, ups=1.06, wpb=433106, bsz=16367.9, num_updates=40300, lr=0.000315049, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=39783 epoch 024: 1518 / 1689 loss=4.01, nll_loss=2.374, ppl=5.18, wps=459054, ups=1.06, wpb=433106, bsz=16367.9, num_updates=40300, lr=0.000315049, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=39783 epoch 024: 1518 / 1689 loss=4.01, nll_loss=2.374, ppl=5.18, wps=459054, ups=1.06, wpb=433106, bsz=16367.9, num_updates=40300, lr=0.000315049, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=39783 epoch 024: 1518 / 1689 loss=4.01, nll_loss=2.374, ppl=5.18, wps=459054, ups=1.06, wpb=433106, bsz=16367.9, num_updates=40300, lr=0.000315049, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=39783 epoch 024: 1518 / 1689 loss=4.01, nll_loss=2.374, ppl=5.18, wps=459054, ups=1.06, wpb=433106, bsz=16367.9, num_updates=40300, lr=0.000315049, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=39783 epoch 024: 1518 / 1689 loss=4.01, nll_loss=2.374, ppl=5.18, wps=459054, ups=1.06, wpb=433106, bsz=16367.9, num_updates=40300, lr=0.000315049, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=39783 epoch 024: 1518 / 1689 loss=4.01, nll_loss=2.374, ppl=5.18, wps=459054, ups=1.06, wpb=433106, bsz=16367.9, num_updates=40300, lr=0.000315049, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=39783 epoch 024: 1518 / 1689 loss=4.01, nll_loss=2.374, ppl=5.18, wps=459054, ups=1.06, wpb=433106, bsz=16367.9, num_updates=40300, lr=0.000315049, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=39783 epoch 024: 1518 / 1689 loss=4.01, nll_loss=2.374, ppl=5.18, wps=459054, ups=1.06, wpb=433106, bsz=16367.9, num_updates=40300, lr=0.000315049, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=39783 epoch 024: 1518 / 1689 loss=4.01, nll_loss=2.374, ppl=5.18, wps=459054, ups=1.06, wpb=433106, bsz=16367.9, num_updates=40300, lr=0.000315049, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=39783 epoch 024: 1518 / 1689 loss=4.01, nll_loss=2.374, ppl=5.18, wps=459054, ups=1.06, wpb=433106, bsz=16367.9, num_updates=40300, lr=0.000315049, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=39783 epoch 024: 1518 / 1689 loss=4.01, nll_loss=2.374, ppl=5.18, wps=459054, ups=1.06, wpb=433106, bsz=16367.9, num_updates=40300, lr=0.000315049, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=39783 epoch 024: 1518 / 1689 loss=4.01, nll_loss=2.374, ppl=5.18, wps=459054, ups=1.06, wpb=433106, bsz=16367.9, num_updates=40300, lr=0.000315049, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=39783 epoch 024: 1518 / 1689 loss=4.01, nll_loss=2.374, ppl=5.18, wps=459054, ups=1.06, wpb=433106, bsz=16367.9, num_updates=40300, lr=0.000315049, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=39783 epoch 024: 1518 / 1689 loss=4.01, nll_loss=2.374, ppl=5.18, wps=459054, ups=1.06, wpb=433106, bsz=16367.9, num_updates=40300, lr=0.000315049, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=39783 epoch 024: 1518 / 1689 loss=4.01, nll_loss=2.374, ppl=5.18, wps=459054, ups=1.06, wpb=433106, bsz=16367.9, num_updates=40300, lr=0.000315049, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=39783 epoch 024: 1518 / 1689 loss=4.01, nll_loss=2.374, ppl=5.18, wps=459054, ups=1.06, wpb=433106, bsz=16367.9, num_updates=40300, lr=0.000315049, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=39783 epoch 024: 1518 / 1689 loss=4.01, nll_loss=2.374, ppl=5.18, wps=459054, ups=1.06, wpb=433106, bsz=16367.9, num_updates=40300, lr=0.000315049, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=39783 epoch 024: 1518 / 1689 loss=4.01, nll_loss=2.374, ppl=5.18, wps=459054, ups=1.06, wpb=433106, bsz=16367.9, num_updates=40300, lr=0.000315049, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=39783 epoch 024: 1618 / 1689 loss=4.013, nll_loss=2.379, ppl=5.2, wps=462864, ups=1.07, wpb=433802, bsz=16614.2, num_updates=40400, lr=0.000314658, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=39876 epoch 024: 1618 / 1689 loss=4.013, nll_loss=2.379, ppl=5.2, wps=462864, ups=1.07, wpb=433802, bsz=16614.2, num_updates=40400, lr=0.000314658, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=39876 epoch 024: 1618 / 1689 loss=4.013, nll_loss=2.379, ppl=5.2, wps=462864, ups=1.07, wpb=433802, bsz=16614.2, num_updates=40400, lr=0.000314658, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=39876 epoch 024: 1618 / 1689 loss=4.013, nll_loss=2.379, ppl=5.2, wps=462864, ups=1.07, wpb=433802, bsz=16614.2, num_updates=40400, lr=0.000314658, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=39876 epoch 024: 1618 / 1689 loss=4.013, nll_loss=2.379, ppl=5.2, wps=462864, ups=1.07, wpb=433802, bsz=16614.2, num_updates=40400, lr=0.000314658, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=39876 epoch 024: 1618 / 1689 loss=4.013, nll_loss=2.379, ppl=5.2, wps=462864, ups=1.07, wpb=433802, bsz=16614.2, num_updates=40400, lr=0.000314658, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=39876 epoch 024: 1618 / 1689 loss=4.013, nll_loss=2.379, ppl=5.2, wps=462864, ups=1.07, wpb=433802, bsz=16614.2, num_updates=40400, lr=0.000314658, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=39876 epoch 024: 1618 / 1689 loss=4.013, nll_loss=2.379, ppl=5.2, wps=462864, ups=1.07, wpb=433802, bsz=16614.2, num_updates=40400, lr=0.000314658, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=39876 epoch 024: 1618 / 1689 loss=4.013, nll_loss=2.379, ppl=5.2, wps=462864, ups=1.07, wpb=433802, bsz=16614.2, num_updates=40400, lr=0.000314658, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=39876 epoch 024: 1618 / 1689 loss=4.013, nll_loss=2.379, ppl=5.2, wps=462864, ups=1.07, wpb=433802, bsz=16614.2, num_updates=40400, lr=0.000314658, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=39876 epoch 024: 1618 / 1689 loss=4.013, nll_loss=2.379, ppl=5.2, wps=462864, ups=1.07, wpb=433802, bsz=16614.2, num_updates=40400, lr=0.000314658, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=39876 epoch 024: 1618 / 1689 loss=4.013, nll_loss=2.379, ppl=5.2, wps=462864, ups=1.07, wpb=433802, bsz=16614.2, num_updates=40400, lr=0.000314658, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=39876 epoch 024: 1618 / 1689 loss=4.013, nll_loss=2.379, ppl=5.2, wps=462864, ups=1.07, wpb=433802, bsz=16614.2, num_updates=40400, lr=0.000314658, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=39876 epoch 024: 1618 / 1689 loss=4.013, nll_loss=2.379, ppl=5.2, wps=462864, ups=1.07, wpb=433802, bsz=16614.2, num_updates=40400, lr=0.000314658, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=39876 epoch 024: 1618 / 1689 loss=4.013, nll_loss=2.379, ppl=5.2, wps=462864, ups=1.07, wpb=433802, bsz=16614.2, num_updates=40400, lr=0.000314658, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=39876 epoch 024: 1618 / 1689 loss=4.013, nll_loss=2.379, ppl=5.2, wps=462864, ups=1.07, wpb=433802, bsz=16614.2, num_updates=40400, lr=0.000314658, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=39876 epoch 024: 1618 / 1689 loss=4.013, nll_loss=2.379, ppl=5.2, wps=462864, ups=1.07, wpb=433802, bsz=16614.2, num_updates=40400, lr=0.000314658, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=39876 epoch 024: 1618 / 1689 loss=4.013, nll_loss=2.379, ppl=5.2, wps=462864, ups=1.07, wpb=433802, bsz=16614.2, num_updates=40400, lr=0.000314658, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=39876 epoch 024: 1618 / 1689 loss=4.013, nll_loss=2.379, ppl=5.2, wps=462864, ups=1.07, wpb=433802, bsz=16614.2, num_updates=40400, lr=0.000314658, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=39876 epoch 024: 1618 / 1689 loss=4.013, nll_loss=2.379, ppl=5.2, wps=462864, ups=1.07, wpb=433802, bsz=16614.2, num_updates=40400, lr=0.000314658, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=39876 epoch 024: 1618 / 1689 loss=4.013, nll_loss=2.379, ppl=5.2, wps=462864, ups=1.07, wpb=433802, bsz=16614.2, num_updates=40400, lr=0.000314658, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=39876 epoch 024: 1618 / 1689 loss=4.013, nll_loss=2.379, ppl=5.2, wps=462864, ups=1.07, wpb=433802, bsz=16614.2, num_updates=40400, lr=0.000314658, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=39876 epoch 024: 1618 / 1689 loss=4.013, nll_loss=2.379, ppl=5.2, wps=462864, ups=1.07, wpb=433802, bsz=16614.2, num_updates=40400, lr=0.000314658, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=39876 epoch 024: 1618 / 1689 loss=4.013, nll_loss=2.379, ppl=5.2, wps=462864, ups=1.07, wpb=433802, bsz=16614.2, num_updates=40400, lr=0.000314658, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=39876 end of epoch 24 (average epoch stats below) epoch 024 | loss 3.994 | nll_loss 2.356 | ppl 5.12 | wps 440149 | ups 1.02 | wpb 433548 | bsz 16503.4 | num_updates 40470 | lr 0.000314386 | gnorm 0.212 | clip 0 | loss_scale 0.5 | train_wall 1594 | gb_free 19.8 | wall 39942 epoch 024 | loss 3.994 | nll_loss 2.356 | ppl 5.12 | wps 440149 | ups 1.02 | wpb 433548 | bsz 16503.4 | num_updates 40470 | lr 0.000314386 | gnorm 0.212 | clip 0 | loss_scale 0.5 | train_wall 1594 | gb_free 19.8 | wall 39942 epoch 024 | loss 3.994 | nll_loss 2.356 | ppl 5.12 | wps 440149 | ups 1.02 | wpb 433548 | bsz 16503.4 | num_updates 40470 | lr 0.000314386 | gnorm 0.212 | clip 0 | loss_scale 0.5 | train_wall 1594 | gb_free 19.8 | wall 39942 epoch 024 | loss 3.994 | nll_loss 2.356 | ppl 5.12 | wps 440149 | ups 1.02 | wpb 433548 | bsz 16503.4 | num_updates 40470 | lr 0.000314386 | gnorm 0.212 | clip 0 | loss_scale 0.5 | train_wall 1594 | gb_free 19.8 | wall 39942 epoch 024 | loss 3.994 | nll_loss 2.356 | ppl 5.12 | wps 440149 | ups 1.02 | wpb 433548 | bsz 16503.4 | num_updates 40470 | lr 0.000314386 | gnorm 0.212 | clip 0 | loss_scale 0.5 | train_wall 1594 | gb_free 19.8 | wall 39942 epoch 024 | loss 3.994 | nll_loss 2.356 | ppl 5.12 | wps 440149 | ups 1.02 | wpb 433548 | bsz 16503.4 | num_updates 40470 | lr 0.000314386 | gnorm 0.212 | clip 0 | loss_scale 0.5 | train_wall 1594 | gb_free 19.8 | wall 39942 epoch 024 | loss 3.994 | nll_loss 2.356 | ppl 5.12 | wps 440149 | ups 1.02 | wpb 433548 | bsz 16503.4 | num_updates 40470 | lr 0.000314386 | gnorm 0.212 | clip 0 | loss_scale 0.5 | train_wall 1594 | gb_free 19.8 | wall 39942 epoch 024 | loss 3.994 | nll_loss 2.356 | ppl 5.12 | wps 440149 | ups 1.02 | wpb 433548 | bsz 16503.4 | num_updates 40470 | lr 0.000314386 | gnorm 0.212 | clip 0 | loss_scale 0.5 | train_wall 1594 | gb_free 19.8 | wall 39942 epoch 024 | loss 3.994 | nll_loss 2.356 | ppl 5.12 | wps 440149 | ups 1.02 | wpb 433548 | bsz 16503.4 | num_updates 40470 | lr 0.000314386 | gnorm 0.212 | clip 0 | loss_scale 0.5 | train_wall 1594 | gb_free 19.8 | wall 39942 epoch 024 | loss 3.994 | nll_loss 2.356 | ppl 5.12 | wps 440149 | ups 1.02 | wpb 433548 | bsz 16503.4 | num_updates 40470 | lr 0.000314386 | gnorm 0.212 | clip 0 | loss_scale 0.5 | train_wall 1594 | gb_free 19.8 | wall 39942 epoch 024 | loss 3.994 | nll_loss 2.356 | ppl 5.12 | wps 440149 | ups 1.02 | wpb 433548 | bsz 16503.4 | num_updates 40470 | lr 0.000314386 | gnorm 0.212 | clip 0 | loss_scale 0.5 | train_wall 1594 | gb_free 19.8 | wall 39942 epoch 024 | loss 3.994 | nll_loss 2.356 | ppl 5.12 | wps 440149 | ups 1.02 | wpb 433548 | bsz 16503.4 | num_updates 40470 | lr 0.000314386 | gnorm 0.212 | clip 0 | loss_scale 0.5 | train_wall 1594 | gb_free 19.8 | wall 39942 epoch 024 | loss 3.994 | nll_loss 2.356 | ppl 5.12 | wps 440149 | ups 1.02 | wpb 433548 | bsz 16503.4 | num_updates 40470 | lr 0.000314386 | gnorm 0.212 | clip 0 | loss_scale 0.5 | train_wall 1594 | gb_free 19.8 | wall 39942 epoch 024 | loss 3.994 | nll_loss 2.356 | ppl 5.12 | wps 440149 | ups 1.02 | wpb 433548 | bsz 16503.4 | num_updates 40470 | lr 0.000314386 | gnorm 0.212 | clip 0 | loss_scale 0.5 | train_wall 1594 | gb_free 19.8 | wall 39942 epoch 024 | loss 3.994 | nll_loss 2.356 | ppl 5.12 | wps 440149 | ups 1.02 | wpb 433548 | bsz 16503.4 | num_updates 40470 | lr 0.000314386 | gnorm 0.212 | clip 0 | loss_scale 0.5 | train_wall 1594 | gb_free 19.8 | wall 39942 epoch 024 | loss 3.994 | nll_loss 2.356 | ppl 5.12 | wps 440149 | ups 1.02 | wpb 433548 | bsz 16503.4 | num_updates 40470 | lr 0.000314386 | gnorm 0.212 | clip 0 | loss_scale 0.5 | train_wall 1594 | gb_free 19.8 | wall 39942 epoch 024 | loss 3.994 | nll_loss 2.356 | ppl 5.12 | wps 440149 | ups 1.02 | wpb 433548 | bsz 16503.4 | num_updates 40470 | lr 0.000314386 | gnorm 0.212 | clip 0 | loss_scale 0.5 | train_wall 1594 | gb_free 19.8 | wall 39942 epoch 024 | loss 3.994 | nll_loss 2.356 | ppl 5.12 | wps 440149 | ups 1.02 | wpb 433548 | bsz 16503.4 | num_updates 40470 | lr 0.000314386 | gnorm 0.212 | clip 0 | loss_scale 0.5 | train_wall 1594 | gb_free 19.8 | wall 39942 epoch 024 | loss 3.994 | nll_loss 2.356 | ppl 5.12 | wps 440149 | ups 1.02 | wpb 433548 | bsz 16503.4 | num_updates 40470 | lr 0.000314386 | gnorm 0.212 | clip 0 | loss_scale 0.5 | train_wall 1594 | gb_free 19.8 | wall 39942 epoch 024 | loss 3.994 | nll_loss 2.356 | ppl 5.12 | wps 440149 | ups 1.02 | wpb 433548 | bsz 16503.4 | num_updates 40470 | lr 0.000314386 | gnorm 0.212 | clip 0 | loss_scale 0.5 | train_wall 1594 | gb_free 19.8 | wall 39942 epoch 024 | loss 3.994 | nll_loss 2.356 | ppl 5.12 | wps 440149 | ups 1.02 | wpb 433548 | bsz 16503.4 | num_updates 40470 | lr 0.000314386 | gnorm 0.212 | clip 0 | loss_scale 0.5 | train_wall 1594 | gb_free 19.8 | wall 39942 epoch 024 | loss 3.994 | nll_loss 2.356 | ppl 5.12 | wps 440149 | ups 1.02 | wpb 433548 | bsz 16503.4 | num_updates 40470 | lr 0.000314386 | gnorm 0.212 | clip 0 | loss_scale 0.5 | train_wall 1594 | gb_free 19.8 | wall 39942 epoch 024 | loss 3.994 | nll_loss 2.356 | ppl 5.12 | wps 440149 | ups 1.02 | wpb 433548 | bsz 16503.4 | num_updates 40470 | lr 0.000314386 | gnorm 0.212 | clip 0 | loss_scale 0.5 | train_wall 1594 | gb_free 19.8 | wall 39942 epoch 024 | loss 3.994 | nll_loss 2.356 | ppl 5.12 | wps 440149 | ups 1.02 | wpb 433548 | bsz 16503.4 | num_updates 40470 | lr 0.000314386 | gnorm 0.212 | clip 0 | loss_scale 0.5 | train_wall 1594 | gb_free 19.8 | wall 39942 Start iterating over samples epoch 025: 30 / 1689 loss=3.996, nll_loss=2.358, ppl=5.13, wps=454776, ups=1.05, wpb=431947, bsz=16219.4, num_updates=40500, lr=0.00031427, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=39971 epoch 025: 30 / 1689 loss=3.996, nll_loss=2.358, ppl=5.13, wps=454776, ups=1.05, wpb=431947, bsz=16219.4, num_updates=40500, lr=0.00031427, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=39971 epoch 025: 30 / 1689 loss=3.996, nll_loss=2.358, ppl=5.13, wps=454776, ups=1.05, wpb=431947, bsz=16219.4, num_updates=40500, lr=0.00031427, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=39971 epoch 025: 30 / 1689 loss=3.996, nll_loss=2.358, ppl=5.13, wps=454776, ups=1.05, wpb=431947, bsz=16219.4, num_updates=40500, lr=0.00031427, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=39971 epoch 025: 30 / 1689 loss=3.996, nll_loss=2.358, ppl=5.13, wps=454776, ups=1.05, wpb=431947, bsz=16219.4, num_updates=40500, lr=0.00031427, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=39971 epoch 025: 30 / 1689 loss=3.996, nll_loss=2.358, ppl=5.13, wps=454776, ups=1.05, wpb=431947, bsz=16219.4, num_updates=40500, lr=0.00031427, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=39971 epoch 025: 30 / 1689 loss=3.996, nll_loss=2.358, ppl=5.13, wps=454776, ups=1.05, wpb=431947, bsz=16219.4, num_updates=40500, lr=0.00031427, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=39971 epoch 025: 30 / 1689 loss=3.996, nll_loss=2.358, ppl=5.13, wps=454776, ups=1.05, wpb=431947, bsz=16219.4, num_updates=40500, lr=0.00031427, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=39971 epoch 025: 30 / 1689 loss=3.996, nll_loss=2.358, ppl=5.13, wps=454776, ups=1.05, wpb=431947, bsz=16219.4, num_updates=40500, lr=0.00031427, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=39971 epoch 025: 30 / 1689 loss=3.996, nll_loss=2.358, ppl=5.13, wps=454776, ups=1.05, wpb=431947, bsz=16219.4, num_updates=40500, lr=0.00031427, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=39971 epoch 025: 30 / 1689 loss=3.996, nll_loss=2.358, ppl=5.13, wps=454776, ups=1.05, wpb=431947, bsz=16219.4, num_updates=40500, lr=0.00031427, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=39971 epoch 025: 30 / 1689 loss=3.996, nll_loss=2.358, ppl=5.13, wps=454776, ups=1.05, wpb=431947, bsz=16219.4, num_updates=40500, lr=0.00031427, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=39971 epoch 025: 30 / 1689 loss=3.996, nll_loss=2.358, ppl=5.13, wps=454776, ups=1.05, wpb=431947, bsz=16219.4, num_updates=40500, lr=0.00031427, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=39971 epoch 025: 30 / 1689 loss=3.996, nll_loss=2.358, ppl=5.13, wps=454776, ups=1.05, wpb=431947, bsz=16219.4, num_updates=40500, lr=0.00031427, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=39971 epoch 025: 30 / 1689 loss=3.996, nll_loss=2.358, ppl=5.13, wps=454776, ups=1.05, wpb=431947, bsz=16219.4, num_updates=40500, lr=0.00031427, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=39971 epoch 025: 30 / 1689 loss=3.996, nll_loss=2.358, ppl=5.13, wps=454776, ups=1.05, wpb=431947, bsz=16219.4, num_updates=40500, lr=0.00031427, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=39971 epoch 025: 30 / 1689 loss=3.996, nll_loss=2.358, ppl=5.13, wps=454776, ups=1.05, wpb=431947, bsz=16219.4, num_updates=40500, lr=0.00031427, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=39971 epoch 025: 30 / 1689 loss=3.996, nll_loss=2.358, ppl=5.13, wps=454776, ups=1.05, wpb=431947, bsz=16219.4, num_updates=40500, lr=0.00031427, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=39971 epoch 025: 30 / 1689 loss=3.996, nll_loss=2.358, ppl=5.13, wps=454776, ups=1.05, wpb=431947, bsz=16219.4, num_updates=40500, lr=0.00031427, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=39971 epoch 025: 30 / 1689 loss=3.996, nll_loss=2.358, ppl=5.13, wps=454776, ups=1.05, wpb=431947, bsz=16219.4, num_updates=40500, lr=0.00031427, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=39971 epoch 025: 30 / 1689 loss=3.996, nll_loss=2.358, ppl=5.13, wps=454776, ups=1.05, wpb=431947, bsz=16219.4, num_updates=40500, lr=0.00031427, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=39971 epoch 025: 30 / 1689 loss=3.996, nll_loss=2.358, ppl=5.13, wps=454776, ups=1.05, wpb=431947, bsz=16219.4, num_updates=40500, lr=0.00031427, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=39971 epoch 025: 30 / 1689 loss=3.996, nll_loss=2.358, ppl=5.13, wps=454776, ups=1.05, wpb=431947, bsz=16219.4, num_updates=40500, lr=0.00031427, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=39971 epoch 025: 30 / 1689 loss=3.996, nll_loss=2.358, ppl=5.13, wps=454776, ups=1.05, wpb=431947, bsz=16219.4, num_updates=40500, lr=0.00031427, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=39971 epoch 025: 30 / 1689 loss=3.996, nll_loss=2.358, ppl=5.13, wps=454776, ups=1.05, wpb=431947, bsz=16219.4, num_updates=40500, lr=0.00031427, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=39971 epoch 025: 130 / 1689 loss=3.961, nll_loss=2.318, ppl=4.99, wps=458895, ups=1.06, wpb=431668, bsz=16563.2, num_updates=40600, lr=0.000313882, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=40066 epoch 025: 130 / 1689 loss=3.961, nll_loss=2.318, ppl=4.99, wps=458895, ups=1.06, wpb=431668, bsz=16563.2, num_updates=40600, lr=0.000313882, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=40066 epoch 025: 130 / 1689 loss=3.961, nll_loss=2.318, ppl=4.99, wps=458895, ups=1.06, wpb=431668, bsz=16563.2, num_updates=40600, lr=0.000313882, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=40066 epoch 025: 130 / 1689 loss=3.961, nll_loss=2.318, ppl=4.99, wps=458895, ups=1.06, wpb=431668, bsz=16563.2, num_updates=40600, lr=0.000313882, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=40066 epoch 025: 130 / 1689 loss=3.961, nll_loss=2.318, ppl=4.99, wps=458895, ups=1.06, wpb=431668, bsz=16563.2, num_updates=40600, lr=0.000313882, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=40066 epoch 025: 130 / 1689 loss=3.961, nll_loss=2.318, ppl=4.99, wps=458895, ups=1.06, wpb=431668, bsz=16563.2, num_updates=40600, lr=0.000313882, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=40066 epoch 025: 130 / 1689 loss=3.961, nll_loss=2.318, ppl=4.99, wps=458895, ups=1.06, wpb=431668, bsz=16563.2, num_updates=40600, lr=0.000313882, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=40066 epoch 025: 130 / 1689 loss=3.961, nll_loss=2.318, ppl=4.99, wps=458895, ups=1.06, wpb=431668, bsz=16563.2, num_updates=40600, lr=0.000313882, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=40066 epoch 025: 130 / 1689 loss=3.961, nll_loss=2.318, ppl=4.99, wps=458895, ups=1.06, wpb=431668, bsz=16563.2, num_updates=40600, lr=0.000313882, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=40066 epoch 025: 130 / 1689 loss=3.961, nll_loss=2.318, ppl=4.99, wps=458895, ups=1.06, wpb=431668, bsz=16563.2, num_updates=40600, lr=0.000313882, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=40066 epoch 025: 130 / 1689 loss=3.961, nll_loss=2.318, ppl=4.99, wps=458895, ups=1.06, wpb=431668, bsz=16563.2, num_updates=40600, lr=0.000313882, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=40066 epoch 025: 130 / 1689 loss=3.961, nll_loss=2.318, ppl=4.99, wps=458895, ups=1.06, wpb=431668, bsz=16563.2, num_updates=40600, lr=0.000313882, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=40066 epoch 025: 130 / 1689 loss=3.961, nll_loss=2.318, ppl=4.99, wps=458895, ups=1.06, wpb=431668, bsz=16563.2, num_updates=40600, lr=0.000313882, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=40066 epoch 025: 130 / 1689 loss=3.961, nll_loss=2.318, ppl=4.99, wps=458895, ups=1.06, wpb=431668, bsz=16563.2, num_updates=40600, lr=0.000313882, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=40066 epoch 025: 130 / 1689 loss=3.961, nll_loss=2.318, ppl=4.99, wps=458895, ups=1.06, wpb=431668, bsz=16563.2, num_updates=40600, lr=0.000313882, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=40066 epoch 025: 130 / 1689 loss=3.961, nll_loss=2.318, ppl=4.99, wps=458895, ups=1.06, wpb=431668, bsz=16563.2, num_updates=40600, lr=0.000313882, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=40066 epoch 025: 130 / 1689 loss=3.961, nll_loss=2.318, ppl=4.99, wps=458895, ups=1.06, wpb=431668, bsz=16563.2, num_updates=40600, lr=0.000313882, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=40066 epoch 025: 130 / 1689 loss=3.961, nll_loss=2.318, ppl=4.99, wps=458895, ups=1.06, wpb=431668, bsz=16563.2, num_updates=40600, lr=0.000313882, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=40066 epoch 025: 130 / 1689 loss=3.961, nll_loss=2.318, ppl=4.99, wps=458895, ups=1.06, wpb=431668, bsz=16563.2, num_updates=40600, lr=0.000313882, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=40066 epoch 025: 130 / 1689 loss=3.961, nll_loss=2.318, ppl=4.99, wps=458895, ups=1.06, wpb=431668, bsz=16563.2, num_updates=40600, lr=0.000313882, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=40066 epoch 025: 130 / 1689 loss=3.961, nll_loss=2.318, ppl=4.99, wps=458895, ups=1.06, wpb=431668, bsz=16563.2, num_updates=40600, lr=0.000313882, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=40066 epoch 025: 130 / 1689 loss=3.961, nll_loss=2.318, ppl=4.99, wps=458895, ups=1.06, wpb=431668, bsz=16563.2, num_updates=40600, lr=0.000313882, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=40066 epoch 025: 130 / 1689 loss=3.961, nll_loss=2.318, ppl=4.99, wps=458895, ups=1.06, wpb=431668, bsz=16563.2, num_updates=40600, lr=0.000313882, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=40066 epoch 025: 130 / 1689 loss=3.961, nll_loss=2.318, ppl=4.99, wps=458895, ups=1.06, wpb=431668, bsz=16563.2, num_updates=40600, lr=0.000313882, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=40066 epoch 025: 130 / 1689 loss=3.961, nll_loss=2.318, ppl=4.99, wps=458895, ups=1.06, wpb=431668, bsz=16563.2, num_updates=40600, lr=0.000313882, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=40066 epoch 025: 230 / 1689 loss=3.971, nll_loss=2.329, ppl=5.03, wps=462682, ups=1.06, wpb=435076, bsz=16504.2, num_updates=40700, lr=0.000313497, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=40160 epoch 025: 230 / 1689 loss=3.971, nll_loss=2.329, ppl=5.03, wps=462682, ups=1.06, wpb=435076, bsz=16504.2, num_updates=40700, lr=0.000313497, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=40160 epoch 025: 230 / 1689 loss=3.971, nll_loss=2.329, ppl=5.03, wps=462682, ups=1.06, wpb=435076, bsz=16504.2, num_updates=40700, lr=0.000313497, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=40160 epoch 025: 230 / 1689 loss=3.971, nll_loss=2.329, ppl=5.03, wps=462682, ups=1.06, wpb=435076, bsz=16504.2, num_updates=40700, lr=0.000313497, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=40160 epoch 025: 230 / 1689 loss=3.971, nll_loss=2.329, ppl=5.03, wps=462682, ups=1.06, wpb=435076, bsz=16504.2, num_updates=40700, lr=0.000313497, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=40160 epoch 025: 230 / 1689 loss=3.971, nll_loss=2.329, ppl=5.03, wps=462682, ups=1.06, wpb=435076, bsz=16504.2, num_updates=40700, lr=0.000313497, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=40160 epoch 025: 230 / 1689 loss=3.971, nll_loss=2.329, ppl=5.03, wps=462682, ups=1.06, wpb=435076, bsz=16504.2, num_updates=40700, lr=0.000313497, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=40160 epoch 025: 230 / 1689 loss=3.971, nll_loss=2.329, ppl=5.03, wps=462682, ups=1.06, wpb=435076, bsz=16504.2, num_updates=40700, lr=0.000313497, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=40160 epoch 025: 230 / 1689 loss=3.971, nll_loss=2.329, ppl=5.03, wps=462682, ups=1.06, wpb=435076, bsz=16504.2, num_updates=40700, lr=0.000313497, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=40160 epoch 025: 230 / 1689 loss=3.971, nll_loss=2.329, ppl=5.03, wps=462682, ups=1.06, wpb=435076, bsz=16504.2, num_updates=40700, lr=0.000313497, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=40160 epoch 025: 230 / 1689 loss=3.971, nll_loss=2.329, ppl=5.03, wps=462682, ups=1.06, wpb=435076, bsz=16504.2, num_updates=40700, lr=0.000313497, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=40160 epoch 025: 230 / 1689 loss=3.971, nll_loss=2.329, ppl=5.03, wps=462682, ups=1.06, wpb=435076, bsz=16504.2, num_updates=40700, lr=0.000313497, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=40160 epoch 025: 230 / 1689 loss=3.971, nll_loss=2.329, ppl=5.03, wps=462682, ups=1.06, wpb=435076, bsz=16504.2, num_updates=40700, lr=0.000313497, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=40160 epoch 025: 230 / 1689 loss=3.971, nll_loss=2.329, ppl=5.03, wps=462682, ups=1.06, wpb=435076, bsz=16504.2, num_updates=40700, lr=0.000313497, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=40160 epoch 025: 230 / 1689 loss=3.971, nll_loss=2.329, ppl=5.03, wps=462682, ups=1.06, wpb=435076, bsz=16504.2, num_updates=40700, lr=0.000313497, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=40160 epoch 025: 230 / 1689 loss=3.971, nll_loss=2.329, ppl=5.03, wps=462682, ups=1.06, wpb=435076, bsz=16504.2, num_updates=40700, lr=0.000313497, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=40160 epoch 025: 230 / 1689 loss=3.971, nll_loss=2.329, ppl=5.03, wps=462682, ups=1.06, wpb=435076, bsz=16504.2, num_updates=40700, lr=0.000313497, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=40160 epoch 025: 230 / 1689 loss=3.971, nll_loss=2.329, ppl=5.03, wps=462682, ups=1.06, wpb=435076, bsz=16504.2, num_updates=40700, lr=0.000313497, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=40160 epoch 025: 230 / 1689 loss=3.971, nll_loss=2.329, ppl=5.03, wps=462682, ups=1.06, wpb=435076, bsz=16504.2, num_updates=40700, lr=0.000313497, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=40160 epoch 025: 230 / 1689 loss=3.971, nll_loss=2.329, ppl=5.03, wps=462682, ups=1.06, wpb=435076, bsz=16504.2, num_updates=40700, lr=0.000313497, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=40160 epoch 025: 230 / 1689 loss=3.971, nll_loss=2.329, ppl=5.03, wps=462682, ups=1.06, wpb=435076, bsz=16504.2, num_updates=40700, lr=0.000313497, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=40160 epoch 025: 230 / 1689 loss=3.971, nll_loss=2.329, ppl=5.03, wps=462682, ups=1.06, wpb=435076, bsz=16504.2, num_updates=40700, lr=0.000313497, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=40160 epoch 025: 230 / 1689 loss=3.971, nll_loss=2.329, ppl=5.03, wps=462682, ups=1.06, wpb=435076, bsz=16504.2, num_updates=40700, lr=0.000313497, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=40160 epoch 025: 230 / 1689 loss=3.971, nll_loss=2.329, ppl=5.03, wps=462682, ups=1.06, wpb=435076, bsz=16504.2, num_updates=40700, lr=0.000313497, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=40160 epoch 025: 230 / 1689 loss=3.971, nll_loss=2.329, ppl=5.03, wps=462682, ups=1.06, wpb=435076, bsz=16504.2, num_updates=40700, lr=0.000313497, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=40160 epoch 025: 330 / 1689 loss=3.983, nll_loss=2.343, ppl=5.07, wps=459849, ups=1.06, wpb=435435, bsz=16583, num_updates=40800, lr=0.000313112, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.4, wall=40254 epoch 025: 330 / 1689 loss=3.983, nll_loss=2.343, ppl=5.07, wps=459849, ups=1.06, wpb=435435, bsz=16583, num_updates=40800, lr=0.000313112, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.4, wall=40254 epoch 025: 330 / 1689 loss=3.983, nll_loss=2.343, ppl=5.07, wps=459849, ups=1.06, wpb=435435, bsz=16583, num_updates=40800, lr=0.000313112, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.4, wall=40254 epoch 025: 330 / 1689 loss=3.983, nll_loss=2.343, ppl=5.07, wps=459849, ups=1.06, wpb=435435, bsz=16583, num_updates=40800, lr=0.000313112, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.4, wall=40254 epoch 025: 330 / 1689 loss=3.983, nll_loss=2.343, ppl=5.07, wps=459849, ups=1.06, wpb=435435, bsz=16583, num_updates=40800, lr=0.000313112, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.4, wall=40254 epoch 025: 330 / 1689 loss=3.983, nll_loss=2.343, ppl=5.07, wps=459849, ups=1.06, wpb=435435, bsz=16583, num_updates=40800, lr=0.000313112, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.4, wall=40254 epoch 025: 330 / 1689 loss=3.983, nll_loss=2.343, ppl=5.07, wps=459849, ups=1.06, wpb=435435, bsz=16583, num_updates=40800, lr=0.000313112, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.4, wall=40254 epoch 025: 330 / 1689 loss=3.983, nll_loss=2.343, ppl=5.07, wps=459849, ups=1.06, wpb=435435, bsz=16583, num_updates=40800, lr=0.000313112, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.4, wall=40254 epoch 025: 330 / 1689 loss=3.983, nll_loss=2.343, ppl=5.07, wps=459849, ups=1.06, wpb=435435, bsz=16583, num_updates=40800, lr=0.000313112, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.4, wall=40254 epoch 025: 330 / 1689 loss=3.983, nll_loss=2.343, ppl=5.07, wps=459849, ups=1.06, wpb=435435, bsz=16583, num_updates=40800, lr=0.000313112, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.4, wall=40254 epoch 025: 330 / 1689 loss=3.983, nll_loss=2.343, ppl=5.07, wps=459849, ups=1.06, wpb=435435, bsz=16583, num_updates=40800, lr=0.000313112, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.4, wall=40254 epoch 025: 330 / 1689 loss=3.983, nll_loss=2.343, ppl=5.07, wps=459849, ups=1.06, wpb=435435, bsz=16583, num_updates=40800, lr=0.000313112, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.4, wall=40254 epoch 025: 330 / 1689 loss=3.983, nll_loss=2.343, ppl=5.07, wps=459849, ups=1.06, wpb=435435, bsz=16583, num_updates=40800, lr=0.000313112, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.4, wall=40254 epoch 025: 330 / 1689 loss=3.983, nll_loss=2.343, ppl=5.07, wps=459849, ups=1.06, wpb=435435, bsz=16583, num_updates=40800, lr=0.000313112, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.4, wall=40254 epoch 025: 330 / 1689 loss=3.983, nll_loss=2.343, ppl=5.07, wps=459849, ups=1.06, wpb=435435, bsz=16583, num_updates=40800, lr=0.000313112, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.4, wall=40254 epoch 025: 330 / 1689 loss=3.983, nll_loss=2.343, ppl=5.07, wps=459849, ups=1.06, wpb=435435, bsz=16583, num_updates=40800, lr=0.000313112, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.4, wall=40254 epoch 025: 330 / 1689 loss=3.983, nll_loss=2.343, ppl=5.07, wps=459849, ups=1.06, wpb=435435, bsz=16583, num_updates=40800, lr=0.000313112, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.4, wall=40254 epoch 025: 330 / 1689 loss=3.983, nll_loss=2.343, ppl=5.07, wps=459849, ups=1.06, wpb=435435, bsz=16583, num_updates=40800, lr=0.000313112, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.4, wall=40254 epoch 025: 330 / 1689 loss=3.983, nll_loss=2.343, ppl=5.07, wps=459849, ups=1.06, wpb=435435, bsz=16583, num_updates=40800, lr=0.000313112, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.4, wall=40254 epoch 025: 330 / 1689 loss=3.983, nll_loss=2.343, ppl=5.07, wps=459849, ups=1.06, wpb=435435, bsz=16583, num_updates=40800, lr=0.000313112, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.4, wall=40254 epoch 025: 330 / 1689 loss=3.983, nll_loss=2.343, ppl=5.07, wps=459849, ups=1.06, wpb=435435, bsz=16583, num_updates=40800, lr=0.000313112, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.4, wall=40254 epoch 025: 330 / 1689 loss=3.983, nll_loss=2.343, ppl=5.07, wps=459849, ups=1.06, wpb=435435, bsz=16583, num_updates=40800, lr=0.000313112, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.4, wall=40254 epoch 025: 330 / 1689 loss=3.983, nll_loss=2.343, ppl=5.07, wps=459849, ups=1.06, wpb=435435, bsz=16583, num_updates=40800, lr=0.000313112, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.4, wall=40254 epoch 025: 330 / 1689 loss=3.983, nll_loss=2.343, ppl=5.07, wps=459849, ups=1.06, wpb=435435, bsz=16583, num_updates=40800, lr=0.000313112, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.4, wall=40254 epoch 025: 330 / 1689 loss=3.983, nll_loss=2.343, ppl=5.07, wps=459849, ups=1.06, wpb=435435, bsz=16583, num_updates=40800, lr=0.000313112, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=17.4, wall=40254 epoch 025: 430 / 1689 loss=3.979, nll_loss=2.339, ppl=5.06, wps=457037, ups=1.06, wpb=431478, bsz=16678.1, num_updates=40900, lr=0.000312729, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.3, wall=40349 epoch 025: 430 / 1689 loss=3.979, nll_loss=2.339, ppl=5.06, wps=457037, ups=1.06, wpb=431478, bsz=16678.1, num_updates=40900, lr=0.000312729, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.3, wall=40349 epoch 025: 430 / 1689 loss=3.979, nll_loss=2.339, ppl=5.06, wps=457037, ups=1.06, wpb=431478, bsz=16678.1, num_updates=40900, lr=0.000312729, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.3, wall=40349 epoch 025: 430 / 1689 loss=3.979, nll_loss=2.339, ppl=5.06, wps=457037, ups=1.06, wpb=431478, bsz=16678.1, num_updates=40900, lr=0.000312729, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.3, wall=40349 epoch 025: 430 / 1689 loss=3.979, nll_loss=2.339, ppl=5.06, wps=457037, ups=1.06, wpb=431478, bsz=16678.1, num_updates=40900, lr=0.000312729, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.3, wall=40349 epoch 025: 430 / 1689 loss=3.979, nll_loss=2.339, ppl=5.06, wps=457037, ups=1.06, wpb=431478, bsz=16678.1, num_updates=40900, lr=0.000312729, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.3, wall=40349 epoch 025: 430 / 1689 loss=3.979, nll_loss=2.339, ppl=5.06, wps=457037, ups=1.06, wpb=431478, bsz=16678.1, num_updates=40900, lr=0.000312729, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.3, wall=40349 epoch 025: 430 / 1689 loss=3.979, nll_loss=2.339, ppl=5.06, wps=457037, ups=1.06, wpb=431478, bsz=16678.1, num_updates=40900, lr=0.000312729, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.3, wall=40349 epoch 025: 430 / 1689 loss=3.979, nll_loss=2.339, ppl=5.06, wps=457037, ups=1.06, wpb=431478, bsz=16678.1, num_updates=40900, lr=0.000312729, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.3, wall=40349 epoch 025: 430 / 1689 loss=3.979, nll_loss=2.339, ppl=5.06, wps=457037, ups=1.06, wpb=431478, bsz=16678.1, num_updates=40900, lr=0.000312729, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.3, wall=40349 epoch 025: 430 / 1689 loss=3.979, nll_loss=2.339, ppl=5.06, wps=457037, ups=1.06, wpb=431478, bsz=16678.1, num_updates=40900, lr=0.000312729, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.3, wall=40349 epoch 025: 430 / 1689 loss=3.979, nll_loss=2.339, ppl=5.06, wps=457037, ups=1.06, wpb=431478, bsz=16678.1, num_updates=40900, lr=0.000312729, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.3, wall=40349 epoch 025: 430 / 1689 loss=3.979, nll_loss=2.339, ppl=5.06, wps=457037, ups=1.06, wpb=431478, bsz=16678.1, num_updates=40900, lr=0.000312729, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.3, wall=40349 epoch 025: 430 / 1689 loss=3.979, nll_loss=2.339, ppl=5.06, wps=457037, ups=1.06, wpb=431478, bsz=16678.1, num_updates=40900, lr=0.000312729, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.3, wall=40349 epoch 025: 430 / 1689 loss=3.979, nll_loss=2.339, ppl=5.06, wps=457037, ups=1.06, wpb=431478, bsz=16678.1, num_updates=40900, lr=0.000312729, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.3, wall=40349 epoch 025: 430 / 1689 loss=3.979, nll_loss=2.339, ppl=5.06, wps=457037, ups=1.06, wpb=431478, bsz=16678.1, num_updates=40900, lr=0.000312729, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.3, wall=40349 epoch 025: 430 / 1689 loss=3.979, nll_loss=2.339, ppl=5.06, wps=457037, ups=1.06, wpb=431478, bsz=16678.1, num_updates=40900, lr=0.000312729, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.3, wall=40349 epoch 025: 430 / 1689 loss=3.979, nll_loss=2.339, ppl=5.06, wps=457037, ups=1.06, wpb=431478, bsz=16678.1, num_updates=40900, lr=0.000312729, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.3, wall=40349 epoch 025: 430 / 1689 loss=3.979, nll_loss=2.339, ppl=5.06, wps=457037, ups=1.06, wpb=431478, bsz=16678.1, num_updates=40900, lr=0.000312729, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.3, wall=40349 epoch 025: 430 / 1689 loss=3.979, nll_loss=2.339, ppl=5.06, wps=457037, ups=1.06, wpb=431478, bsz=16678.1, num_updates=40900, lr=0.000312729, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.3, wall=40349 epoch 025: 430 / 1689 loss=3.979, nll_loss=2.339, ppl=5.06, wps=457037, ups=1.06, wpb=431478, bsz=16678.1, num_updates=40900, lr=0.000312729, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.3, wall=40349 epoch 025: 430 / 1689 loss=3.979, nll_loss=2.339, ppl=5.06, wps=457037, ups=1.06, wpb=431478, bsz=16678.1, num_updates=40900, lr=0.000312729, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.3, wall=40349 epoch 025: 430 / 1689 loss=3.979, nll_loss=2.339, ppl=5.06, wps=457037, ups=1.06, wpb=431478, bsz=16678.1, num_updates=40900, lr=0.000312729, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.3, wall=40349 epoch 025: 430 / 1689 loss=3.979, nll_loss=2.339, ppl=5.06, wps=457037, ups=1.06, wpb=431478, bsz=16678.1, num_updates=40900, lr=0.000312729, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.3, wall=40349 epoch 025: 430 / 1689 loss=3.979, nll_loss=2.339, ppl=5.06, wps=457037, ups=1.06, wpb=431478, bsz=16678.1, num_updates=40900, lr=0.000312729, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.3, wall=40349 epoch 025: 530 / 1689 loss=3.979, nll_loss=2.339, ppl=5.06, wps=457057, ups=1.06, wpb=432065, bsz=16609.8, num_updates=41000, lr=0.000312348, gnorm=0.199, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=40443 epoch 025: 530 / 1689 loss=3.979, nll_loss=2.339, ppl=5.06, wps=457057, ups=1.06, wpb=432065, bsz=16609.8, num_updates=41000, lr=0.000312348, gnorm=0.199, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=40443 epoch 025: 530 / 1689 loss=3.979, nll_loss=2.339, ppl=5.06, wps=457057, ups=1.06, wpb=432065, bsz=16609.8, num_updates=41000, lr=0.000312348, gnorm=0.199, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=40443 epoch 025: 530 / 1689 loss=3.979, nll_loss=2.339, ppl=5.06, wps=457057, ups=1.06, wpb=432065, bsz=16609.8, num_updates=41000, lr=0.000312348, gnorm=0.199, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=40443 epoch 025: 530 / 1689 loss=3.979, nll_loss=2.339, ppl=5.06, wps=457057, ups=1.06, wpb=432065, bsz=16609.8, num_updates=41000, lr=0.000312348, gnorm=0.199, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=40443 epoch 025: 530 / 1689 loss=3.979, nll_loss=2.339, ppl=5.06, wps=457057, ups=1.06, wpb=432065, bsz=16609.8, num_updates=41000, lr=0.000312348, gnorm=0.199, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=40443 epoch 025: 530 / 1689 loss=3.979, nll_loss=2.339, ppl=5.06, wps=457057, ups=1.06, wpb=432065, bsz=16609.8, num_updates=41000, lr=0.000312348, gnorm=0.199, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=40443 epoch 025: 530 / 1689 loss=3.979, nll_loss=2.339, ppl=5.06, wps=457057, ups=1.06, wpb=432065, bsz=16609.8, num_updates=41000, lr=0.000312348, gnorm=0.199, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=40443 epoch 025: 530 / 1689 loss=3.979, nll_loss=2.339, ppl=5.06, wps=457057, ups=1.06, wpb=432065, bsz=16609.8, num_updates=41000, lr=0.000312348, gnorm=0.199, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=40443 epoch 025: 530 / 1689 loss=3.979, nll_loss=2.339, ppl=5.06, wps=457057, ups=1.06, wpb=432065, bsz=16609.8, num_updates=41000, lr=0.000312348, gnorm=0.199, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=40443 epoch 025: 530 / 1689 loss=3.979, nll_loss=2.339, ppl=5.06, wps=457057, ups=1.06, wpb=432065, bsz=16609.8, num_updates=41000, lr=0.000312348, gnorm=0.199, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=40443 epoch 025: 530 / 1689 loss=3.979, nll_loss=2.339, ppl=5.06, wps=457057, ups=1.06, wpb=432065, bsz=16609.8, num_updates=41000, lr=0.000312348, gnorm=0.199, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=40443 epoch 025: 530 / 1689 loss=3.979, nll_loss=2.339, ppl=5.06, wps=457057, ups=1.06, wpb=432065, bsz=16609.8, num_updates=41000, lr=0.000312348, gnorm=0.199, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=40443 epoch 025: 530 / 1689 loss=3.979, nll_loss=2.339, ppl=5.06, wps=457057, ups=1.06, wpb=432065, bsz=16609.8, num_updates=41000, lr=0.000312348, gnorm=0.199, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=40443 epoch 025: 530 / 1689 loss=3.979, nll_loss=2.339, ppl=5.06, wps=457057, ups=1.06, wpb=432065, bsz=16609.8, num_updates=41000, lr=0.000312348, gnorm=0.199, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=40443 epoch 025: 530 / 1689 loss=3.979, nll_loss=2.339, ppl=5.06, wps=457057, ups=1.06, wpb=432065, bsz=16609.8, num_updates=41000, lr=0.000312348, gnorm=0.199, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=40443 epoch 025: 530 / 1689 loss=3.979, nll_loss=2.339, ppl=5.06, wps=457057, ups=1.06, wpb=432065, bsz=16609.8, num_updates=41000, lr=0.000312348, gnorm=0.199, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=40443 epoch 025: 530 / 1689 loss=3.979, nll_loss=2.339, ppl=5.06, wps=457057, ups=1.06, wpb=432065, bsz=16609.8, num_updates=41000, lr=0.000312348, gnorm=0.199, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=40443 epoch 025: 530 / 1689 loss=3.979, nll_loss=2.339, ppl=5.06, wps=457057, ups=1.06, wpb=432065, bsz=16609.8, num_updates=41000, lr=0.000312348, gnorm=0.199, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=40443 epoch 025: 530 / 1689 loss=3.979, nll_loss=2.339, ppl=5.06, wps=457057, ups=1.06, wpb=432065, bsz=16609.8, num_updates=41000, lr=0.000312348, gnorm=0.199, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=40443 epoch 025: 530 / 1689 loss=3.979, nll_loss=2.339, ppl=5.06, wps=457057, ups=1.06, wpb=432065, bsz=16609.8, num_updates=41000, lr=0.000312348, gnorm=0.199, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=40443 epoch 025: 530 / 1689 loss=3.979, nll_loss=2.339, ppl=5.06, wps=457057, ups=1.06, wpb=432065, bsz=16609.8, num_updates=41000, lr=0.000312348, gnorm=0.199, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=40443 epoch 025: 530 / 1689 loss=3.979, nll_loss=2.339, ppl=5.06, wps=457057, ups=1.06, wpb=432065, bsz=16609.8, num_updates=41000, lr=0.000312348, gnorm=0.199, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=40443 epoch 025: 530 / 1689 loss=3.979, nll_loss=2.339, ppl=5.06, wps=457057, ups=1.06, wpb=432065, bsz=16609.8, num_updates=41000, lr=0.000312348, gnorm=0.199, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=40443 epoch 025: 530 / 1689 loss=3.979, nll_loss=2.339, ppl=5.06, wps=457057, ups=1.06, wpb=432065, bsz=16609.8, num_updates=41000, lr=0.000312348, gnorm=0.199, clip=0, loss_scale=1, train_wall=93, gb_free=18.7, wall=40443 begin validation on "valid" subset epoch 025 | valid on 'valid' subset | loss 4.254 | nll_loss 2.632 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.248 epoch 025 | valid on 'valid' subset | loss 4.254 | nll_loss 2.632 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.248 epoch 025 | valid on 'valid' subset | loss 4.254 | nll_loss 2.632 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.248 epoch 025 | valid on 'valid' subset | loss 4.254 | nll_loss 2.632 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.248 epoch 025 | valid on 'valid' subset | loss 4.254 | nll_loss 2.632 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.248 epoch 025 | valid on 'valid' subset | loss 4.254 | nll_loss 2.632 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.248 epoch 025 | valid on 'valid' subset | loss 4.254 | nll_loss 2.632 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.248 epoch 025 | valid on 'valid' subset | loss 4.254 | nll_loss 2.632 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.248 epoch 025 | valid on 'valid' subset | loss 4.254 | nll_loss 2.632 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.248 epoch 025 | valid on 'valid' subset | loss 4.254 | nll_loss 2.632 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.248 epoch 025 | valid on 'valid' subset | loss 4.254 | nll_loss 2.632 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.248 epoch 025 | valid on 'valid' subset | loss 4.254 | nll_loss 2.632 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.248 epoch 025 | valid on 'valid' subset | loss 4.254 | nll_loss 2.632 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.248 epoch 025 | valid on 'valid' subset | loss 4.254 | nll_loss 2.632 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.248 epoch 025 | valid on 'valid' subset | loss 4.254 | nll_loss 2.632 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.248 epoch 025 | valid on 'valid' subset | loss 4.254 | nll_loss 2.632 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.248 epoch 025 | valid on 'valid' subset | loss 4.254 | nll_loss 2.632 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.248 epoch 025 | valid on 'valid' subset | loss 4.254 | nll_loss 2.632 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.248 epoch 025 | valid on 'valid' subset | loss 4.254 | nll_loss 2.632 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.248 epoch 025 | valid on 'valid' subset | loss 4.254 | nll_loss 2.632 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.248 epoch 025 | valid on 'valid' subset | loss 4.254 | nll_loss 2.632 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.248 epoch 025 | valid on 'valid' subset | loss 4.254 | nll_loss 2.632 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.248 epoch 025 | valid on 'valid' subset | loss 4.254 | nll_loss 2.632 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.248 epoch 025 | valid on 'valid' subset | loss 4.254 | nll_loss 2.632 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.248 epoch 025 | valid on 'valid' subset | loss 4.254 | nll_loss 2.632 | ppl 6.2 | wps 0 | wpb 42662 | bsz 2032 | num_updates 41000 | best_loss 4.248 epoch 025: 630 / 1689 loss=3.991, nll_loss=2.353, ppl=5.11, wps=408195, ups=0.94, wpb=434234, bsz=16240.6, num_updates=41100, lr=0.000311967, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=20.9, wall=40550 epoch 025: 630 / 1689 loss=3.991, nll_loss=2.353, ppl=5.11, wps=408195, ups=0.94, wpb=434234, bsz=16240.6, num_updates=41100, lr=0.000311967, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=20.9, wall=40550 epoch 025: 630 / 1689 loss=3.991, nll_loss=2.353, ppl=5.11, wps=408195, ups=0.94, wpb=434234, bsz=16240.6, num_updates=41100, lr=0.000311967, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=20.9, wall=40550 epoch 025: 630 / 1689 loss=3.991, nll_loss=2.353, ppl=5.11, wps=408195, ups=0.94, wpb=434234, bsz=16240.6, num_updates=41100, lr=0.000311967, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=20.9, wall=40550 epoch 025: 630 / 1689 loss=3.991, nll_loss=2.353, ppl=5.11, wps=408195, ups=0.94, wpb=434234, bsz=16240.6, num_updates=41100, lr=0.000311967, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=20.9, wall=40550 epoch 025: 630 / 1689 loss=3.991, nll_loss=2.353, ppl=5.11, wps=408195, ups=0.94, wpb=434234, bsz=16240.6, num_updates=41100, lr=0.000311967, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=20.9, wall=40550 epoch 025: 630 / 1689 loss=3.991, nll_loss=2.353, ppl=5.11, wps=408195, ups=0.94, wpb=434234, bsz=16240.6, num_updates=41100, lr=0.000311967, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=20.9, wall=40550 epoch 025: 630 / 1689 loss=3.991, nll_loss=2.353, ppl=5.11, wps=408195, ups=0.94, wpb=434234, bsz=16240.6, num_updates=41100, lr=0.000311967, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=20.9, wall=40550 epoch 025: 630 / 1689 loss=3.991, nll_loss=2.353, ppl=5.11, wps=408195, ups=0.94, wpb=434234, bsz=16240.6, num_updates=41100, lr=0.000311967, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=20.9, wall=40550 epoch 025: 630 / 1689 loss=3.991, nll_loss=2.353, ppl=5.11, wps=408195, ups=0.94, wpb=434234, bsz=16240.6, num_updates=41100, lr=0.000311967, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=20.9, wall=40550 epoch 025: 630 / 1689 loss=3.991, nll_loss=2.353, ppl=5.11, wps=408195, ups=0.94, wpb=434234, bsz=16240.6, num_updates=41100, lr=0.000311967, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=20.9, wall=40550 epoch 025: 630 / 1689 loss=3.991, nll_loss=2.353, ppl=5.11, wps=408195, ups=0.94, wpb=434234, bsz=16240.6, num_updates=41100, lr=0.000311967, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=20.9, wall=40550 epoch 025: 630 / 1689 loss=3.991, nll_loss=2.353, ppl=5.11, wps=408195, ups=0.94, wpb=434234, bsz=16240.6, num_updates=41100, lr=0.000311967, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=20.9, wall=40550 epoch 025: 630 / 1689 loss=3.991, nll_loss=2.353, ppl=5.11, wps=408195, ups=0.94, wpb=434234, bsz=16240.6, num_updates=41100, lr=0.000311967, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=20.9, wall=40550 epoch 025: 630 / 1689 loss=3.991, nll_loss=2.353, ppl=5.11, wps=408195, ups=0.94, wpb=434234, bsz=16240.6, num_updates=41100, lr=0.000311967, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=20.9, wall=40550 epoch 025: 630 / 1689 loss=3.991, nll_loss=2.353, ppl=5.11, wps=408195, ups=0.94, wpb=434234, bsz=16240.6, num_updates=41100, lr=0.000311967, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=20.9, wall=40550 epoch 025: 630 / 1689 loss=3.991, nll_loss=2.353, ppl=5.11, wps=408195, ups=0.94, wpb=434234, bsz=16240.6, num_updates=41100, lr=0.000311967, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=20.9, wall=40550 epoch 025: 630 / 1689 loss=3.991, nll_loss=2.353, ppl=5.11, wps=408195, ups=0.94, wpb=434234, bsz=16240.6, num_updates=41100, lr=0.000311967, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=20.9, wall=40550 epoch 025: 630 / 1689 loss=3.991, nll_loss=2.353, ppl=5.11, wps=408195, ups=0.94, wpb=434234, bsz=16240.6, num_updates=41100, lr=0.000311967, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=20.9, wall=40550 epoch 025: 630 / 1689 loss=3.991, nll_loss=2.353, ppl=5.11, wps=408195, ups=0.94, wpb=434234, bsz=16240.6, num_updates=41100, lr=0.000311967, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=20.9, wall=40550 epoch 025: 630 / 1689 loss=3.991, nll_loss=2.353, ppl=5.11, wps=408195, ups=0.94, wpb=434234, bsz=16240.6, num_updates=41100, lr=0.000311967, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=20.9, wall=40550 epoch 025: 630 / 1689 loss=3.991, nll_loss=2.353, ppl=5.11, wps=408195, ups=0.94, wpb=434234, bsz=16240.6, num_updates=41100, lr=0.000311967, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=20.9, wall=40550 epoch 025: 630 / 1689 loss=3.991, nll_loss=2.353, ppl=5.11, wps=408195, ups=0.94, wpb=434234, bsz=16240.6, num_updates=41100, lr=0.000311967, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=20.9, wall=40550 epoch 025: 630 / 1689 loss=3.991, nll_loss=2.353, ppl=5.11, wps=408195, ups=0.94, wpb=434234, bsz=16240.6, num_updates=41100, lr=0.000311967, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=20.9, wall=40550 epoch 025: 630 / 1689 loss=3.991, nll_loss=2.353, ppl=5.11, wps=408195, ups=0.94, wpb=434234, bsz=16240.6, num_updates=41100, lr=0.000311967, gnorm=0.214, clip=0, loss_scale=1, train_wall=93, gb_free=20.9, wall=40550 epoch 025: 731 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=456646, ups=1.05, wpb=433779, bsz=16820.3, num_updates=41200, lr=0.000311588, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=40645 epoch 025: 731 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=456646, ups=1.05, wpb=433779, bsz=16820.3, num_updates=41200, lr=0.000311588, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=40645 epoch 025: 731 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=456646, ups=1.05, wpb=433779, bsz=16820.3, num_updates=41200, lr=0.000311588, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=40645 epoch 025: 731 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=456646, ups=1.05, wpb=433779, bsz=16820.3, num_updates=41200, lr=0.000311588, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=40645 epoch 025: 731 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=456646, ups=1.05, wpb=433779, bsz=16820.3, num_updates=41200, lr=0.000311588, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=40645 epoch 025: 731 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=456646, ups=1.05, wpb=433779, bsz=16820.3, num_updates=41200, lr=0.000311588, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=40645 epoch 025: 731 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=456646, ups=1.05, wpb=433779, bsz=16820.3, num_updates=41200, lr=0.000311588, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=40645 epoch 025: 731 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=456646, ups=1.05, wpb=433779, bsz=16820.3, num_updates=41200, lr=0.000311588, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=40645 epoch 025: 731 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=456646, ups=1.05, wpb=433779, bsz=16820.3, num_updates=41200, lr=0.000311588, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=40645 epoch 025: 731 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=456646, ups=1.05, wpb=433779, bsz=16820.3, num_updates=41200, lr=0.000311588, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=40645 epoch 025: 731 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=456646, ups=1.05, wpb=433779, bsz=16820.3, num_updates=41200, lr=0.000311588, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=40645 epoch 025: 731 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=456646, ups=1.05, wpb=433779, bsz=16820.3, num_updates=41200, lr=0.000311588, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=40645 epoch 025: 731 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=456646, ups=1.05, wpb=433779, bsz=16820.3, num_updates=41200, lr=0.000311588, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=40645 epoch 025: 731 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=456646, ups=1.05, wpb=433779, bsz=16820.3, num_updates=41200, lr=0.000311588, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=40645 epoch 025: 731 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=456646, ups=1.05, wpb=433779, bsz=16820.3, num_updates=41200, lr=0.000311588, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=40645 epoch 025: 731 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=456646, ups=1.05, wpb=433779, bsz=16820.3, num_updates=41200, lr=0.000311588, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=40645 epoch 025: 731 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=456646, ups=1.05, wpb=433779, bsz=16820.3, num_updates=41200, lr=0.000311588, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=40645 epoch 025: 731 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=456646, ups=1.05, wpb=433779, bsz=16820.3, num_updates=41200, lr=0.000311588, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=40645 epoch 025: 731 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=456646, ups=1.05, wpb=433779, bsz=16820.3, num_updates=41200, lr=0.000311588, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=40645 epoch 025: 731 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=456646, ups=1.05, wpb=433779, bsz=16820.3, num_updates=41200, lr=0.000311588, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=40645 epoch 025: 731 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=456646, ups=1.05, wpb=433779, bsz=16820.3, num_updates=41200, lr=0.000311588, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=40645 epoch 025: 731 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=456646, ups=1.05, wpb=433779, bsz=16820.3, num_updates=41200, lr=0.000311588, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=40645 epoch 025: 731 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=456646, ups=1.05, wpb=433779, bsz=16820.3, num_updates=41200, lr=0.000311588, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=40645 epoch 025: 731 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=456646, ups=1.05, wpb=433779, bsz=16820.3, num_updates=41200, lr=0.000311588, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=40645 epoch 025: 731 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=456646, ups=1.05, wpb=433779, bsz=16820.3, num_updates=41200, lr=0.000311588, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=40645 epoch 025: 831 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=461829, ups=1.06, wpb=433685, bsz=16396.9, num_updates=41300, lr=0.000311211, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=40738 epoch 025: 831 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=461829, ups=1.06, wpb=433685, bsz=16396.9, num_updates=41300, lr=0.000311211, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=40738 epoch 025: 831 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=461829, ups=1.06, wpb=433685, bsz=16396.9, num_updates=41300, lr=0.000311211, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=40738 epoch 025: 831 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=461829, ups=1.06, wpb=433685, bsz=16396.9, num_updates=41300, lr=0.000311211, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=40738 epoch 025: 831 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=461829, ups=1.06, wpb=433685, bsz=16396.9, num_updates=41300, lr=0.000311211, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=40738 epoch 025: 831 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=461829, ups=1.06, wpb=433685, bsz=16396.9, num_updates=41300, lr=0.000311211, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=40738 epoch 025: 831 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=461829, ups=1.06, wpb=433685, bsz=16396.9, num_updates=41300, lr=0.000311211, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=40738 epoch 025: 831 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=461829, ups=1.06, wpb=433685, bsz=16396.9, num_updates=41300, lr=0.000311211, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=40738 epoch 025: 831 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=461829, ups=1.06, wpb=433685, bsz=16396.9, num_updates=41300, lr=0.000311211, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=40738 epoch 025: 831 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=461829, ups=1.06, wpb=433685, bsz=16396.9, num_updates=41300, lr=0.000311211, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=40738 epoch 025: 831 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=461829, ups=1.06, wpb=433685, bsz=16396.9, num_updates=41300, lr=0.000311211, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=40738 epoch 025: 831 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=461829, ups=1.06, wpb=433685, bsz=16396.9, num_updates=41300, lr=0.000311211, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=40738 epoch 025: 831 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=461829, ups=1.06, wpb=433685, bsz=16396.9, num_updates=41300, lr=0.000311211, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=40738 epoch 025: 831 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=461829, ups=1.06, wpb=433685, bsz=16396.9, num_updates=41300, lr=0.000311211, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=40738 epoch 025: 831 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=461829, ups=1.06, wpb=433685, bsz=16396.9, num_updates=41300, lr=0.000311211, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=40738 epoch 025: 831 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=461829, ups=1.06, wpb=433685, bsz=16396.9, num_updates=41300, lr=0.000311211, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=40738 epoch 025: 831 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=461829, ups=1.06, wpb=433685, bsz=16396.9, num_updates=41300, lr=0.000311211, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=40738 epoch 025: 831 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=461829, ups=1.06, wpb=433685, bsz=16396.9, num_updates=41300, lr=0.000311211, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=40738 epoch 025: 831 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=461829, ups=1.06, wpb=433685, bsz=16396.9, num_updates=41300, lr=0.000311211, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=40738 epoch 025: 831 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=461829, ups=1.06, wpb=433685, bsz=16396.9, num_updates=41300, lr=0.000311211, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=40738 epoch 025: 831 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=461829, ups=1.06, wpb=433685, bsz=16396.9, num_updates=41300, lr=0.000311211, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=40738 epoch 025: 831 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=461829, ups=1.06, wpb=433685, bsz=16396.9, num_updates=41300, lr=0.000311211, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=40738 epoch 025: 831 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=461829, ups=1.06, wpb=433685, bsz=16396.9, num_updates=41300, lr=0.000311211, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=40738 epoch 025: 831 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=461829, ups=1.06, wpb=433685, bsz=16396.9, num_updates=41300, lr=0.000311211, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=40738 epoch 025: 831 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=461829, ups=1.06, wpb=433685, bsz=16396.9, num_updates=41300, lr=0.000311211, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=40738 epoch 025: 931 / 1689 loss=3.984, nll_loss=2.345, ppl=5.08, wps=458222, ups=1.06, wpb=433026, bsz=16806.6, num_updates=41400, lr=0.000310835, gnorm=0.233, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.9, wall=40833 epoch 025: 931 / 1689 loss=3.984, nll_loss=2.345, ppl=5.08, wps=458222, ups=1.06, wpb=433026, bsz=16806.6, num_updates=41400, lr=0.000310835, gnorm=0.233, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.9, wall=40833 epoch 025: 931 / 1689 loss=3.984, nll_loss=2.345, ppl=5.08, wps=458222, ups=1.06, wpb=433026, bsz=16806.6, num_updates=41400, lr=0.000310835, gnorm=0.233, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.9, wall=40833 epoch 025: 931 / 1689 loss=3.984, nll_loss=2.345, ppl=5.08, wps=458222, ups=1.06, wpb=433026, bsz=16806.6, num_updates=41400, lr=0.000310835, gnorm=0.233, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.9, wall=40833 epoch 025: 931 / 1689 loss=3.984, nll_loss=2.345, ppl=5.08, wps=458222, ups=1.06, wpb=433026, bsz=16806.6, num_updates=41400, lr=0.000310835, gnorm=0.233, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.9, wall=40833 epoch 025: 931 / 1689 loss=3.984, nll_loss=2.345, ppl=5.08, wps=458222, ups=1.06, wpb=433026, bsz=16806.6, num_updates=41400, lr=0.000310835, gnorm=0.233, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.9, wall=40833 epoch 025: 931 / 1689 loss=3.984, nll_loss=2.345, ppl=5.08, wps=458222, ups=1.06, wpb=433026, bsz=16806.6, num_updates=41400, lr=0.000310835, gnorm=0.233, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.9, wall=40833 epoch 025: 931 / 1689 loss=3.984, nll_loss=2.345, ppl=5.08, wps=458222, ups=1.06, wpb=433026, bsz=16806.6, num_updates=41400, lr=0.000310835, gnorm=0.233, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.9, wall=40833 epoch 025: 931 / 1689 loss=3.984, nll_loss=2.345, ppl=5.08, wps=458222, ups=1.06, wpb=433026, bsz=16806.6, num_updates=41400, lr=0.000310835, gnorm=0.233, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.9, wall=40833 epoch 025: 931 / 1689 loss=3.984, nll_loss=2.345, ppl=5.08, wps=458222, ups=1.06, wpb=433026, bsz=16806.6, num_updates=41400, lr=0.000310835, gnorm=0.233, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.9, wall=40833 epoch 025: 931 / 1689 loss=3.984, nll_loss=2.345, ppl=5.08, wps=458222, ups=1.06, wpb=433026, bsz=16806.6, num_updates=41400, lr=0.000310835, gnorm=0.233, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.9, wall=40833 epoch 025: 931 / 1689 loss=3.984, nll_loss=2.345, ppl=5.08, wps=458222, ups=1.06, wpb=433026, bsz=16806.6, num_updates=41400, lr=0.000310835, gnorm=0.233, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.9, wall=40833 epoch 025: 931 / 1689 loss=3.984, nll_loss=2.345, ppl=5.08, wps=458222, ups=1.06, wpb=433026, bsz=16806.6, num_updates=41400, lr=0.000310835, gnorm=0.233, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.9, wall=40833 epoch 025: 931 / 1689 loss=3.984, nll_loss=2.345, ppl=5.08, wps=458222, ups=1.06, wpb=433026, bsz=16806.6, num_updates=41400, lr=0.000310835, gnorm=0.233, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.9, wall=40833 epoch 025: 931 / 1689 loss=3.984, nll_loss=2.345, ppl=5.08, wps=458222, ups=1.06, wpb=433026, bsz=16806.6, num_updates=41400, lr=0.000310835, gnorm=0.233, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.9, wall=40833 epoch 025: 931 / 1689 loss=3.984, nll_loss=2.345, ppl=5.08, wps=458222, ups=1.06, wpb=433026, bsz=16806.6, num_updates=41400, lr=0.000310835, gnorm=0.233, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.9, wall=40833 epoch 025: 931 / 1689 loss=3.984, nll_loss=2.345, ppl=5.08, wps=458222, ups=1.06, wpb=433026, bsz=16806.6, num_updates=41400, lr=0.000310835, gnorm=0.233, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.9, wall=40833 epoch 025: 931 / 1689 loss=3.984, nll_loss=2.345, ppl=5.08, wps=458222, ups=1.06, wpb=433026, bsz=16806.6, num_updates=41400, lr=0.000310835, gnorm=0.233, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.9, wall=40833 epoch 025: 931 / 1689 loss=3.984, nll_loss=2.345, ppl=5.08, wps=458222, ups=1.06, wpb=433026, bsz=16806.6, num_updates=41400, lr=0.000310835, gnorm=0.233, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.9, wall=40833 epoch 025: 931 / 1689 loss=3.984, nll_loss=2.345, ppl=5.08, wps=458222, ups=1.06, wpb=433026, bsz=16806.6, num_updates=41400, lr=0.000310835, gnorm=0.233, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.9, wall=40833 epoch 025: 931 / 1689 loss=3.984, nll_loss=2.345, ppl=5.08, wps=458222, ups=1.06, wpb=433026, bsz=16806.6, num_updates=41400, lr=0.000310835, gnorm=0.233, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.9, wall=40833 epoch 025: 931 / 1689 loss=3.984, nll_loss=2.345, ppl=5.08, wps=458222, ups=1.06, wpb=433026, bsz=16806.6, num_updates=41400, lr=0.000310835, gnorm=0.233, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.9, wall=40833 epoch 025: 931 / 1689 loss=3.984, nll_loss=2.345, ppl=5.08, wps=458222, ups=1.06, wpb=433026, bsz=16806.6, num_updates=41400, lr=0.000310835, gnorm=0.233, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.9, wall=40833 epoch 025: 931 / 1689 loss=3.984, nll_loss=2.345, ppl=5.08, wps=458222, ups=1.06, wpb=433026, bsz=16806.6, num_updates=41400, lr=0.000310835, gnorm=0.233, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.9, wall=40833 epoch 025: 931 / 1689 loss=3.984, nll_loss=2.345, ppl=5.08, wps=458222, ups=1.06, wpb=433026, bsz=16806.6, num_updates=41400, lr=0.000310835, gnorm=0.233, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.9, wall=40833 epoch 025: 1031 / 1689 loss=4, nll_loss=2.364, ppl=5.15, wps=463278, ups=1.07, wpb=434768, bsz=16329.4, num_updates=41500, lr=0.00031046, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.3, wall=40927 epoch 025: 1031 / 1689 loss=4, nll_loss=2.364, ppl=5.15, wps=463278, ups=1.07, wpb=434768, bsz=16329.4, num_updates=41500, lr=0.00031046, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.3, wall=40927 epoch 025: 1031 / 1689 loss=4, nll_loss=2.364, ppl=5.15, wps=463278, ups=1.07, wpb=434768, bsz=16329.4, num_updates=41500, lr=0.00031046, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.3, wall=40927 epoch 025: 1031 / 1689 loss=4, nll_loss=2.364, ppl=5.15, wps=463278, ups=1.07, wpb=434768, bsz=16329.4, num_updates=41500, lr=0.00031046, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.3, wall=40927 epoch 025: 1031 / 1689 loss=4, nll_loss=2.364, ppl=5.15, wps=463278, ups=1.07, wpb=434768, bsz=16329.4, num_updates=41500, lr=0.00031046, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.3, wall=40927 epoch 025: 1031 / 1689 loss=4, nll_loss=2.364, ppl=5.15, wps=463278, ups=1.07, wpb=434768, bsz=16329.4, num_updates=41500, lr=0.00031046, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.3, wall=40927 epoch 025: 1031 / 1689 loss=4, nll_loss=2.364, ppl=5.15, wps=463278, ups=1.07, wpb=434768, bsz=16329.4, num_updates=41500, lr=0.00031046, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.3, wall=40927 epoch 025: 1031 / 1689 loss=4, nll_loss=2.364, ppl=5.15, wps=463278, ups=1.07, wpb=434768, bsz=16329.4, num_updates=41500, lr=0.00031046, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.3, wall=40927 epoch 025: 1031 / 1689 loss=4, nll_loss=2.364, ppl=5.15, wps=463278, ups=1.07, wpb=434768, bsz=16329.4, num_updates=41500, lr=0.00031046, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.3, wall=40927 epoch 025: 1031 / 1689 loss=4, nll_loss=2.364, ppl=5.15, wps=463278, ups=1.07, wpb=434768, bsz=16329.4, num_updates=41500, lr=0.00031046, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.3, wall=40927 epoch 025: 1031 / 1689 loss=4, nll_loss=2.364, ppl=5.15, wps=463278, ups=1.07, wpb=434768, bsz=16329.4, num_updates=41500, lr=0.00031046, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.3, wall=40927 epoch 025: 1031 / 1689 loss=4, nll_loss=2.364, ppl=5.15, wps=463278, ups=1.07, wpb=434768, bsz=16329.4, num_updates=41500, lr=0.00031046, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.3, wall=40927 epoch 025: 1031 / 1689 loss=4, nll_loss=2.364, ppl=5.15, wps=463278, ups=1.07, wpb=434768, bsz=16329.4, num_updates=41500, lr=0.00031046, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.3, wall=40927 epoch 025: 1031 / 1689 loss=4, nll_loss=2.364, ppl=5.15, wps=463278, ups=1.07, wpb=434768, bsz=16329.4, num_updates=41500, lr=0.00031046, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.3, wall=40927 epoch 025: 1031 / 1689 loss=4, nll_loss=2.364, ppl=5.15, wps=463278, ups=1.07, wpb=434768, bsz=16329.4, num_updates=41500, lr=0.00031046, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.3, wall=40927 epoch 025: 1031 / 1689 loss=4, nll_loss=2.364, ppl=5.15, wps=463278, ups=1.07, wpb=434768, bsz=16329.4, num_updates=41500, lr=0.00031046, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.3, wall=40927 epoch 025: 1031 / 1689 loss=4, nll_loss=2.364, ppl=5.15, wps=463278, ups=1.07, wpb=434768, bsz=16329.4, num_updates=41500, lr=0.00031046, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.3, wall=40927 epoch 025: 1031 / 1689 loss=4, nll_loss=2.364, ppl=5.15, wps=463278, ups=1.07, wpb=434768, bsz=16329.4, num_updates=41500, lr=0.00031046, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.3, wall=40927 epoch 025: 1031 / 1689 loss=4, nll_loss=2.364, ppl=5.15, wps=463278, ups=1.07, wpb=434768, bsz=16329.4, num_updates=41500, lr=0.00031046, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.3, wall=40927 epoch 025: 1031 / 1689 loss=4, nll_loss=2.364, ppl=5.15, wps=463278, ups=1.07, wpb=434768, bsz=16329.4, num_updates=41500, lr=0.00031046, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.3, wall=40927 epoch 025: 1031 / 1689 loss=4, nll_loss=2.364, ppl=5.15, wps=463278, ups=1.07, wpb=434768, bsz=16329.4, num_updates=41500, lr=0.00031046, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.3, wall=40927 epoch 025: 1031 / 1689 loss=4, nll_loss=2.364, ppl=5.15, wps=463278, ups=1.07, wpb=434768, bsz=16329.4, num_updates=41500, lr=0.00031046, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.3, wall=40927 epoch 025: 1031 / 1689 loss=4, nll_loss=2.364, ppl=5.15, wps=463278, ups=1.07, wpb=434768, bsz=16329.4, num_updates=41500, lr=0.00031046, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.3, wall=40927 epoch 025: 1031 / 1689 loss=4, nll_loss=2.364, ppl=5.15, wps=463278, ups=1.07, wpb=434768, bsz=16329.4, num_updates=41500, lr=0.00031046, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.3, wall=40927 epoch 025: 1031 / 1689 loss=4, nll_loss=2.364, ppl=5.15, wps=463278, ups=1.07, wpb=434768, bsz=16329.4, num_updates=41500, lr=0.00031046, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.3, wall=40927 epoch 025: 1131 / 1689 loss=3.993, nll_loss=2.355, ppl=5.11, wps=464336, ups=1.07, wpb=432631, bsz=16184.7, num_updates=41600, lr=0.000310087, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=41020 epoch 025: 1131 / 1689 loss=3.993, nll_loss=2.355, ppl=5.11, wps=464336, ups=1.07, wpb=432631, bsz=16184.7, num_updates=41600, lr=0.000310087, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=41020 epoch 025: 1131 / 1689 loss=3.993, nll_loss=2.355, ppl=5.11, wps=464336, ups=1.07, wpb=432631, bsz=16184.7, num_updates=41600, lr=0.000310087, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=41020 epoch 025: 1131 / 1689 loss=3.993, nll_loss=2.355, ppl=5.11, wps=464336, ups=1.07, wpb=432631, bsz=16184.7, num_updates=41600, lr=0.000310087, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=41020 epoch 025: 1131 / 1689 loss=3.993, nll_loss=2.355, ppl=5.11, wps=464336, ups=1.07, wpb=432631, bsz=16184.7, num_updates=41600, lr=0.000310087, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=41020 epoch 025: 1131 / 1689 loss=3.993, nll_loss=2.355, ppl=5.11, wps=464336, ups=1.07, wpb=432631, bsz=16184.7, num_updates=41600, lr=0.000310087, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=41020 epoch 025: 1131 / 1689 loss=3.993, nll_loss=2.355, ppl=5.11, wps=464336, ups=1.07, wpb=432631, bsz=16184.7, num_updates=41600, lr=0.000310087, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=41020 epoch 025: 1131 / 1689 loss=3.993, nll_loss=2.355, ppl=5.11, wps=464336, ups=1.07, wpb=432631, bsz=16184.7, num_updates=41600, lr=0.000310087, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=41020 epoch 025: 1131 / 1689 loss=3.993, nll_loss=2.355, ppl=5.11, wps=464336, ups=1.07, wpb=432631, bsz=16184.7, num_updates=41600, lr=0.000310087, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=41020 epoch 025: 1131 / 1689 loss=3.993, nll_loss=2.355, ppl=5.11, wps=464336, ups=1.07, wpb=432631, bsz=16184.7, num_updates=41600, lr=0.000310087, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=41020 epoch 025: 1131 / 1689 loss=3.993, nll_loss=2.355, ppl=5.11, wps=464336, ups=1.07, wpb=432631, bsz=16184.7, num_updates=41600, lr=0.000310087, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=41020 epoch 025: 1131 / 1689 loss=3.993, nll_loss=2.355, ppl=5.11, wps=464336, ups=1.07, wpb=432631, bsz=16184.7, num_updates=41600, lr=0.000310087, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=41020 epoch 025: 1131 / 1689 loss=3.993, nll_loss=2.355, ppl=5.11, wps=464336, ups=1.07, wpb=432631, bsz=16184.7, num_updates=41600, lr=0.000310087, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=41020 epoch 025: 1131 / 1689 loss=3.993, nll_loss=2.355, ppl=5.11, wps=464336, ups=1.07, wpb=432631, bsz=16184.7, num_updates=41600, lr=0.000310087, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=41020 epoch 025: 1131 / 1689 loss=3.993, nll_loss=2.355, ppl=5.11, wps=464336, ups=1.07, wpb=432631, bsz=16184.7, num_updates=41600, lr=0.000310087, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=41020 epoch 025: 1131 / 1689 loss=3.993, nll_loss=2.355, ppl=5.11, wps=464336, ups=1.07, wpb=432631, bsz=16184.7, num_updates=41600, lr=0.000310087, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=41020 epoch 025: 1131 / 1689 loss=3.993, nll_loss=2.355, ppl=5.11, wps=464336, ups=1.07, wpb=432631, bsz=16184.7, num_updates=41600, lr=0.000310087, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=41020 epoch 025: 1131 / 1689 loss=3.993, nll_loss=2.355, ppl=5.11, wps=464336, ups=1.07, wpb=432631, bsz=16184.7, num_updates=41600, lr=0.000310087, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=41020 epoch 025: 1131 / 1689 loss=3.993, nll_loss=2.355, ppl=5.11, wps=464336, ups=1.07, wpb=432631, bsz=16184.7, num_updates=41600, lr=0.000310087, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=41020 epoch 025: 1131 / 1689 loss=3.993, nll_loss=2.355, ppl=5.11, wps=464336, ups=1.07, wpb=432631, bsz=16184.7, num_updates=41600, lr=0.000310087, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=41020 epoch 025: 1131 / 1689 loss=3.993, nll_loss=2.355, ppl=5.11, wps=464336, ups=1.07, wpb=432631, bsz=16184.7, num_updates=41600, lr=0.000310087, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=41020 epoch 025: 1131 / 1689 loss=3.993, nll_loss=2.355, ppl=5.11, wps=464336, ups=1.07, wpb=432631, bsz=16184.7, num_updates=41600, lr=0.000310087, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=41020 epoch 025: 1131 / 1689 loss=3.993, nll_loss=2.355, ppl=5.11, wps=464336, ups=1.07, wpb=432631, bsz=16184.7, num_updates=41600, lr=0.000310087, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=41020 epoch 025: 1131 / 1689 loss=3.993, nll_loss=2.355, ppl=5.11, wps=464336, ups=1.07, wpb=432631, bsz=16184.7, num_updates=41600, lr=0.000310087, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=41020 epoch 025: 1131 / 1689 loss=3.993, nll_loss=2.355, ppl=5.11, wps=464336, ups=1.07, wpb=432631, bsz=16184.7, num_updates=41600, lr=0.000310087, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=41020 epoch 025: 1232 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=455810, ups=1.05, wpb=435031, bsz=16357.9, num_updates=41700, lr=0.000309715, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=41115 epoch 025: 1232 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=455810, ups=1.05, wpb=435031, bsz=16357.9, num_updates=41700, lr=0.000309715, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=41115 epoch 025: 1232 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=455810, ups=1.05, wpb=435031, bsz=16357.9, num_updates=41700, lr=0.000309715, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=41115 epoch 025: 1232 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=455810, ups=1.05, wpb=435031, bsz=16357.9, num_updates=41700, lr=0.000309715, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=41115 epoch 025: 1232 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=455810, ups=1.05, wpb=435031, bsz=16357.9, num_updates=41700, lr=0.000309715, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=41115 epoch 025: 1232 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=455810, ups=1.05, wpb=435031, bsz=16357.9, num_updates=41700, lr=0.000309715, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=41115 epoch 025: 1232 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=455810, ups=1.05, wpb=435031, bsz=16357.9, num_updates=41700, lr=0.000309715, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=41115 epoch 025: 1232 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=455810, ups=1.05, wpb=435031, bsz=16357.9, num_updates=41700, lr=0.000309715, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=41115 epoch 025: 1232 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=455810, ups=1.05, wpb=435031, bsz=16357.9, num_updates=41700, lr=0.000309715, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=41115 epoch 025: 1232 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=455810, ups=1.05, wpb=435031, bsz=16357.9, num_updates=41700, lr=0.000309715, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=41115 epoch 025: 1232 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=455810, ups=1.05, wpb=435031, bsz=16357.9, num_updates=41700, lr=0.000309715, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=41115 epoch 025: 1232 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=455810, ups=1.05, wpb=435031, bsz=16357.9, num_updates=41700, lr=0.000309715, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=41115 epoch 025: 1232 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=455810, ups=1.05, wpb=435031, bsz=16357.9, num_updates=41700, lr=0.000309715, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=41115 epoch 025: 1232 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=455810, ups=1.05, wpb=435031, bsz=16357.9, num_updates=41700, lr=0.000309715, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=41115 epoch 025: 1232 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=455810, ups=1.05, wpb=435031, bsz=16357.9, num_updates=41700, lr=0.000309715, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=41115 epoch 025: 1232 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=455810, ups=1.05, wpb=435031, bsz=16357.9, num_updates=41700, lr=0.000309715, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=41115 epoch 025: 1232 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=455810, ups=1.05, wpb=435031, bsz=16357.9, num_updates=41700, lr=0.000309715, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=41115 epoch 025: 1232 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=455810, ups=1.05, wpb=435031, bsz=16357.9, num_updates=41700, lr=0.000309715, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=41115 epoch 025: 1232 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=455810, ups=1.05, wpb=435031, bsz=16357.9, num_updates=41700, lr=0.000309715, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=41115 epoch 025: 1232 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=455810, ups=1.05, wpb=435031, bsz=16357.9, num_updates=41700, lr=0.000309715, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=41115 epoch 025: 1232 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=455810, ups=1.05, wpb=435031, bsz=16357.9, num_updates=41700, lr=0.000309715, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=41115 epoch 025: 1232 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=455810, ups=1.05, wpb=435031, bsz=16357.9, num_updates=41700, lr=0.000309715, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=41115 epoch 025: 1232 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=455810, ups=1.05, wpb=435031, bsz=16357.9, num_updates=41700, lr=0.000309715, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=41115 epoch 025: 1232 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=455810, ups=1.05, wpb=435031, bsz=16357.9, num_updates=41700, lr=0.000309715, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=41115 epoch 025: 1232 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=455810, ups=1.05, wpb=435031, bsz=16357.9, num_updates=41700, lr=0.000309715, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=41115 epoch 025: 1332 / 1689 loss=4.001, nll_loss=2.364, ppl=5.15, wps=461363, ups=1.06, wpb=435457, bsz=16712.3, num_updates=41800, lr=0.000309344, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=41210 epoch 025: 1332 / 1689 loss=4.001, nll_loss=2.364, ppl=5.15, wps=461363, ups=1.06, wpb=435457, bsz=16712.3, num_updates=41800, lr=0.000309344, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=41210 epoch 025: 1332 / 1689 loss=4.001, nll_loss=2.364, ppl=5.15, wps=461363, ups=1.06, wpb=435457, bsz=16712.3, num_updates=41800, lr=0.000309344, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=41210 epoch 025: 1332 / 1689 loss=4.001, nll_loss=2.364, ppl=5.15, wps=461363, ups=1.06, wpb=435457, bsz=16712.3, num_updates=41800, lr=0.000309344, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=41210 epoch 025: 1332 / 1689 loss=4.001, nll_loss=2.364, ppl=5.15, wps=461363, ups=1.06, wpb=435457, bsz=16712.3, num_updates=41800, lr=0.000309344, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=41210 epoch 025: 1332 / 1689 loss=4.001, nll_loss=2.364, ppl=5.15, wps=461363, ups=1.06, wpb=435457, bsz=16712.3, num_updates=41800, lr=0.000309344, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=41210 epoch 025: 1332 / 1689 loss=4.001, nll_loss=2.364, ppl=5.15, wps=461363, ups=1.06, wpb=435457, bsz=16712.3, num_updates=41800, lr=0.000309344, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=41210 epoch 025: 1332 / 1689 loss=4.001, nll_loss=2.364, ppl=5.15, wps=461363, ups=1.06, wpb=435457, bsz=16712.3, num_updates=41800, lr=0.000309344, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=41210 epoch 025: 1332 / 1689 loss=4.001, nll_loss=2.364, ppl=5.15, wps=461363, ups=1.06, wpb=435457, bsz=16712.3, num_updates=41800, lr=0.000309344, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=41210 epoch 025: 1332 / 1689 loss=4.001, nll_loss=2.364, ppl=5.15, wps=461363, ups=1.06, wpb=435457, bsz=16712.3, num_updates=41800, lr=0.000309344, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=41210 epoch 025: 1332 / 1689 loss=4.001, nll_loss=2.364, ppl=5.15, wps=461363, ups=1.06, wpb=435457, bsz=16712.3, num_updates=41800, lr=0.000309344, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=41210 epoch 025: 1332 / 1689 loss=4.001, nll_loss=2.364, ppl=5.15, wps=461363, ups=1.06, wpb=435457, bsz=16712.3, num_updates=41800, lr=0.000309344, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=41210 epoch 025: 1332 / 1689 loss=4.001, nll_loss=2.364, ppl=5.15, wps=461363, ups=1.06, wpb=435457, bsz=16712.3, num_updates=41800, lr=0.000309344, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=41210 epoch 025: 1332 / 1689 loss=4.001, nll_loss=2.364, ppl=5.15, wps=461363, ups=1.06, wpb=435457, bsz=16712.3, num_updates=41800, lr=0.000309344, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=41210 epoch 025: 1332 / 1689 loss=4.001, nll_loss=2.364, ppl=5.15, wps=461363, ups=1.06, wpb=435457, bsz=16712.3, num_updates=41800, lr=0.000309344, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=41210 epoch 025: 1332 / 1689 loss=4.001, nll_loss=2.364, ppl=5.15, wps=461363, ups=1.06, wpb=435457, bsz=16712.3, num_updates=41800, lr=0.000309344, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=41210 epoch 025: 1332 / 1689 loss=4.001, nll_loss=2.364, ppl=5.15, wps=461363, ups=1.06, wpb=435457, bsz=16712.3, num_updates=41800, lr=0.000309344, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=41210 epoch 025: 1332 / 1689 loss=4.001, nll_loss=2.364, ppl=5.15, wps=461363, ups=1.06, wpb=435457, bsz=16712.3, num_updates=41800, lr=0.000309344, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=41210 epoch 025: 1332 / 1689 loss=4.001, nll_loss=2.364, ppl=5.15, wps=461363, ups=1.06, wpb=435457, bsz=16712.3, num_updates=41800, lr=0.000309344, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=41210 epoch 025: 1332 / 1689 loss=4.001, nll_loss=2.364, ppl=5.15, wps=461363, ups=1.06, wpb=435457, bsz=16712.3, num_updates=41800, lr=0.000309344, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=41210 epoch 025: 1332 / 1689 loss=4.001, nll_loss=2.364, ppl=5.15, wps=461363, ups=1.06, wpb=435457, bsz=16712.3, num_updates=41800, lr=0.000309344, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=41210 epoch 025: 1332 / 1689 loss=4.001, nll_loss=2.364, ppl=5.15, wps=461363, ups=1.06, wpb=435457, bsz=16712.3, num_updates=41800, lr=0.000309344, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=41210 epoch 025: 1332 / 1689 loss=4.001, nll_loss=2.364, ppl=5.15, wps=461363, ups=1.06, wpb=435457, bsz=16712.3, num_updates=41800, lr=0.000309344, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=41210 epoch 025: 1332 / 1689 loss=4.001, nll_loss=2.364, ppl=5.15, wps=461363, ups=1.06, wpb=435457, bsz=16712.3, num_updates=41800, lr=0.000309344, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=41210 epoch 025: 1332 / 1689 loss=4.001, nll_loss=2.364, ppl=5.15, wps=461363, ups=1.06, wpb=435457, bsz=16712.3, num_updates=41800, lr=0.000309344, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=41210 epoch 025: 1432 / 1689 loss=4.004, nll_loss=2.368, ppl=5.16, wps=458896, ups=1.06, wpb=434638, bsz=16378.4, num_updates=41900, lr=0.000308975, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=41305 epoch 025: 1432 / 1689 loss=4.004, nll_loss=2.368, ppl=5.16, wps=458896, ups=1.06, wpb=434638, bsz=16378.4, num_updates=41900, lr=0.000308975, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=41305 epoch 025: 1432 / 1689 loss=4.004, nll_loss=2.368, ppl=5.16, wps=458896, ups=1.06, wpb=434638, bsz=16378.4, num_updates=41900, lr=0.000308975, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=41305 epoch 025: 1432 / 1689 loss=4.004, nll_loss=2.368, ppl=5.16, wps=458896, ups=1.06, wpb=434638, bsz=16378.4, num_updates=41900, lr=0.000308975, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=41305 epoch 025: 1432 / 1689 loss=4.004, nll_loss=2.368, ppl=5.16, wps=458896, ups=1.06, wpb=434638, bsz=16378.4, num_updates=41900, lr=0.000308975, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=41305 epoch 025: 1432 / 1689 loss=4.004, nll_loss=2.368, ppl=5.16, wps=458896, ups=1.06, wpb=434638, bsz=16378.4, num_updates=41900, lr=0.000308975, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=41305 epoch 025: 1432 / 1689 loss=4.004, nll_loss=2.368, ppl=5.16, wps=458896, ups=1.06, wpb=434638, bsz=16378.4, num_updates=41900, lr=0.000308975, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=41305 epoch 025: 1432 / 1689 loss=4.004, nll_loss=2.368, ppl=5.16, wps=458896, ups=1.06, wpb=434638, bsz=16378.4, num_updates=41900, lr=0.000308975, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=41305 epoch 025: 1432 / 1689 loss=4.004, nll_loss=2.368, ppl=5.16, wps=458896, ups=1.06, wpb=434638, bsz=16378.4, num_updates=41900, lr=0.000308975, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=41305 epoch 025: 1432 / 1689 loss=4.004, nll_loss=2.368, ppl=5.16, wps=458896, ups=1.06, wpb=434638, bsz=16378.4, num_updates=41900, lr=0.000308975, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=41305 epoch 025: 1432 / 1689 loss=4.004, nll_loss=2.368, ppl=5.16, wps=458896, ups=1.06, wpb=434638, bsz=16378.4, num_updates=41900, lr=0.000308975, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=41305 epoch 025: 1432 / 1689 loss=4.004, nll_loss=2.368, ppl=5.16, wps=458896, ups=1.06, wpb=434638, bsz=16378.4, num_updates=41900, lr=0.000308975, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=41305 epoch 025: 1432 / 1689 loss=4.004, nll_loss=2.368, ppl=5.16, wps=458896, ups=1.06, wpb=434638, bsz=16378.4, num_updates=41900, lr=0.000308975, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=41305 epoch 025: 1432 / 1689 loss=4.004, nll_loss=2.368, ppl=5.16, wps=458896, ups=1.06, wpb=434638, bsz=16378.4, num_updates=41900, lr=0.000308975, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=41305 epoch 025: 1432 / 1689 loss=4.004, nll_loss=2.368, ppl=5.16, wps=458896, ups=1.06, wpb=434638, bsz=16378.4, num_updates=41900, lr=0.000308975, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=41305 epoch 025: 1432 / 1689 loss=4.004, nll_loss=2.368, ppl=5.16, wps=458896, ups=1.06, wpb=434638, bsz=16378.4, num_updates=41900, lr=0.000308975, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=41305 epoch 025: 1432 / 1689 loss=4.004, nll_loss=2.368, ppl=5.16, wps=458896, ups=1.06, wpb=434638, bsz=16378.4, num_updates=41900, lr=0.000308975, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=41305 epoch 025: 1432 / 1689 loss=4.004, nll_loss=2.368, ppl=5.16, wps=458896, ups=1.06, wpb=434638, bsz=16378.4, num_updates=41900, lr=0.000308975, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=41305 epoch 025: 1432 / 1689 loss=4.004, nll_loss=2.368, ppl=5.16, wps=458896, ups=1.06, wpb=434638, bsz=16378.4, num_updates=41900, lr=0.000308975, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=41305 epoch 025: 1432 / 1689 loss=4.004, nll_loss=2.368, ppl=5.16, wps=458896, ups=1.06, wpb=434638, bsz=16378.4, num_updates=41900, lr=0.000308975, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=41305 epoch 025: 1432 / 1689 loss=4.004, nll_loss=2.368, ppl=5.16, wps=458896, ups=1.06, wpb=434638, bsz=16378.4, num_updates=41900, lr=0.000308975, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=41305 epoch 025: 1432 / 1689 loss=4.004, nll_loss=2.368, ppl=5.16, wps=458896, ups=1.06, wpb=434638, bsz=16378.4, num_updates=41900, lr=0.000308975, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=41305 epoch 025: 1432 / 1689 loss=4.004, nll_loss=2.368, ppl=5.16, wps=458896, ups=1.06, wpb=434638, bsz=16378.4, num_updates=41900, lr=0.000308975, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=41305 epoch 025: 1432 / 1689 loss=4.004, nll_loss=2.368, ppl=5.16, wps=458896, ups=1.06, wpb=434638, bsz=16378.4, num_updates=41900, lr=0.000308975, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=41305 epoch 025: 1432 / 1689 loss=4.004, nll_loss=2.368, ppl=5.16, wps=458896, ups=1.06, wpb=434638, bsz=16378.4, num_updates=41900, lr=0.000308975, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=41305 epoch 025: 1532 / 1689 loss=4.003, nll_loss=2.367, ppl=5.16, wps=460753, ups=1.06, wpb=433874, bsz=16690.1, num_updates=42000, lr=0.000308607, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=41399 epoch 025: 1532 / 1689 loss=4.003, nll_loss=2.367, ppl=5.16, wps=460753, ups=1.06, wpb=433874, bsz=16690.1, num_updates=42000, lr=0.000308607, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=41399 epoch 025: 1532 / 1689 loss=4.003, nll_loss=2.367, ppl=5.16, wps=460753, ups=1.06, wpb=433874, bsz=16690.1, num_updates=42000, lr=0.000308607, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=41399 epoch 025: 1532 / 1689 loss=4.003, nll_loss=2.367, ppl=5.16, wps=460753, ups=1.06, wpb=433874, bsz=16690.1, num_updates=42000, lr=0.000308607, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=41399 epoch 025: 1532 / 1689 loss=4.003, nll_loss=2.367, ppl=5.16, wps=460753, ups=1.06, wpb=433874, bsz=16690.1, num_updates=42000, lr=0.000308607, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=41399 epoch 025: 1532 / 1689 loss=4.003, nll_loss=2.367, ppl=5.16, wps=460753, ups=1.06, wpb=433874, bsz=16690.1, num_updates=42000, lr=0.000308607, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=41399 epoch 025: 1532 / 1689 loss=4.003, nll_loss=2.367, ppl=5.16, wps=460753, ups=1.06, wpb=433874, bsz=16690.1, num_updates=42000, lr=0.000308607, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=41399 epoch 025: 1532 / 1689 loss=4.003, nll_loss=2.367, ppl=5.16, wps=460753, ups=1.06, wpb=433874, bsz=16690.1, num_updates=42000, lr=0.000308607, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=41399 epoch 025: 1532 / 1689 loss=4.003, nll_loss=2.367, ppl=5.16, wps=460753, ups=1.06, wpb=433874, bsz=16690.1, num_updates=42000, lr=0.000308607, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=41399 epoch 025: 1532 / 1689 loss=4.003, nll_loss=2.367, ppl=5.16, wps=460753, ups=1.06, wpb=433874, bsz=16690.1, num_updates=42000, lr=0.000308607, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=41399 epoch 025: 1532 / 1689 loss=4.003, nll_loss=2.367, ppl=5.16, wps=460753, ups=1.06, wpb=433874, bsz=16690.1, num_updates=42000, lr=0.000308607, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=41399 epoch 025: 1532 / 1689 loss=4.003, nll_loss=2.367, ppl=5.16, wps=460753, ups=1.06, wpb=433874, bsz=16690.1, num_updates=42000, lr=0.000308607, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=41399 epoch 025: 1532 / 1689 loss=4.003, nll_loss=2.367, ppl=5.16, wps=460753, ups=1.06, wpb=433874, bsz=16690.1, num_updates=42000, lr=0.000308607, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=41399 epoch 025: 1532 / 1689 loss=4.003, nll_loss=2.367, ppl=5.16, wps=460753, ups=1.06, wpb=433874, bsz=16690.1, num_updates=42000, lr=0.000308607, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=41399 epoch 025: 1532 / 1689 loss=4.003, nll_loss=2.367, ppl=5.16, wps=460753, ups=1.06, wpb=433874, bsz=16690.1, num_updates=42000, lr=0.000308607, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=41399 epoch 025: 1532 / 1689 loss=4.003, nll_loss=2.367, ppl=5.16, wps=460753, ups=1.06, wpb=433874, bsz=16690.1, num_updates=42000, lr=0.000308607, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=41399 epoch 025: 1532 / 1689 loss=4.003, nll_loss=2.367, ppl=5.16, wps=460753, ups=1.06, wpb=433874, bsz=16690.1, num_updates=42000, lr=0.000308607, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=41399 epoch 025: 1532 / 1689 loss=4.003, nll_loss=2.367, ppl=5.16, wps=460753, ups=1.06, wpb=433874, bsz=16690.1, num_updates=42000, lr=0.000308607, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=41399 epoch 025: 1532 / 1689 loss=4.003, nll_loss=2.367, ppl=5.16, wps=460753, ups=1.06, wpb=433874, bsz=16690.1, num_updates=42000, lr=0.000308607, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=41399 epoch 025: 1532 / 1689 loss=4.003, nll_loss=2.367, ppl=5.16, wps=460753, ups=1.06, wpb=433874, bsz=16690.1, num_updates=42000, lr=0.000308607, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=41399 epoch 025: 1532 / 1689 loss=4.003, nll_loss=2.367, ppl=5.16, wps=460753, ups=1.06, wpb=433874, bsz=16690.1, num_updates=42000, lr=0.000308607, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=41399 epoch 025: 1532 / 1689 loss=4.003, nll_loss=2.367, ppl=5.16, wps=460753, ups=1.06, wpb=433874, bsz=16690.1, num_updates=42000, lr=0.000308607, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=41399 epoch 025: 1532 / 1689 loss=4.003, nll_loss=2.367, ppl=5.16, wps=460753, ups=1.06, wpb=433874, bsz=16690.1, num_updates=42000, lr=0.000308607, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=41399 epoch 025: 1532 / 1689 loss=4.003, nll_loss=2.367, ppl=5.16, wps=460753, ups=1.06, wpb=433874, bsz=16690.1, num_updates=42000, lr=0.000308607, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=41399 epoch 025: 1532 / 1689 loss=4.003, nll_loss=2.367, ppl=5.16, wps=460753, ups=1.06, wpb=433874, bsz=16690.1, num_updates=42000, lr=0.000308607, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=41399 begin validation on "valid" subset epoch 025 | valid on 'valid' subset | loss 4.245 | nll_loss 2.624 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.245 epoch 025 | valid on 'valid' subset | loss 4.245 | nll_loss 2.624 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.245 epoch 025 | valid on 'valid' subset | loss 4.245 | nll_loss 2.624 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.245 epoch 025 | valid on 'valid' subset | loss 4.245 | nll_loss 2.624 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.245 epoch 025 | valid on 'valid' subset | loss 4.245 | nll_loss 2.624 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.245 epoch 025 | valid on 'valid' subset | loss 4.245 | nll_loss 2.624 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.245 epoch 025 | valid on 'valid' subset | loss 4.245 | nll_loss 2.624 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.245 epoch 025 | valid on 'valid' subset | loss 4.245 | nll_loss 2.624 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.245 epoch 025 | valid on 'valid' subset | loss 4.245 | nll_loss 2.624 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.245 epoch 025 | valid on 'valid' subset | loss 4.245 | nll_loss 2.624 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.245 epoch 025 | valid on 'valid' subset | loss 4.245 | nll_loss 2.624 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.245 epoch 025 | valid on 'valid' subset | loss 4.245 | nll_loss 2.624 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.245 epoch 025 | valid on 'valid' subset | loss 4.245 | nll_loss 2.624 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.245 epoch 025 | valid on 'valid' subset | loss 4.245 | nll_loss 2.624 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.245 epoch 025 | valid on 'valid' subset | loss 4.245 | nll_loss 2.624 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.245 epoch 025 | valid on 'valid' subset | loss 4.245 | nll_loss 2.624 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.245 epoch 025 | valid on 'valid' subset | loss 4.245 | nll_loss 2.624 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.245 epoch 025 | valid on 'valid' subset | loss 4.245 | nll_loss 2.624 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.245 epoch 025 | valid on 'valid' subset | loss 4.245 | nll_loss 2.624 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.245 epoch 025 | valid on 'valid' subset | loss 4.245 | nll_loss 2.624 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.245 epoch 025 | valid on 'valid' subset | loss 4.245 | nll_loss 2.624 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.245 epoch 025 | valid on 'valid' subset | loss 4.245 | nll_loss 2.624 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.245 epoch 025 | valid on 'valid' subset | loss 4.245 | nll_loss 2.624 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.245 epoch 025 | valid on 'valid' subset | loss 4.245 | nll_loss 2.624 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.245 epoch 025 | valid on 'valid' subset | loss 4.245 | nll_loss 2.624 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 42000 | best_loss 4.245 epoch 025: 1632 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=226108, ups=0.52, wpb=431796, bsz=16387.2, num_updates=42100, lr=0.00030824, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=41590 epoch 025: 1632 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=226108, ups=0.52, wpb=431796, bsz=16387.2, num_updates=42100, lr=0.00030824, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=41590 epoch 025: 1632 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=226108, ups=0.52, wpb=431796, bsz=16387.2, num_updates=42100, lr=0.00030824, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=41590 epoch 025: 1632 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=226108, ups=0.52, wpb=431796, bsz=16387.2, num_updates=42100, lr=0.00030824, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=41590 epoch 025: 1632 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=226108, ups=0.52, wpb=431796, bsz=16387.2, num_updates=42100, lr=0.00030824, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=41590 epoch 025: 1632 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=226108, ups=0.52, wpb=431796, bsz=16387.2, num_updates=42100, lr=0.00030824, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=41590 epoch 025: 1632 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=226108, ups=0.52, wpb=431796, bsz=16387.2, num_updates=42100, lr=0.00030824, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=41590 epoch 025: 1632 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=226108, ups=0.52, wpb=431796, bsz=16387.2, num_updates=42100, lr=0.00030824, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=41590 epoch 025: 1632 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=226108, ups=0.52, wpb=431796, bsz=16387.2, num_updates=42100, lr=0.00030824, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=41590 epoch 025: 1632 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=226108, ups=0.52, wpb=431796, bsz=16387.2, num_updates=42100, lr=0.00030824, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=41590 epoch 025: 1632 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=226108, ups=0.52, wpb=431796, bsz=16387.2, num_updates=42100, lr=0.00030824, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=41590 epoch 025: 1632 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=226108, ups=0.52, wpb=431796, bsz=16387.2, num_updates=42100, lr=0.00030824, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=41590 epoch 025: 1632 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=226108, ups=0.52, wpb=431796, bsz=16387.2, num_updates=42100, lr=0.00030824, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=41590 epoch 025: 1632 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=226108, ups=0.52, wpb=431796, bsz=16387.2, num_updates=42100, lr=0.00030824, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=41590 epoch 025: 1632 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=226108, ups=0.52, wpb=431796, bsz=16387.2, num_updates=42100, lr=0.00030824, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=41590 epoch 025: 1632 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=226108, ups=0.52, wpb=431796, bsz=16387.2, num_updates=42100, lr=0.00030824, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=41590 epoch 025: 1632 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=226108, ups=0.52, wpb=431796, bsz=16387.2, num_updates=42100, lr=0.00030824, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=41590 epoch 025: 1632 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=226108, ups=0.52, wpb=431796, bsz=16387.2, num_updates=42100, lr=0.00030824, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=41590 epoch 025: 1632 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=226108, ups=0.52, wpb=431796, bsz=16387.2, num_updates=42100, lr=0.00030824, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=41590 epoch 025: 1632 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=226108, ups=0.52, wpb=431796, bsz=16387.2, num_updates=42100, lr=0.00030824, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=41590 epoch 025: 1632 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=226108, ups=0.52, wpb=431796, bsz=16387.2, num_updates=42100, lr=0.00030824, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=41590 epoch 025: 1632 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=226108, ups=0.52, wpb=431796, bsz=16387.2, num_updates=42100, lr=0.00030824, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=41590 epoch 025: 1632 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=226108, ups=0.52, wpb=431796, bsz=16387.2, num_updates=42100, lr=0.00030824, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=41590 epoch 025: 1632 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=226108, ups=0.52, wpb=431796, bsz=16387.2, num_updates=42100, lr=0.00030824, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=41590 epoch 025: 1632 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=226108, ups=0.52, wpb=431796, bsz=16387.2, num_updates=42100, lr=0.00030824, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=41590 end of epoch 25 (average epoch stats below) epoch 025 | loss 3.988 | nll_loss 2.35 | ppl 5.1 | wps 430303 | ups 0.99 | wpb 433522 | bsz 16505.7 | num_updates 42157 | lr 0.000308032 | gnorm 0.211 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 19.3 | wall 41642 epoch 025 | loss 3.988 | nll_loss 2.35 | ppl 5.1 | wps 430303 | ups 0.99 | wpb 433522 | bsz 16505.7 | num_updates 42157 | lr 0.000308032 | gnorm 0.211 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 19.3 | wall 41642 epoch 025 | loss 3.988 | nll_loss 2.35 | ppl 5.1 | wps 430303 | ups 0.99 | wpb 433522 | bsz 16505.7 | num_updates 42157 | lr 0.000308032 | gnorm 0.211 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 19.3 | wall 41642 epoch 025 | loss 3.988 | nll_loss 2.35 | ppl 5.1 | wps 430303 | ups 0.99 | wpb 433522 | bsz 16505.7 | num_updates 42157 | lr 0.000308032 | gnorm 0.211 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 19.3 | wall 41642 epoch 025 | loss 3.988 | nll_loss 2.35 | ppl 5.1 | wps 430303 | ups 0.99 | wpb 433522 | bsz 16505.7 | num_updates 42157 | lr 0.000308032 | gnorm 0.211 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 19.3 | wall 41642 epoch 025 | loss 3.988 | nll_loss 2.35 | ppl 5.1 | wps 430303 | ups 0.99 | wpb 433522 | bsz 16505.7 | num_updates 42157 | lr 0.000308032 | gnorm 0.211 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 19.3 | wall 41642 epoch 025 | loss 3.988 | nll_loss 2.35 | ppl 5.1 | wps 430303 | ups 0.99 | wpb 433522 | bsz 16505.7 | num_updates 42157 | lr 0.000308032 | gnorm 0.211 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 19.3 | wall 41642 epoch 025 | loss 3.988 | nll_loss 2.35 | ppl 5.1 | wps 430303 | ups 0.99 | wpb 433522 | bsz 16505.7 | num_updates 42157 | lr 0.000308032 | gnorm 0.211 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 19.3 | wall 41642 epoch 025 | loss 3.988 | nll_loss 2.35 | ppl 5.1 | wps 430303 | ups 0.99 | wpb 433522 | bsz 16505.7 | num_updates 42157 | lr 0.000308032 | gnorm 0.211 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 19.3 | wall 41642 epoch 025 | loss 3.988 | nll_loss 2.35 | ppl 5.1 | wps 430303 | ups 0.99 | wpb 433522 | bsz 16505.7 | num_updates 42157 | lr 0.000308032 | gnorm 0.211 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 19.3 | wall 41642 epoch 025 | loss 3.988 | nll_loss 2.35 | ppl 5.1 | wps 430303 | ups 0.99 | wpb 433522 | bsz 16505.7 | num_updates 42157 | lr 0.000308032 | gnorm 0.211 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 19.3 | wall 41642 epoch 025 | loss 3.988 | nll_loss 2.35 | ppl 5.1 | wps 430303 | ups 0.99 | wpb 433522 | bsz 16505.7 | num_updates 42157 | lr 0.000308032 | gnorm 0.211 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 19.3 | wall 41642 epoch 025 | loss 3.988 | nll_loss 2.35 | ppl 5.1 | wps 430303 | ups 0.99 | wpb 433522 | bsz 16505.7 | num_updates 42157 | lr 0.000308032 | gnorm 0.211 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 19.3 | wall 41642 epoch 025 | loss 3.988 | nll_loss 2.35 | ppl 5.1 | wps 430303 | ups 0.99 | wpb 433522 | bsz 16505.7 | num_updates 42157 | lr 0.000308032 | gnorm 0.211 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 19.3 | wall 41642 epoch 025 | loss 3.988 | nll_loss 2.35 | ppl 5.1 | wps 430303 | ups 0.99 | wpb 433522 | bsz 16505.7 | num_updates 42157 | lr 0.000308032 | gnorm 0.211 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 19.3 | wall 41642 epoch 025 | loss 3.988 | nll_loss 2.35 | ppl 5.1 | wps 430303 | ups 0.99 | wpb 433522 | bsz 16505.7 | num_updates 42157 | lr 0.000308032 | gnorm 0.211 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 19.3 | wall 41642 epoch 025 | loss 3.988 | nll_loss 2.35 | ppl 5.1 | wps 430303 | ups 0.99 | wpb 433522 | bsz 16505.7 | num_updates 42157 | lr 0.000308032 | gnorm 0.211 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 19.3 | wall 41642 epoch 025 | loss 3.988 | nll_loss 2.35 | ppl 5.1 | wps 430303 | ups 0.99 | wpb 433522 | bsz 16505.7 | num_updates 42157 | lr 0.000308032 | gnorm 0.211 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 19.3 | wall 41642 epoch 025 | loss 3.988 | nll_loss 2.35 | ppl 5.1 | wps 430303 | ups 0.99 | wpb 433522 | bsz 16505.7 | num_updates 42157 | lr 0.000308032 | gnorm 0.211 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 19.3 | wall 41642 epoch 025 | loss 3.988 | nll_loss 2.35 | ppl 5.1 | wps 430303 | ups 0.99 | wpb 433522 | bsz 16505.7 | num_updates 42157 | lr 0.000308032 | gnorm 0.211 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 19.3 | wall 41642 epoch 025 | loss 3.988 | nll_loss 2.35 | ppl 5.1 | wps 430303 | ups 0.99 | wpb 433522 | bsz 16505.7 | num_updates 42157 | lr 0.000308032 | gnorm 0.211 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 19.3 | wall 41642 epoch 025 | loss 3.988 | nll_loss 2.35 | ppl 5.1 | wps 430303 | ups 0.99 | wpb 433522 | bsz 16505.7 | num_updates 42157 | lr 0.000308032 | gnorm 0.211 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 19.3 | wall 41642 epoch 025 | loss 3.988 | nll_loss 2.35 | ppl 5.1 | wps 430303 | ups 0.99 | wpb 433522 | bsz 16505.7 | num_updates 42157 | lr 0.000308032 | gnorm 0.211 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 19.3 | wall 41642 epoch 025 | loss 3.988 | nll_loss 2.35 | ppl 5.1 | wps 430303 | ups 0.99 | wpb 433522 | bsz 16505.7 | num_updates 42157 | lr 0.000308032 | gnorm 0.211 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 19.3 | wall 41642 epoch 025 | loss 3.988 | nll_loss 2.35 | ppl 5.1 | wps 430303 | ups 0.99 | wpb 433522 | bsz 16505.7 | num_updates 42157 | lr 0.000308032 | gnorm 0.211 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 19.3 | wall 41642 Start iterating over samples epoch 026: 43 / 1689 loss=3.98, nll_loss=2.34, ppl=5.06, wps=460225, ups=1.07, wpb=431178, bsz=16536.6, num_updates=42200, lr=0.000307875, gnorm=0.214, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=41683 epoch 026: 43 / 1689 loss=3.98, nll_loss=2.34, ppl=5.06, wps=460225, ups=1.07, wpb=431178, bsz=16536.6, num_updates=42200, lr=0.000307875, gnorm=0.214, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=41683 epoch 026: 43 / 1689 loss=3.98, nll_loss=2.34, ppl=5.06, wps=460225, ups=1.07, wpb=431178, bsz=16536.6, num_updates=42200, lr=0.000307875, gnorm=0.214, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=41683 epoch 026: 43 / 1689 loss=3.98, nll_loss=2.34, ppl=5.06, wps=460225, ups=1.07, wpb=431178, bsz=16536.6, num_updates=42200, lr=0.000307875, gnorm=0.214, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=41683 epoch 026: 43 / 1689 loss=3.98, nll_loss=2.34, ppl=5.06, wps=460225, ups=1.07, wpb=431178, bsz=16536.6, num_updates=42200, lr=0.000307875, gnorm=0.214, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=41683 epoch 026: 43 / 1689 loss=3.98, nll_loss=2.34, ppl=5.06, wps=460225, ups=1.07, wpb=431178, bsz=16536.6, num_updates=42200, lr=0.000307875, gnorm=0.214, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=41683 epoch 026: 43 / 1689 loss=3.98, nll_loss=2.34, ppl=5.06, wps=460225, ups=1.07, wpb=431178, bsz=16536.6, num_updates=42200, lr=0.000307875, gnorm=0.214, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=41683 epoch 026: 43 / 1689 loss=3.98, nll_loss=2.34, ppl=5.06, wps=460225, ups=1.07, wpb=431178, bsz=16536.6, num_updates=42200, lr=0.000307875, gnorm=0.214, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=41683 epoch 026: 43 / 1689 loss=3.98, nll_loss=2.34, ppl=5.06, wps=460225, ups=1.07, wpb=431178, bsz=16536.6, num_updates=42200, lr=0.000307875, gnorm=0.214, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=41683 epoch 026: 43 / 1689 loss=3.98, nll_loss=2.34, ppl=5.06, wps=460225, ups=1.07, wpb=431178, bsz=16536.6, num_updates=42200, lr=0.000307875, gnorm=0.214, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=41683 epoch 026: 43 / 1689 loss=3.98, nll_loss=2.34, ppl=5.06, wps=460225, ups=1.07, wpb=431178, bsz=16536.6, num_updates=42200, lr=0.000307875, gnorm=0.214, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=41683 epoch 026: 43 / 1689 loss=3.98, nll_loss=2.34, ppl=5.06, wps=460225, ups=1.07, wpb=431178, bsz=16536.6, num_updates=42200, lr=0.000307875, gnorm=0.214, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=41683 epoch 026: 43 / 1689 loss=3.98, nll_loss=2.34, ppl=5.06, wps=460225, ups=1.07, wpb=431178, bsz=16536.6, num_updates=42200, lr=0.000307875, gnorm=0.214, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=41683 epoch 026: 43 / 1689 loss=3.98, nll_loss=2.34, ppl=5.06, wps=460225, ups=1.07, wpb=431178, bsz=16536.6, num_updates=42200, lr=0.000307875, gnorm=0.214, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=41683 epoch 026: 43 / 1689 loss=3.98, nll_loss=2.34, ppl=5.06, wps=460225, ups=1.07, wpb=431178, bsz=16536.6, num_updates=42200, lr=0.000307875, gnorm=0.214, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=41683 epoch 026: 43 / 1689 loss=3.98, nll_loss=2.34, ppl=5.06, wps=460225, ups=1.07, wpb=431178, bsz=16536.6, num_updates=42200, lr=0.000307875, gnorm=0.214, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=41683 epoch 026: 43 / 1689 loss=3.98, nll_loss=2.34, ppl=5.06, wps=460225, ups=1.07, wpb=431178, bsz=16536.6, num_updates=42200, lr=0.000307875, gnorm=0.214, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=41683 epoch 026: 43 / 1689 loss=3.98, nll_loss=2.34, ppl=5.06, wps=460225, ups=1.07, wpb=431178, bsz=16536.6, num_updates=42200, lr=0.000307875, gnorm=0.214, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=41683 epoch 026: 43 / 1689 loss=3.98, nll_loss=2.34, ppl=5.06, wps=460225, ups=1.07, wpb=431178, bsz=16536.6, num_updates=42200, lr=0.000307875, gnorm=0.214, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=41683 epoch 026: 43 / 1689 loss=3.98, nll_loss=2.34, ppl=5.06, wps=460225, ups=1.07, wpb=431178, bsz=16536.6, num_updates=42200, lr=0.000307875, gnorm=0.214, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=41683 epoch 026: 43 / 1689 loss=3.98, nll_loss=2.34, ppl=5.06, wps=460225, ups=1.07, wpb=431178, bsz=16536.6, num_updates=42200, lr=0.000307875, gnorm=0.214, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=41683 epoch 026: 43 / 1689 loss=3.98, nll_loss=2.34, ppl=5.06, wps=460225, ups=1.07, wpb=431178, bsz=16536.6, num_updates=42200, lr=0.000307875, gnorm=0.214, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=41683 epoch 026: 43 / 1689 loss=3.98, nll_loss=2.34, ppl=5.06, wps=460225, ups=1.07, wpb=431178, bsz=16536.6, num_updates=42200, lr=0.000307875, gnorm=0.214, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=41683 epoch 026: 43 / 1689 loss=3.98, nll_loss=2.34, ppl=5.06, wps=460225, ups=1.07, wpb=431178, bsz=16536.6, num_updates=42200, lr=0.000307875, gnorm=0.214, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=41683 epoch 026: 43 / 1689 loss=3.98, nll_loss=2.34, ppl=5.06, wps=460225, ups=1.07, wpb=431178, bsz=16536.6, num_updates=42200, lr=0.000307875, gnorm=0.214, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=41683 epoch 026: 43 / 1689 loss=3.98, nll_loss=2.34, ppl=5.06, wps=460225, ups=1.07, wpb=431178, bsz=16536.6, num_updates=42200, lr=0.000307875, gnorm=0.214, clip=0, loss_scale=1, train_wall=92, gb_free=18.4, wall=41683 epoch 026: 143 / 1689 loss=3.961, nll_loss=2.318, ppl=4.99, wps=466497, ups=1.07, wpb=435000, bsz=16386.6, num_updates=42300, lr=0.00030751, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=41777 epoch 026: 143 / 1689 loss=3.961, nll_loss=2.318, ppl=4.99, wps=466497, ups=1.07, wpb=435000, bsz=16386.6, num_updates=42300, lr=0.00030751, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=41777 epoch 026: 143 / 1689 loss=3.961, nll_loss=2.318, ppl=4.99, wps=466497, ups=1.07, wpb=435000, bsz=16386.6, num_updates=42300, lr=0.00030751, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=41777 epoch 026: 143 / 1689 loss=3.961, nll_loss=2.318, ppl=4.99, wps=466497, ups=1.07, wpb=435000, bsz=16386.6, num_updates=42300, lr=0.00030751, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=41777 epoch 026: 143 / 1689 loss=3.961, nll_loss=2.318, ppl=4.99, wps=466497, ups=1.07, wpb=435000, bsz=16386.6, num_updates=42300, lr=0.00030751, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=41777 epoch 026: 143 / 1689 loss=3.961, nll_loss=2.318, ppl=4.99, wps=466497, ups=1.07, wpb=435000, bsz=16386.6, num_updates=42300, lr=0.00030751, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=41777 epoch 026: 143 / 1689 loss=3.961, nll_loss=2.318, ppl=4.99, wps=466497, ups=1.07, wpb=435000, bsz=16386.6, num_updates=42300, lr=0.00030751, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=41777 epoch 026: 143 / 1689 loss=3.961, nll_loss=2.318, ppl=4.99, wps=466497, ups=1.07, wpb=435000, bsz=16386.6, num_updates=42300, lr=0.00030751, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=41777 epoch 026: 143 / 1689 loss=3.961, nll_loss=2.318, ppl=4.99, wps=466497, ups=1.07, wpb=435000, bsz=16386.6, num_updates=42300, lr=0.00030751, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=41777 epoch 026: 143 / 1689 loss=3.961, nll_loss=2.318, ppl=4.99, wps=466497, ups=1.07, wpb=435000, bsz=16386.6, num_updates=42300, lr=0.00030751, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=41777 epoch 026: 143 / 1689 loss=3.961, nll_loss=2.318, ppl=4.99, wps=466497, ups=1.07, wpb=435000, bsz=16386.6, num_updates=42300, lr=0.00030751, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=41777 epoch 026: 143 / 1689 loss=3.961, nll_loss=2.318, ppl=4.99, wps=466497, ups=1.07, wpb=435000, bsz=16386.6, num_updates=42300, lr=0.00030751, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=41777 epoch 026: 143 / 1689 loss=3.961, nll_loss=2.318, ppl=4.99, wps=466497, ups=1.07, wpb=435000, bsz=16386.6, num_updates=42300, lr=0.00030751, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=41777 epoch 026: 143 / 1689 loss=3.961, nll_loss=2.318, ppl=4.99, wps=466497, ups=1.07, wpb=435000, bsz=16386.6, num_updates=42300, lr=0.00030751, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=41777 epoch 026: 143 / 1689 loss=3.961, nll_loss=2.318, ppl=4.99, wps=466497, ups=1.07, wpb=435000, bsz=16386.6, num_updates=42300, lr=0.00030751, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=41777 epoch 026: 143 / 1689 loss=3.961, nll_loss=2.318, ppl=4.99, wps=466497, ups=1.07, wpb=435000, bsz=16386.6, num_updates=42300, lr=0.00030751, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=41777 epoch 026: 143 / 1689 loss=3.961, nll_loss=2.318, ppl=4.99, wps=466497, ups=1.07, wpb=435000, bsz=16386.6, num_updates=42300, lr=0.00030751, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=41777 epoch 026: 143 / 1689 loss=3.961, nll_loss=2.318, ppl=4.99, wps=466497, ups=1.07, wpb=435000, bsz=16386.6, num_updates=42300, lr=0.00030751, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=41777 epoch 026: 143 / 1689 loss=3.961, nll_loss=2.318, ppl=4.99, wps=466497, ups=1.07, wpb=435000, bsz=16386.6, num_updates=42300, lr=0.00030751, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=41777 epoch 026: 143 / 1689 loss=3.961, nll_loss=2.318, ppl=4.99, wps=466497, ups=1.07, wpb=435000, bsz=16386.6, num_updates=42300, lr=0.00030751, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=41777 epoch 026: 143 / 1689 loss=3.961, nll_loss=2.318, ppl=4.99, wps=466497, ups=1.07, wpb=435000, bsz=16386.6, num_updates=42300, lr=0.00030751, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=41777 epoch 026: 143 / 1689 loss=3.961, nll_loss=2.318, ppl=4.99, wps=466497, ups=1.07, wpb=435000, bsz=16386.6, num_updates=42300, lr=0.00030751, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=41777 epoch 026: 143 / 1689 loss=3.961, nll_loss=2.318, ppl=4.99, wps=466497, ups=1.07, wpb=435000, bsz=16386.6, num_updates=42300, lr=0.00030751, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=41777 epoch 026: 143 / 1689 loss=3.961, nll_loss=2.318, ppl=4.99, wps=466497, ups=1.07, wpb=435000, bsz=16386.6, num_updates=42300, lr=0.00030751, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=41777 epoch 026: 143 / 1689 loss=3.961, nll_loss=2.318, ppl=4.99, wps=466497, ups=1.07, wpb=435000, bsz=16386.6, num_updates=42300, lr=0.00030751, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=41777 epoch 026: 143 / 1689 loss=3.961, nll_loss=2.318, ppl=4.99, wps=466497, ups=1.07, wpb=435000, bsz=16386.6, num_updates=42300, lr=0.00030751, gnorm=0.209, clip=0, loss_scale=1, train_wall=92, gb_free=18.5, wall=41777 epoch 026: 243 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=463301, ups=1.07, wpb=434532, bsz=16838.8, num_updates=42400, lr=0.000307148, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=41870 epoch 026: 243 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=463301, ups=1.07, wpb=434532, bsz=16838.8, num_updates=42400, lr=0.000307148, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=41870 epoch 026: 243 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=463301, ups=1.07, wpb=434532, bsz=16838.8, num_updates=42400, lr=0.000307148, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=41870 epoch 026: 243 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=463301, ups=1.07, wpb=434532, bsz=16838.8, num_updates=42400, lr=0.000307148, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=41870 epoch 026: 243 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=463301, ups=1.07, wpb=434532, bsz=16838.8, num_updates=42400, lr=0.000307148, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=41870 epoch 026: 243 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=463301, ups=1.07, wpb=434532, bsz=16838.8, num_updates=42400, lr=0.000307148, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=41870 epoch 026: 243 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=463301, ups=1.07, wpb=434532, bsz=16838.8, num_updates=42400, lr=0.000307148, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=41870 epoch 026: 243 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=463301, ups=1.07, wpb=434532, bsz=16838.8, num_updates=42400, lr=0.000307148, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=41870 epoch 026: 243 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=463301, ups=1.07, wpb=434532, bsz=16838.8, num_updates=42400, lr=0.000307148, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=41870 epoch 026: 243 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=463301, ups=1.07, wpb=434532, bsz=16838.8, num_updates=42400, lr=0.000307148, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=41870 epoch 026: 243 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=463301, ups=1.07, wpb=434532, bsz=16838.8, num_updates=42400, lr=0.000307148, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=41870 epoch 026: 243 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=463301, ups=1.07, wpb=434532, bsz=16838.8, num_updates=42400, lr=0.000307148, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=41870 epoch 026: 243 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=463301, ups=1.07, wpb=434532, bsz=16838.8, num_updates=42400, lr=0.000307148, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=41870 epoch 026: 243 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=463301, ups=1.07, wpb=434532, bsz=16838.8, num_updates=42400, lr=0.000307148, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=41870 epoch 026: 243 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=463301, ups=1.07, wpb=434532, bsz=16838.8, num_updates=42400, lr=0.000307148, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=41870 epoch 026: 243 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=463301, ups=1.07, wpb=434532, bsz=16838.8, num_updates=42400, lr=0.000307148, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=41870 epoch 026: 243 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=463301, ups=1.07, wpb=434532, bsz=16838.8, num_updates=42400, lr=0.000307148, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=41870 epoch 026: 243 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=463301, ups=1.07, wpb=434532, bsz=16838.8, num_updates=42400, lr=0.000307148, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=41870 epoch 026: 243 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=463301, ups=1.07, wpb=434532, bsz=16838.8, num_updates=42400, lr=0.000307148, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=41870 epoch 026: 243 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=463301, ups=1.07, wpb=434532, bsz=16838.8, num_updates=42400, lr=0.000307148, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=41870 epoch 026: 243 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=463301, ups=1.07, wpb=434532, bsz=16838.8, num_updates=42400, lr=0.000307148, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=41870 epoch 026: 243 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=463301, ups=1.07, wpb=434532, bsz=16838.8, num_updates=42400, lr=0.000307148, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=41870 epoch 026: 243 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=463301, ups=1.07, wpb=434532, bsz=16838.8, num_updates=42400, lr=0.000307148, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=41870 epoch 026: 243 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=463301, ups=1.07, wpb=434532, bsz=16838.8, num_updates=42400, lr=0.000307148, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=41870 epoch 026: 243 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=463301, ups=1.07, wpb=434532, bsz=16838.8, num_updates=42400, lr=0.000307148, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=41870 epoch 026: 243 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=463301, ups=1.07, wpb=434532, bsz=16838.8, num_updates=42400, lr=0.000307148, gnorm=0.209, clip=0, loss_scale=1, train_wall=93, gb_free=18.9, wall=41870 epoch 026: 343 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=463328, ups=1.06, wpb=435245, bsz=16626.8, num_updates=42500, lr=0.000306786, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=41964 epoch 026: 343 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=463328, ups=1.06, wpb=435245, bsz=16626.8, num_updates=42500, lr=0.000306786, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=41964 epoch 026: 343 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=463328, ups=1.06, wpb=435245, bsz=16626.8, num_updates=42500, lr=0.000306786, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=41964 epoch 026: 343 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=463328, ups=1.06, wpb=435245, bsz=16626.8, num_updates=42500, lr=0.000306786, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=41964 epoch 026: 343 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=463328, ups=1.06, wpb=435245, bsz=16626.8, num_updates=42500, lr=0.000306786, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=41964 epoch 026: 343 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=463328, ups=1.06, wpb=435245, bsz=16626.8, num_updates=42500, lr=0.000306786, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=41964 epoch 026: 343 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=463328, ups=1.06, wpb=435245, bsz=16626.8, num_updates=42500, lr=0.000306786, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=41964 epoch 026: 343 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=463328, ups=1.06, wpb=435245, bsz=16626.8, num_updates=42500, lr=0.000306786, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=41964 epoch 026: 343 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=463328, ups=1.06, wpb=435245, bsz=16626.8, num_updates=42500, lr=0.000306786, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=41964 epoch 026: 343 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=463328, ups=1.06, wpb=435245, bsz=16626.8, num_updates=42500, lr=0.000306786, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=41964 epoch 026: 343 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=463328, ups=1.06, wpb=435245, bsz=16626.8, num_updates=42500, lr=0.000306786, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=41964 epoch 026: 343 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=463328, ups=1.06, wpb=435245, bsz=16626.8, num_updates=42500, lr=0.000306786, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=41964 epoch 026: 343 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=463328, ups=1.06, wpb=435245, bsz=16626.8, num_updates=42500, lr=0.000306786, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=41964 epoch 026: 343 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=463328, ups=1.06, wpb=435245, bsz=16626.8, num_updates=42500, lr=0.000306786, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=41964 epoch 026: 343 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=463328, ups=1.06, wpb=435245, bsz=16626.8, num_updates=42500, lr=0.000306786, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=41964 epoch 026: 343 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=463328, ups=1.06, wpb=435245, bsz=16626.8, num_updates=42500, lr=0.000306786, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=41964 epoch 026: 343 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=463328, ups=1.06, wpb=435245, bsz=16626.8, num_updates=42500, lr=0.000306786, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=41964 epoch 026: 343 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=463328, ups=1.06, wpb=435245, bsz=16626.8, num_updates=42500, lr=0.000306786, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=41964 epoch 026: 343 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=463328, ups=1.06, wpb=435245, bsz=16626.8, num_updates=42500, lr=0.000306786, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=41964 epoch 026: 343 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=463328, ups=1.06, wpb=435245, bsz=16626.8, num_updates=42500, lr=0.000306786, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=41964 epoch 026: 343 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=463328, ups=1.06, wpb=435245, bsz=16626.8, num_updates=42500, lr=0.000306786, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=41964 epoch 026: 343 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=463328, ups=1.06, wpb=435245, bsz=16626.8, num_updates=42500, lr=0.000306786, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=41964 epoch 026: 343 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=463328, ups=1.06, wpb=435245, bsz=16626.8, num_updates=42500, lr=0.000306786, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=41964 epoch 026: 343 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=463328, ups=1.06, wpb=435245, bsz=16626.8, num_updates=42500, lr=0.000306786, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=41964 epoch 026: 343 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=463328, ups=1.06, wpb=435245, bsz=16626.8, num_updates=42500, lr=0.000306786, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=41964 epoch 026: 343 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=463328, ups=1.06, wpb=435245, bsz=16626.8, num_updates=42500, lr=0.000306786, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=41964 epoch 026: 443 / 1689 loss=3.978, nll_loss=2.337, ppl=5.05, wps=463149, ups=1.07, wpb=433814, bsz=16235, num_updates=42600, lr=0.000306426, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=42058 epoch 026: 443 / 1689 loss=3.978, nll_loss=2.337, ppl=5.05, wps=463149, ups=1.07, wpb=433814, bsz=16235, num_updates=42600, lr=0.000306426, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=42058 epoch 026: 443 / 1689 loss=3.978, nll_loss=2.337, ppl=5.05, wps=463149, ups=1.07, wpb=433814, bsz=16235, num_updates=42600, lr=0.000306426, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=42058 epoch 026: 443 / 1689 loss=3.978, nll_loss=2.337, ppl=5.05, wps=463149, ups=1.07, wpb=433814, bsz=16235, num_updates=42600, lr=0.000306426, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=42058 epoch 026: 443 / 1689 loss=3.978, nll_loss=2.337, ppl=5.05, wps=463149, ups=1.07, wpb=433814, bsz=16235, num_updates=42600, lr=0.000306426, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=42058 epoch 026: 443 / 1689 loss=3.978, nll_loss=2.337, ppl=5.05, wps=463149, ups=1.07, wpb=433814, bsz=16235, num_updates=42600, lr=0.000306426, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=42058 epoch 026: 443 / 1689 loss=3.978, nll_loss=2.337, ppl=5.05, wps=463149, ups=1.07, wpb=433814, bsz=16235, num_updates=42600, lr=0.000306426, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=42058 epoch 026: 443 / 1689 loss=3.978, nll_loss=2.337, ppl=5.05, wps=463149, ups=1.07, wpb=433814, bsz=16235, num_updates=42600, lr=0.000306426, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=42058 epoch 026: 443 / 1689 loss=3.978, nll_loss=2.337, ppl=5.05, wps=463149, ups=1.07, wpb=433814, bsz=16235, num_updates=42600, lr=0.000306426, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=42058 epoch 026: 443 / 1689 loss=3.978, nll_loss=2.337, ppl=5.05, wps=463149, ups=1.07, wpb=433814, bsz=16235, num_updates=42600, lr=0.000306426, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=42058 epoch 026: 443 / 1689 loss=3.978, nll_loss=2.337, ppl=5.05, wps=463149, ups=1.07, wpb=433814, bsz=16235, num_updates=42600, lr=0.000306426, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=42058 epoch 026: 443 / 1689 loss=3.978, nll_loss=2.337, ppl=5.05, wps=463149, ups=1.07, wpb=433814, bsz=16235, num_updates=42600, lr=0.000306426, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=42058 epoch 026: 443 / 1689 loss=3.978, nll_loss=2.337, ppl=5.05, wps=463149, ups=1.07, wpb=433814, bsz=16235, num_updates=42600, lr=0.000306426, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=42058 epoch 026: 443 / 1689 loss=3.978, nll_loss=2.337, ppl=5.05, wps=463149, ups=1.07, wpb=433814, bsz=16235, num_updates=42600, lr=0.000306426, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=42058 epoch 026: 443 / 1689 loss=3.978, nll_loss=2.337, ppl=5.05, wps=463149, ups=1.07, wpb=433814, bsz=16235, num_updates=42600, lr=0.000306426, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=42058 epoch 026: 443 / 1689 loss=3.978, nll_loss=2.337, ppl=5.05, wps=463149, ups=1.07, wpb=433814, bsz=16235, num_updates=42600, lr=0.000306426, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=42058 epoch 026: 443 / 1689 loss=3.978, nll_loss=2.337, ppl=5.05, wps=463149, ups=1.07, wpb=433814, bsz=16235, num_updates=42600, lr=0.000306426, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=42058 epoch 026: 443 / 1689 loss=3.978, nll_loss=2.337, ppl=5.05, wps=463149, ups=1.07, wpb=433814, bsz=16235, num_updates=42600, lr=0.000306426, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=42058 epoch 026: 443 / 1689 loss=3.978, nll_loss=2.337, ppl=5.05, wps=463149, ups=1.07, wpb=433814, bsz=16235, num_updates=42600, lr=0.000306426, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=42058 epoch 026: 443 / 1689 loss=3.978, nll_loss=2.337, ppl=5.05, wps=463149, ups=1.07, wpb=433814, bsz=16235, num_updates=42600, lr=0.000306426, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=42058 epoch 026: 443 / 1689 loss=3.978, nll_loss=2.337, ppl=5.05, wps=463149, ups=1.07, wpb=433814, bsz=16235, num_updates=42600, lr=0.000306426, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=42058 epoch 026: 443 / 1689 loss=3.978, nll_loss=2.337, ppl=5.05, wps=463149, ups=1.07, wpb=433814, bsz=16235, num_updates=42600, lr=0.000306426, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=42058 epoch 026: 443 / 1689 loss=3.978, nll_loss=2.337, ppl=5.05, wps=463149, ups=1.07, wpb=433814, bsz=16235, num_updates=42600, lr=0.000306426, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=42058 epoch 026: 443 / 1689 loss=3.978, nll_loss=2.337, ppl=5.05, wps=463149, ups=1.07, wpb=433814, bsz=16235, num_updates=42600, lr=0.000306426, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=42058 epoch 026: 443 / 1689 loss=3.978, nll_loss=2.337, ppl=5.05, wps=463149, ups=1.07, wpb=433814, bsz=16235, num_updates=42600, lr=0.000306426, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=42058 epoch 026: 443 / 1689 loss=3.978, nll_loss=2.337, ppl=5.05, wps=463149, ups=1.07, wpb=433814, bsz=16235, num_updates=42600, lr=0.000306426, gnorm=0.222, clip=0, loss_scale=1, train_wall=92, gb_free=18.6, wall=42058 epoch 026: 544 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=453093, ups=1.05, wpb=431652, bsz=16672.2, num_updates=42700, lr=0.000306067, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=42153 epoch 026: 544 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=453093, ups=1.05, wpb=431652, bsz=16672.2, num_updates=42700, lr=0.000306067, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=42153 epoch 026: 544 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=453093, ups=1.05, wpb=431652, bsz=16672.2, num_updates=42700, lr=0.000306067, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=42153 epoch 026: 544 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=453093, ups=1.05, wpb=431652, bsz=16672.2, num_updates=42700, lr=0.000306067, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=42153 epoch 026: 544 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=453093, ups=1.05, wpb=431652, bsz=16672.2, num_updates=42700, lr=0.000306067, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=42153 epoch 026: 544 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=453093, ups=1.05, wpb=431652, bsz=16672.2, num_updates=42700, lr=0.000306067, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=42153 epoch 026: 544 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=453093, ups=1.05, wpb=431652, bsz=16672.2, num_updates=42700, lr=0.000306067, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=42153 epoch 026: 544 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=453093, ups=1.05, wpb=431652, bsz=16672.2, num_updates=42700, lr=0.000306067, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=42153 epoch 026: 544 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=453093, ups=1.05, wpb=431652, bsz=16672.2, num_updates=42700, lr=0.000306067, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=42153 epoch 026: 544 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=453093, ups=1.05, wpb=431652, bsz=16672.2, num_updates=42700, lr=0.000306067, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=42153 epoch 026: 544 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=453093, ups=1.05, wpb=431652, bsz=16672.2, num_updates=42700, lr=0.000306067, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=42153 epoch 026: 544 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=453093, ups=1.05, wpb=431652, bsz=16672.2, num_updates=42700, lr=0.000306067, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=42153 epoch 026: 544 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=453093, ups=1.05, wpb=431652, bsz=16672.2, num_updates=42700, lr=0.000306067, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=42153 epoch 026: 544 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=453093, ups=1.05, wpb=431652, bsz=16672.2, num_updates=42700, lr=0.000306067, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=42153 epoch 026: 544 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=453093, ups=1.05, wpb=431652, bsz=16672.2, num_updates=42700, lr=0.000306067, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=42153 epoch 026: 544 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=453093, ups=1.05, wpb=431652, bsz=16672.2, num_updates=42700, lr=0.000306067, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=42153 epoch 026: 544 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=453093, ups=1.05, wpb=431652, bsz=16672.2, num_updates=42700, lr=0.000306067, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=42153 epoch 026: 544 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=453093, ups=1.05, wpb=431652, bsz=16672.2, num_updates=42700, lr=0.000306067, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=42153 epoch 026: 544 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=453093, ups=1.05, wpb=431652, bsz=16672.2, num_updates=42700, lr=0.000306067, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=42153 epoch 026: 544 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=453093, ups=1.05, wpb=431652, bsz=16672.2, num_updates=42700, lr=0.000306067, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=42153 epoch 026: 544 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=453093, ups=1.05, wpb=431652, bsz=16672.2, num_updates=42700, lr=0.000306067, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=42153 epoch 026: 544 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=453093, ups=1.05, wpb=431652, bsz=16672.2, num_updates=42700, lr=0.000306067, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=42153 epoch 026: 544 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=453093, ups=1.05, wpb=431652, bsz=16672.2, num_updates=42700, lr=0.000306067, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=42153 epoch 026: 544 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=453093, ups=1.05, wpb=431652, bsz=16672.2, num_updates=42700, lr=0.000306067, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=42153 epoch 026: 544 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=453093, ups=1.05, wpb=431652, bsz=16672.2, num_updates=42700, lr=0.000306067, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=42153 epoch 026: 544 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=453093, ups=1.05, wpb=431652, bsz=16672.2, num_updates=42700, lr=0.000306067, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=42153 epoch 026: 644 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=459481, ups=1.06, wpb=433645, bsz=16313.9, num_updates=42800, lr=0.000305709, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=42248 epoch 026: 644 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=459481, ups=1.06, wpb=433645, bsz=16313.9, num_updates=42800, lr=0.000305709, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=42248 epoch 026: 644 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=459481, ups=1.06, wpb=433645, bsz=16313.9, num_updates=42800, lr=0.000305709, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=42248 epoch 026: 644 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=459481, ups=1.06, wpb=433645, bsz=16313.9, num_updates=42800, lr=0.000305709, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=42248 epoch 026: 644 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=459481, ups=1.06, wpb=433645, bsz=16313.9, num_updates=42800, lr=0.000305709, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=42248 epoch 026: 644 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=459481, ups=1.06, wpb=433645, bsz=16313.9, num_updates=42800, lr=0.000305709, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=42248 epoch 026: 644 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=459481, ups=1.06, wpb=433645, bsz=16313.9, num_updates=42800, lr=0.000305709, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=42248 epoch 026: 644 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=459481, ups=1.06, wpb=433645, bsz=16313.9, num_updates=42800, lr=0.000305709, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=42248 epoch 026: 644 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=459481, ups=1.06, wpb=433645, bsz=16313.9, num_updates=42800, lr=0.000305709, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=42248 epoch 026: 644 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=459481, ups=1.06, wpb=433645, bsz=16313.9, num_updates=42800, lr=0.000305709, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=42248 epoch 026: 644 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=459481, ups=1.06, wpb=433645, bsz=16313.9, num_updates=42800, lr=0.000305709, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=42248 epoch 026: 644 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=459481, ups=1.06, wpb=433645, bsz=16313.9, num_updates=42800, lr=0.000305709, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=42248 epoch 026: 644 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=459481, ups=1.06, wpb=433645, bsz=16313.9, num_updates=42800, lr=0.000305709, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=42248 epoch 026: 644 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=459481, ups=1.06, wpb=433645, bsz=16313.9, num_updates=42800, lr=0.000305709, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=42248 epoch 026: 644 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=459481, ups=1.06, wpb=433645, bsz=16313.9, num_updates=42800, lr=0.000305709, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=42248 epoch 026: 644 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=459481, ups=1.06, wpb=433645, bsz=16313.9, num_updates=42800, lr=0.000305709, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=42248 epoch 026: 644 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=459481, ups=1.06, wpb=433645, bsz=16313.9, num_updates=42800, lr=0.000305709, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=42248 epoch 026: 644 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=459481, ups=1.06, wpb=433645, bsz=16313.9, num_updates=42800, lr=0.000305709, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=42248 epoch 026: 644 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=459481, ups=1.06, wpb=433645, bsz=16313.9, num_updates=42800, lr=0.000305709, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=42248 epoch 026: 644 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=459481, ups=1.06, wpb=433645, bsz=16313.9, num_updates=42800, lr=0.000305709, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=42248 epoch 026: 644 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=459481, ups=1.06, wpb=433645, bsz=16313.9, num_updates=42800, lr=0.000305709, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=42248 epoch 026: 644 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=459481, ups=1.06, wpb=433645, bsz=16313.9, num_updates=42800, lr=0.000305709, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=42248 epoch 026: 644 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=459481, ups=1.06, wpb=433645, bsz=16313.9, num_updates=42800, lr=0.000305709, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=42248 epoch 026: 644 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=459481, ups=1.06, wpb=433645, bsz=16313.9, num_updates=42800, lr=0.000305709, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=42248 epoch 026: 644 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=459481, ups=1.06, wpb=433645, bsz=16313.9, num_updates=42800, lr=0.000305709, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=42248 epoch 026: 644 / 1689 loss=3.974, nll_loss=2.333, ppl=5.04, wps=459481, ups=1.06, wpb=433645, bsz=16313.9, num_updates=42800, lr=0.000305709, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=42248 epoch 026: 744 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=454394, ups=1.04, wpb=435510, bsz=16571.1, num_updates=42900, lr=0.000305352, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=42344 epoch 026: 744 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=454394, ups=1.04, wpb=435510, bsz=16571.1, num_updates=42900, lr=0.000305352, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=42344 epoch 026: 744 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=454394, ups=1.04, wpb=435510, bsz=16571.1, num_updates=42900, lr=0.000305352, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=42344 epoch 026: 744 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=454394, ups=1.04, wpb=435510, bsz=16571.1, num_updates=42900, lr=0.000305352, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=42344 epoch 026: 744 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=454394, ups=1.04, wpb=435510, bsz=16571.1, num_updates=42900, lr=0.000305352, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=42344 epoch 026: 744 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=454394, ups=1.04, wpb=435510, bsz=16571.1, num_updates=42900, lr=0.000305352, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=42344 epoch 026: 744 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=454394, ups=1.04, wpb=435510, bsz=16571.1, num_updates=42900, lr=0.000305352, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=42344 epoch 026: 744 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=454394, ups=1.04, wpb=435510, bsz=16571.1, num_updates=42900, lr=0.000305352, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=42344 epoch 026: 744 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=454394, ups=1.04, wpb=435510, bsz=16571.1, num_updates=42900, lr=0.000305352, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=42344 epoch 026: 744 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=454394, ups=1.04, wpb=435510, bsz=16571.1, num_updates=42900, lr=0.000305352, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=42344 epoch 026: 744 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=454394, ups=1.04, wpb=435510, bsz=16571.1, num_updates=42900, lr=0.000305352, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=42344 epoch 026: 744 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=454394, ups=1.04, wpb=435510, bsz=16571.1, num_updates=42900, lr=0.000305352, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=42344 epoch 026: 744 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=454394, ups=1.04, wpb=435510, bsz=16571.1, num_updates=42900, lr=0.000305352, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=42344 epoch 026: 744 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=454394, ups=1.04, wpb=435510, bsz=16571.1, num_updates=42900, lr=0.000305352, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=42344 epoch 026: 744 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=454394, ups=1.04, wpb=435510, bsz=16571.1, num_updates=42900, lr=0.000305352, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=42344 epoch 026: 744 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=454394, ups=1.04, wpb=435510, bsz=16571.1, num_updates=42900, lr=0.000305352, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=42344 epoch 026: 744 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=454394, ups=1.04, wpb=435510, bsz=16571.1, num_updates=42900, lr=0.000305352, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=42344 epoch 026: 744 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=454394, ups=1.04, wpb=435510, bsz=16571.1, num_updates=42900, lr=0.000305352, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=42344 epoch 026: 744 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=454394, ups=1.04, wpb=435510, bsz=16571.1, num_updates=42900, lr=0.000305352, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=42344 epoch 026: 744 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=454394, ups=1.04, wpb=435510, bsz=16571.1, num_updates=42900, lr=0.000305352, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=42344 epoch 026: 744 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=454394, ups=1.04, wpb=435510, bsz=16571.1, num_updates=42900, lr=0.000305352, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=42344 epoch 026: 744 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=454394, ups=1.04, wpb=435510, bsz=16571.1, num_updates=42900, lr=0.000305352, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=42344 epoch 026: 744 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=454394, ups=1.04, wpb=435510, bsz=16571.1, num_updates=42900, lr=0.000305352, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=42344 epoch 026: 744 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=454394, ups=1.04, wpb=435510, bsz=16571.1, num_updates=42900, lr=0.000305352, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=42344 epoch 026: 744 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=454394, ups=1.04, wpb=435510, bsz=16571.1, num_updates=42900, lr=0.000305352, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=42344 epoch 026: 744 / 1689 loss=3.987, nll_loss=2.348, ppl=5.09, wps=454394, ups=1.04, wpb=435510, bsz=16571.1, num_updates=42900, lr=0.000305352, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=42344 epoch 026: 844 / 1689 loss=3.988, nll_loss=2.349, ppl=5.1, wps=454909, ups=1.05, wpb=432584, bsz=16482.5, num_updates=43000, lr=0.000304997, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=42439 epoch 026: 844 / 1689 loss=3.988, nll_loss=2.349, ppl=5.1, wps=454909, ups=1.05, wpb=432584, bsz=16482.5, num_updates=43000, lr=0.000304997, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=42439 epoch 026: 844 / 1689 loss=3.988, nll_loss=2.349, ppl=5.1, wps=454909, ups=1.05, wpb=432584, bsz=16482.5, num_updates=43000, lr=0.000304997, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=42439 epoch 026: 844 / 1689 loss=3.988, nll_loss=2.349, ppl=5.1, wps=454909, ups=1.05, wpb=432584, bsz=16482.5, num_updates=43000, lr=0.000304997, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=42439 epoch 026: 844 / 1689 loss=3.988, nll_loss=2.349, ppl=5.1, wps=454909, ups=1.05, wpb=432584, bsz=16482.5, num_updates=43000, lr=0.000304997, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=42439 epoch 026: 844 / 1689 loss=3.988, nll_loss=2.349, ppl=5.1, wps=454909, ups=1.05, wpb=432584, bsz=16482.5, num_updates=43000, lr=0.000304997, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=42439 epoch 026: 844 / 1689 loss=3.988, nll_loss=2.349, ppl=5.1, wps=454909, ups=1.05, wpb=432584, bsz=16482.5, num_updates=43000, lr=0.000304997, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=42439 epoch 026: 844 / 1689 loss=3.988, nll_loss=2.349, ppl=5.1, wps=454909, ups=1.05, wpb=432584, bsz=16482.5, num_updates=43000, lr=0.000304997, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=42439 epoch 026: 844 / 1689 loss=3.988, nll_loss=2.349, ppl=5.1, wps=454909, ups=1.05, wpb=432584, bsz=16482.5, num_updates=43000, lr=0.000304997, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=42439 epoch 026: 844 / 1689 loss=3.988, nll_loss=2.349, ppl=5.1, wps=454909, ups=1.05, wpb=432584, bsz=16482.5, num_updates=43000, lr=0.000304997, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=42439 epoch 026: 844 / 1689 loss=3.988, nll_loss=2.349, ppl=5.1, wps=454909, ups=1.05, wpb=432584, bsz=16482.5, num_updates=43000, lr=0.000304997, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=42439 epoch 026: 844 / 1689 loss=3.988, nll_loss=2.349, ppl=5.1, wps=454909, ups=1.05, wpb=432584, bsz=16482.5, num_updates=43000, lr=0.000304997, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=42439 epoch 026: 844 / 1689 loss=3.988, nll_loss=2.349, ppl=5.1, wps=454909, ups=1.05, wpb=432584, bsz=16482.5, num_updates=43000, lr=0.000304997, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=42439 epoch 026: 844 / 1689 loss=3.988, nll_loss=2.349, ppl=5.1, wps=454909, ups=1.05, wpb=432584, bsz=16482.5, num_updates=43000, lr=0.000304997, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=42439 epoch 026: 844 / 1689 loss=3.988, nll_loss=2.349, ppl=5.1, wps=454909, ups=1.05, wpb=432584, bsz=16482.5, num_updates=43000, lr=0.000304997, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=42439 epoch 026: 844 / 1689 loss=3.988, nll_loss=2.349, ppl=5.1, wps=454909, ups=1.05, wpb=432584, bsz=16482.5, num_updates=43000, lr=0.000304997, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=42439 epoch 026: 844 / 1689 loss=3.988, nll_loss=2.349, ppl=5.1, wps=454909, ups=1.05, wpb=432584, bsz=16482.5, num_updates=43000, lr=0.000304997, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=42439 epoch 026: 844 / 1689 loss=3.988, nll_loss=2.349, ppl=5.1, wps=454909, ups=1.05, wpb=432584, bsz=16482.5, num_updates=43000, lr=0.000304997, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=42439 epoch 026: 844 / 1689 loss=3.988, nll_loss=2.349, ppl=5.1, wps=454909, ups=1.05, wpb=432584, bsz=16482.5, num_updates=43000, lr=0.000304997, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=42439 epoch 026: 844 / 1689 loss=3.988, nll_loss=2.349, ppl=5.1, wps=454909, ups=1.05, wpb=432584, bsz=16482.5, num_updates=43000, lr=0.000304997, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=42439 epoch 026: 844 / 1689 loss=3.988, nll_loss=2.349, ppl=5.1, wps=454909, ups=1.05, wpb=432584, bsz=16482.5, num_updates=43000, lr=0.000304997, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=42439 epoch 026: 844 / 1689 loss=3.988, nll_loss=2.349, ppl=5.1, wps=454909, ups=1.05, wpb=432584, bsz=16482.5, num_updates=43000, lr=0.000304997, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=42439 epoch 026: 844 / 1689 loss=3.988, nll_loss=2.349, ppl=5.1, wps=454909, ups=1.05, wpb=432584, bsz=16482.5, num_updates=43000, lr=0.000304997, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=42439 epoch 026: 844 / 1689 loss=3.988, nll_loss=2.349, ppl=5.1, wps=454909, ups=1.05, wpb=432584, bsz=16482.5, num_updates=43000, lr=0.000304997, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=42439 epoch 026: 844 / 1689 loss=3.988, nll_loss=2.349, ppl=5.1, wps=454909, ups=1.05, wpb=432584, bsz=16482.5, num_updates=43000, lr=0.000304997, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=42439 epoch 026: 844 / 1689 loss=3.988, nll_loss=2.349, ppl=5.1, wps=454909, ups=1.05, wpb=432584, bsz=16482.5, num_updates=43000, lr=0.000304997, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=42439 begin validation on "valid" subset epoch 026 | valid on 'valid' subset | loss 4.246 | nll_loss 2.63 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.245 epoch 026 | valid on 'valid' subset | loss 4.246 | nll_loss 2.63 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.245 epoch 026 | valid on 'valid' subset | loss 4.246 | nll_loss 2.63 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.245 epoch 026 | valid on 'valid' subset | loss 4.246 | nll_loss 2.63 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.245 epoch 026 | valid on 'valid' subset | loss 4.246 | nll_loss 2.63 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.245 epoch 026 | valid on 'valid' subset | loss 4.246 | nll_loss 2.63 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.245 epoch 026 | valid on 'valid' subset | loss 4.246 | nll_loss 2.63 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.245 epoch 026 | valid on 'valid' subset | loss 4.246 | nll_loss 2.63 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.245 epoch 026 | valid on 'valid' subset | loss 4.246 | nll_loss 2.63 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.245 epoch 026 | valid on 'valid' subset | loss 4.246 | nll_loss 2.63 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.245 epoch 026 | valid on 'valid' subset | loss 4.246 | nll_loss 2.63 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.245 epoch 026 | valid on 'valid' subset | loss 4.246 | nll_loss 2.63 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.245 epoch 026 | valid on 'valid' subset | loss 4.246 | nll_loss 2.63 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.245 epoch 026 | valid on 'valid' subset | loss 4.246 | nll_loss 2.63 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.245 epoch 026 | valid on 'valid' subset | loss 4.246 | nll_loss 2.63 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.245 epoch 026 | valid on 'valid' subset | loss 4.246 | nll_loss 2.63 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.245 epoch 026 | valid on 'valid' subset | loss 4.246 | nll_loss 2.63 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.245 epoch 026 | valid on 'valid' subset | loss 4.246 | nll_loss 2.63 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.245 epoch 026 | valid on 'valid' subset | loss 4.246 | nll_loss 2.63 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.245 epoch 026 | valid on 'valid' subset | loss 4.246 | nll_loss 2.63 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.245 epoch 026 | valid on 'valid' subset | loss 4.246 | nll_loss 2.63 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.245 epoch 026 | valid on 'valid' subset | loss 4.246 | nll_loss 2.63 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.245 epoch 026 | valid on 'valid' subset | loss 4.246 | nll_loss 2.63 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.245 epoch 026 | valid on 'valid' subset | loss 4.246 | nll_loss 2.63 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.245 epoch 026 | valid on 'valid' subset | loss 4.246 | nll_loss 2.63 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.245 epoch 026 | valid on 'valid' subset | loss 4.246 | nll_loss 2.63 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 43000 | best_loss 4.245 epoch 026: 944 / 1689 loss=3.999, nll_loss=2.361, ppl=5.14, wps=409922, ups=0.94, wpb=435387, bsz=16486.5, num_updates=43100, lr=0.000304643, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.3, wall=42545 epoch 026: 944 / 1689 loss=3.999, nll_loss=2.361, ppl=5.14, wps=409922, ups=0.94, wpb=435387, bsz=16486.5, num_updates=43100, lr=0.000304643, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.3, wall=42545 epoch 026: 944 / 1689 loss=3.999, nll_loss=2.361, ppl=5.14, wps=409922, ups=0.94, wpb=435387, bsz=16486.5, num_updates=43100, lr=0.000304643, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.3, wall=42545 epoch 026: 944 / 1689 loss=3.999, nll_loss=2.361, ppl=5.14, wps=409922, ups=0.94, wpb=435387, bsz=16486.5, num_updates=43100, lr=0.000304643, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.3, wall=42545 epoch 026: 944 / 1689 loss=3.999, nll_loss=2.361, ppl=5.14, wps=409922, ups=0.94, wpb=435387, bsz=16486.5, num_updates=43100, lr=0.000304643, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.3, wall=42545 epoch 026: 944 / 1689 loss=3.999, nll_loss=2.361, ppl=5.14, wps=409922, ups=0.94, wpb=435387, bsz=16486.5, num_updates=43100, lr=0.000304643, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.3, wall=42545 epoch 026: 944 / 1689 loss=3.999, nll_loss=2.361, ppl=5.14, wps=409922, ups=0.94, wpb=435387, bsz=16486.5, num_updates=43100, lr=0.000304643, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.3, wall=42545 epoch 026: 944 / 1689 loss=3.999, nll_loss=2.361, ppl=5.14, wps=409922, ups=0.94, wpb=435387, bsz=16486.5, num_updates=43100, lr=0.000304643, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.3, wall=42545 epoch 026: 944 / 1689 loss=3.999, nll_loss=2.361, ppl=5.14, wps=409922, ups=0.94, wpb=435387, bsz=16486.5, num_updates=43100, lr=0.000304643, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.3, wall=42545 epoch 026: 944 / 1689 loss=3.999, nll_loss=2.361, ppl=5.14, wps=409922, ups=0.94, wpb=435387, bsz=16486.5, num_updates=43100, lr=0.000304643, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.3, wall=42545 epoch 026: 944 / 1689 loss=3.999, nll_loss=2.361, ppl=5.14, wps=409922, ups=0.94, wpb=435387, bsz=16486.5, num_updates=43100, lr=0.000304643, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.3, wall=42545 epoch 026: 944 / 1689 loss=3.999, nll_loss=2.361, ppl=5.14, wps=409922, ups=0.94, wpb=435387, bsz=16486.5, num_updates=43100, lr=0.000304643, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.3, wall=42545 epoch 026: 944 / 1689 loss=3.999, nll_loss=2.361, ppl=5.14, wps=409922, ups=0.94, wpb=435387, bsz=16486.5, num_updates=43100, lr=0.000304643, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.3, wall=42545 epoch 026: 944 / 1689 loss=3.999, nll_loss=2.361, ppl=5.14, wps=409922, ups=0.94, wpb=435387, bsz=16486.5, num_updates=43100, lr=0.000304643, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.3, wall=42545 epoch 026: 944 / 1689 loss=3.999, nll_loss=2.361, ppl=5.14, wps=409922, ups=0.94, wpb=435387, bsz=16486.5, num_updates=43100, lr=0.000304643, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.3, wall=42545 epoch 026: 944 / 1689 loss=3.999, nll_loss=2.361, ppl=5.14, wps=409922, ups=0.94, wpb=435387, bsz=16486.5, num_updates=43100, lr=0.000304643, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.3, wall=42545 epoch 026: 944 / 1689 loss=3.999, nll_loss=2.361, ppl=5.14, wps=409922, ups=0.94, wpb=435387, bsz=16486.5, num_updates=43100, lr=0.000304643, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.3, wall=42545 epoch 026: 944 / 1689 loss=3.999, nll_loss=2.361, ppl=5.14, wps=409922, ups=0.94, wpb=435387, bsz=16486.5, num_updates=43100, lr=0.000304643, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.3, wall=42545 epoch 026: 944 / 1689 loss=3.999, nll_loss=2.361, ppl=5.14, wps=409922, ups=0.94, wpb=435387, bsz=16486.5, num_updates=43100, lr=0.000304643, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.3, wall=42545 epoch 026: 944 / 1689 loss=3.999, nll_loss=2.361, ppl=5.14, wps=409922, ups=0.94, wpb=435387, bsz=16486.5, num_updates=43100, lr=0.000304643, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.3, wall=42545 epoch 026: 944 / 1689 loss=3.999, nll_loss=2.361, ppl=5.14, wps=409922, ups=0.94, wpb=435387, bsz=16486.5, num_updates=43100, lr=0.000304643, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.3, wall=42545 epoch 026: 944 / 1689 loss=3.999, nll_loss=2.361, ppl=5.14, wps=409922, ups=0.94, wpb=435387, bsz=16486.5, num_updates=43100, lr=0.000304643, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.3, wall=42545 epoch 026: 944 / 1689 loss=3.999, nll_loss=2.361, ppl=5.14, wps=409922, ups=0.94, wpb=435387, bsz=16486.5, num_updates=43100, lr=0.000304643, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.3, wall=42545 epoch 026: 944 / 1689 loss=3.999, nll_loss=2.361, ppl=5.14, wps=409922, ups=0.94, wpb=435387, bsz=16486.5, num_updates=43100, lr=0.000304643, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.3, wall=42545 epoch 026: 944 / 1689 loss=3.999, nll_loss=2.361, ppl=5.14, wps=409922, ups=0.94, wpb=435387, bsz=16486.5, num_updates=43100, lr=0.000304643, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.3, wall=42545 epoch 026: 944 / 1689 loss=3.999, nll_loss=2.361, ppl=5.14, wps=409922, ups=0.94, wpb=435387, bsz=16486.5, num_updates=43100, lr=0.000304643, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.3, wall=42545 epoch 026: 1044 / 1689 loss=3.977, nll_loss=2.337, ppl=5.05, wps=456928, ups=1.06, wpb=431543, bsz=16317.4, num_updates=43200, lr=0.00030429, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=42639 epoch 026: 1044 / 1689 loss=3.977, nll_loss=2.337, ppl=5.05, wps=456928, ups=1.06, wpb=431543, bsz=16317.4, num_updates=43200, lr=0.00030429, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=42639 epoch 026: 1044 / 1689 loss=3.977, nll_loss=2.337, ppl=5.05, wps=456928, ups=1.06, wpb=431543, bsz=16317.4, num_updates=43200, lr=0.00030429, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=42639 epoch 026: 1044 / 1689 loss=3.977, nll_loss=2.337, ppl=5.05, wps=456928, ups=1.06, wpb=431543, bsz=16317.4, num_updates=43200, lr=0.00030429, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=42639 epoch 026: 1044 / 1689 loss=3.977, nll_loss=2.337, ppl=5.05, wps=456928, ups=1.06, wpb=431543, bsz=16317.4, num_updates=43200, lr=0.00030429, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=42639 epoch 026: 1044 / 1689 loss=3.977, nll_loss=2.337, ppl=5.05, wps=456928, ups=1.06, wpb=431543, bsz=16317.4, num_updates=43200, lr=0.00030429, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=42639 epoch 026: 1044 / 1689 loss=3.977, nll_loss=2.337, ppl=5.05, wps=456928, ups=1.06, wpb=431543, bsz=16317.4, num_updates=43200, lr=0.00030429, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=42639 epoch 026: 1044 / 1689 loss=3.977, nll_loss=2.337, ppl=5.05, wps=456928, ups=1.06, wpb=431543, bsz=16317.4, num_updates=43200, lr=0.00030429, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=42639 epoch 026: 1044 / 1689 loss=3.977, nll_loss=2.337, ppl=5.05, wps=456928, ups=1.06, wpb=431543, bsz=16317.4, num_updates=43200, lr=0.00030429, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=42639 epoch 026: 1044 / 1689 loss=3.977, nll_loss=2.337, ppl=5.05, wps=456928, ups=1.06, wpb=431543, bsz=16317.4, num_updates=43200, lr=0.00030429, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=42639 epoch 026: 1044 / 1689 loss=3.977, nll_loss=2.337, ppl=5.05, wps=456928, ups=1.06, wpb=431543, bsz=16317.4, num_updates=43200, lr=0.00030429, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=42639 epoch 026: 1044 / 1689 loss=3.977, nll_loss=2.337, ppl=5.05, wps=456928, ups=1.06, wpb=431543, bsz=16317.4, num_updates=43200, lr=0.00030429, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=42639 epoch 026: 1044 / 1689 loss=3.977, nll_loss=2.337, ppl=5.05, wps=456928, ups=1.06, wpb=431543, bsz=16317.4, num_updates=43200, lr=0.00030429, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=42639 epoch 026: 1044 / 1689 loss=3.977, nll_loss=2.337, ppl=5.05, wps=456928, ups=1.06, wpb=431543, bsz=16317.4, num_updates=43200, lr=0.00030429, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=42639 epoch 026: 1044 / 1689 loss=3.977, nll_loss=2.337, ppl=5.05, wps=456928, ups=1.06, wpb=431543, bsz=16317.4, num_updates=43200, lr=0.00030429, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=42639 epoch 026: 1044 / 1689 loss=3.977, nll_loss=2.337, ppl=5.05, wps=456928, ups=1.06, wpb=431543, bsz=16317.4, num_updates=43200, lr=0.00030429, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=42639 epoch 026: 1044 / 1689 loss=3.977, nll_loss=2.337, ppl=5.05, wps=456928, ups=1.06, wpb=431543, bsz=16317.4, num_updates=43200, lr=0.00030429, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=42639 epoch 026: 1044 / 1689 loss=3.977, nll_loss=2.337, ppl=5.05, wps=456928, ups=1.06, wpb=431543, bsz=16317.4, num_updates=43200, lr=0.00030429, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=42639 epoch 026: 1044 / 1689 loss=3.977, nll_loss=2.337, ppl=5.05, wps=456928, ups=1.06, wpb=431543, bsz=16317.4, num_updates=43200, lr=0.00030429, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=42639 epoch 026: 1044 / 1689 loss=3.977, nll_loss=2.337, ppl=5.05, wps=456928, ups=1.06, wpb=431543, bsz=16317.4, num_updates=43200, lr=0.00030429, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=42639 epoch 026: 1044 / 1689 loss=3.977, nll_loss=2.337, ppl=5.05, wps=456928, ups=1.06, wpb=431543, bsz=16317.4, num_updates=43200, lr=0.00030429, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=42639 epoch 026: 1044 / 1689 loss=3.977, nll_loss=2.337, ppl=5.05, wps=456928, ups=1.06, wpb=431543, bsz=16317.4, num_updates=43200, lr=0.00030429, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=42639 epoch 026: 1044 / 1689 loss=3.977, nll_loss=2.337, ppl=5.05, wps=456928, ups=1.06, wpb=431543, bsz=16317.4, num_updates=43200, lr=0.00030429, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=42639 epoch 026: 1044 / 1689 loss=3.977, nll_loss=2.337, ppl=5.05, wps=456928, ups=1.06, wpb=431543, bsz=16317.4, num_updates=43200, lr=0.00030429, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=42639 epoch 026: 1044 / 1689 loss=3.977, nll_loss=2.337, ppl=5.05, wps=456928, ups=1.06, wpb=431543, bsz=16317.4, num_updates=43200, lr=0.00030429, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=42639 epoch 026: 1044 / 1689 loss=3.977, nll_loss=2.337, ppl=5.05, wps=456928, ups=1.06, wpb=431543, bsz=16317.4, num_updates=43200, lr=0.00030429, gnorm=0.216, clip=0, loss_scale=1, train_wall=92, gb_free=19.3, wall=42639 epoch 026: 1144 / 1689 loss=3.997, nll_loss=2.36, ppl=5.13, wps=457312, ups=1.06, wpb=432084, bsz=16963.5, num_updates=43300, lr=0.000303939, gnorm=0.226, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=42734 epoch 026: 1144 / 1689 loss=3.997, nll_loss=2.36, ppl=5.13, wps=457312, ups=1.06, wpb=432084, bsz=16963.5, num_updates=43300, lr=0.000303939, gnorm=0.226, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=42734 epoch 026: 1144 / 1689 loss=3.997, nll_loss=2.36, ppl=5.13, wps=457312, ups=1.06, wpb=432084, bsz=16963.5, num_updates=43300, lr=0.000303939, gnorm=0.226, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=42734 epoch 026: 1144 / 1689 loss=3.997, nll_loss=2.36, ppl=5.13, wps=457312, ups=1.06, wpb=432084, bsz=16963.5, num_updates=43300, lr=0.000303939, gnorm=0.226, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=42734 epoch 026: 1144 / 1689 loss=3.997, nll_loss=2.36, ppl=5.13, wps=457312, ups=1.06, wpb=432084, bsz=16963.5, num_updates=43300, lr=0.000303939, gnorm=0.226, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=42734 epoch 026: 1144 / 1689 loss=3.997, nll_loss=2.36, ppl=5.13, wps=457312, ups=1.06, wpb=432084, bsz=16963.5, num_updates=43300, lr=0.000303939, gnorm=0.226, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=42734 epoch 026: 1144 / 1689 loss=3.997, nll_loss=2.36, ppl=5.13, wps=457312, ups=1.06, wpb=432084, bsz=16963.5, num_updates=43300, lr=0.000303939, gnorm=0.226, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=42734 epoch 026: 1144 / 1689 loss=3.997, nll_loss=2.36, ppl=5.13, wps=457312, ups=1.06, wpb=432084, bsz=16963.5, num_updates=43300, lr=0.000303939, gnorm=0.226, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=42734 epoch 026: 1144 / 1689 loss=3.997, nll_loss=2.36, ppl=5.13, wps=457312, ups=1.06, wpb=432084, bsz=16963.5, num_updates=43300, lr=0.000303939, gnorm=0.226, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=42734 epoch 026: 1144 / 1689 loss=3.997, nll_loss=2.36, ppl=5.13, wps=457312, ups=1.06, wpb=432084, bsz=16963.5, num_updates=43300, lr=0.000303939, gnorm=0.226, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=42734 epoch 026: 1144 / 1689 loss=3.997, nll_loss=2.36, ppl=5.13, wps=457312, ups=1.06, wpb=432084, bsz=16963.5, num_updates=43300, lr=0.000303939, gnorm=0.226, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=42734 epoch 026: 1144 / 1689 loss=3.997, nll_loss=2.36, ppl=5.13, wps=457312, ups=1.06, wpb=432084, bsz=16963.5, num_updates=43300, lr=0.000303939, gnorm=0.226, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=42734 epoch 026: 1144 / 1689 loss=3.997, nll_loss=2.36, ppl=5.13, wps=457312, ups=1.06, wpb=432084, bsz=16963.5, num_updates=43300, lr=0.000303939, gnorm=0.226, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=42734 epoch 026: 1144 / 1689 loss=3.997, nll_loss=2.36, ppl=5.13, wps=457312, ups=1.06, wpb=432084, bsz=16963.5, num_updates=43300, lr=0.000303939, gnorm=0.226, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=42734 epoch 026: 1144 / 1689 loss=3.997, nll_loss=2.36, ppl=5.13, wps=457312, ups=1.06, wpb=432084, bsz=16963.5, num_updates=43300, lr=0.000303939, gnorm=0.226, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=42734 epoch 026: 1144 / 1689 loss=3.997, nll_loss=2.36, ppl=5.13, wps=457312, ups=1.06, wpb=432084, bsz=16963.5, num_updates=43300, lr=0.000303939, gnorm=0.226, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=42734 epoch 026: 1144 / 1689 loss=3.997, nll_loss=2.36, ppl=5.13, wps=457312, ups=1.06, wpb=432084, bsz=16963.5, num_updates=43300, lr=0.000303939, gnorm=0.226, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=42734 epoch 026: 1144 / 1689 loss=3.997, nll_loss=2.36, ppl=5.13, wps=457312, ups=1.06, wpb=432084, bsz=16963.5, num_updates=43300, lr=0.000303939, gnorm=0.226, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=42734 epoch 026: 1144 / 1689 loss=3.997, nll_loss=2.36, ppl=5.13, wps=457312, ups=1.06, wpb=432084, bsz=16963.5, num_updates=43300, lr=0.000303939, gnorm=0.226, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=42734 epoch 026: 1144 / 1689 loss=3.997, nll_loss=2.36, ppl=5.13, wps=457312, ups=1.06, wpb=432084, bsz=16963.5, num_updates=43300, lr=0.000303939, gnorm=0.226, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=42734 epoch 026: 1144 / 1689 loss=3.997, nll_loss=2.36, ppl=5.13, wps=457312, ups=1.06, wpb=432084, bsz=16963.5, num_updates=43300, lr=0.000303939, gnorm=0.226, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=42734 epoch 026: 1144 / 1689 loss=3.997, nll_loss=2.36, ppl=5.13, wps=457312, ups=1.06, wpb=432084, bsz=16963.5, num_updates=43300, lr=0.000303939, gnorm=0.226, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=42734 epoch 026: 1144 / 1689 loss=3.997, nll_loss=2.36, ppl=5.13, wps=457312, ups=1.06, wpb=432084, bsz=16963.5, num_updates=43300, lr=0.000303939, gnorm=0.226, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=42734 epoch 026: 1144 / 1689 loss=3.997, nll_loss=2.36, ppl=5.13, wps=457312, ups=1.06, wpb=432084, bsz=16963.5, num_updates=43300, lr=0.000303939, gnorm=0.226, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=42734 epoch 026: 1144 / 1689 loss=3.997, nll_loss=2.36, ppl=5.13, wps=457312, ups=1.06, wpb=432084, bsz=16963.5, num_updates=43300, lr=0.000303939, gnorm=0.226, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=42734 epoch 026: 1144 / 1689 loss=3.997, nll_loss=2.36, ppl=5.13, wps=457312, ups=1.06, wpb=432084, bsz=16963.5, num_updates=43300, lr=0.000303939, gnorm=0.226, clip=0, loss_scale=1, train_wall=93, gb_free=19.6, wall=42734 epoch 026: 1245 / 1689 loss=3.987, nll_loss=2.349, ppl=5.09, wps=460529, ups=1.06, wpb=433196, bsz=16193.4, num_updates=43400, lr=0.000303588, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=42828 epoch 026: 1245 / 1689 loss=3.987, nll_loss=2.349, ppl=5.09, wps=460529, ups=1.06, wpb=433196, bsz=16193.4, num_updates=43400, lr=0.000303588, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=42828 epoch 026: 1245 / 1689 loss=3.987, nll_loss=2.349, ppl=5.09, wps=460529, ups=1.06, wpb=433196, bsz=16193.4, num_updates=43400, lr=0.000303588, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=42828 epoch 026: 1245 / 1689 loss=3.987, nll_loss=2.349, ppl=5.09, wps=460529, ups=1.06, wpb=433196, bsz=16193.4, num_updates=43400, lr=0.000303588, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=42828 epoch 026: 1245 / 1689 loss=3.987, nll_loss=2.349, ppl=5.09, wps=460529, ups=1.06, wpb=433196, bsz=16193.4, num_updates=43400, lr=0.000303588, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=42828 epoch 026: 1245 / 1689 loss=3.987, nll_loss=2.349, ppl=5.09, wps=460529, ups=1.06, wpb=433196, bsz=16193.4, num_updates=43400, lr=0.000303588, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=42828 epoch 026: 1245 / 1689 loss=3.987, nll_loss=2.349, ppl=5.09, wps=460529, ups=1.06, wpb=433196, bsz=16193.4, num_updates=43400, lr=0.000303588, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=42828 epoch 026: 1245 / 1689 loss=3.987, nll_loss=2.349, ppl=5.09, wps=460529, ups=1.06, wpb=433196, bsz=16193.4, num_updates=43400, lr=0.000303588, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=42828 epoch 026: 1245 / 1689 loss=3.987, nll_loss=2.349, ppl=5.09, wps=460529, ups=1.06, wpb=433196, bsz=16193.4, num_updates=43400, lr=0.000303588, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=42828 epoch 026: 1245 / 1689 loss=3.987, nll_loss=2.349, ppl=5.09, wps=460529, ups=1.06, wpb=433196, bsz=16193.4, num_updates=43400, lr=0.000303588, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=42828 epoch 026: 1245 / 1689 loss=3.987, nll_loss=2.349, ppl=5.09, wps=460529, ups=1.06, wpb=433196, bsz=16193.4, num_updates=43400, lr=0.000303588, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=42828 epoch 026: 1245 / 1689 loss=3.987, nll_loss=2.349, ppl=5.09, wps=460529, ups=1.06, wpb=433196, bsz=16193.4, num_updates=43400, lr=0.000303588, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=42828 epoch 026: 1245 / 1689 loss=3.987, nll_loss=2.349, ppl=5.09, wps=460529, ups=1.06, wpb=433196, bsz=16193.4, num_updates=43400, lr=0.000303588, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=42828 epoch 026: 1245 / 1689 loss=3.987, nll_loss=2.349, ppl=5.09, wps=460529, ups=1.06, wpb=433196, bsz=16193.4, num_updates=43400, lr=0.000303588, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=42828 epoch 026: 1245 / 1689 loss=3.987, nll_loss=2.349, ppl=5.09, wps=460529, ups=1.06, wpb=433196, bsz=16193.4, num_updates=43400, lr=0.000303588, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=42828 epoch 026: 1245 / 1689 loss=3.987, nll_loss=2.349, ppl=5.09, wps=460529, ups=1.06, wpb=433196, bsz=16193.4, num_updates=43400, lr=0.000303588, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=42828 epoch 026: 1245 / 1689 loss=3.987, nll_loss=2.349, ppl=5.09, wps=460529, ups=1.06, wpb=433196, bsz=16193.4, num_updates=43400, lr=0.000303588, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=42828 epoch 026: 1245 / 1689 loss=3.987, nll_loss=2.349, ppl=5.09, wps=460529, ups=1.06, wpb=433196, bsz=16193.4, num_updates=43400, lr=0.000303588, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=42828 epoch 026: 1245 / 1689 loss=3.987, nll_loss=2.349, ppl=5.09, wps=460529, ups=1.06, wpb=433196, bsz=16193.4, num_updates=43400, lr=0.000303588, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=42828 epoch 026: 1245 / 1689 loss=3.987, nll_loss=2.349, ppl=5.09, wps=460529, ups=1.06, wpb=433196, bsz=16193.4, num_updates=43400, lr=0.000303588, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=42828 epoch 026: 1245 / 1689 loss=3.987, nll_loss=2.349, ppl=5.09, wps=460529, ups=1.06, wpb=433196, bsz=16193.4, num_updates=43400, lr=0.000303588, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=42828 epoch 026: 1245 / 1689 loss=3.987, nll_loss=2.349, ppl=5.09, wps=460529, ups=1.06, wpb=433196, bsz=16193.4, num_updates=43400, lr=0.000303588, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=42828 epoch 026: 1245 / 1689 loss=3.987, nll_loss=2.349, ppl=5.09, wps=460529, ups=1.06, wpb=433196, bsz=16193.4, num_updates=43400, lr=0.000303588, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=42828 epoch 026: 1245 / 1689 loss=3.987, nll_loss=2.349, ppl=5.09, wps=460529, ups=1.06, wpb=433196, bsz=16193.4, num_updates=43400, lr=0.000303588, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=42828 epoch 026: 1245 / 1689 loss=3.987, nll_loss=2.349, ppl=5.09, wps=460529, ups=1.06, wpb=433196, bsz=16193.4, num_updates=43400, lr=0.000303588, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=42828 epoch 026: 1245 / 1689 loss=3.987, nll_loss=2.349, ppl=5.09, wps=460529, ups=1.06, wpb=433196, bsz=16193.4, num_updates=43400, lr=0.000303588, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=42828 epoch 026: 1345 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=460139, ups=1.06, wpb=432593, bsz=16449.2, num_updates=43500, lr=0.000303239, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=42922 epoch 026: 1345 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=460139, ups=1.06, wpb=432593, bsz=16449.2, num_updates=43500, lr=0.000303239, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=42922 epoch 026: 1345 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=460139, ups=1.06, wpb=432593, bsz=16449.2, num_updates=43500, lr=0.000303239, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=42922 epoch 026: 1345 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=460139, ups=1.06, wpb=432593, bsz=16449.2, num_updates=43500, lr=0.000303239, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=42922 epoch 026: 1345 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=460139, ups=1.06, wpb=432593, bsz=16449.2, num_updates=43500, lr=0.000303239, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=42922 epoch 026: 1345 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=460139, ups=1.06, wpb=432593, bsz=16449.2, num_updates=43500, lr=0.000303239, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=42922 epoch 026: 1345 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=460139, ups=1.06, wpb=432593, bsz=16449.2, num_updates=43500, lr=0.000303239, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=42922 epoch 026: 1345 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=460139, ups=1.06, wpb=432593, bsz=16449.2, num_updates=43500, lr=0.000303239, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=42922 epoch 026: 1345 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=460139, ups=1.06, wpb=432593, bsz=16449.2, num_updates=43500, lr=0.000303239, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=42922 epoch 026: 1345 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=460139, ups=1.06, wpb=432593, bsz=16449.2, num_updates=43500, lr=0.000303239, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=42922 epoch 026: 1345 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=460139, ups=1.06, wpb=432593, bsz=16449.2, num_updates=43500, lr=0.000303239, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=42922 epoch 026: 1345 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=460139, ups=1.06, wpb=432593, bsz=16449.2, num_updates=43500, lr=0.000303239, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=42922 epoch 026: 1345 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=460139, ups=1.06, wpb=432593, bsz=16449.2, num_updates=43500, lr=0.000303239, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=42922 epoch 026: 1345 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=460139, ups=1.06, wpb=432593, bsz=16449.2, num_updates=43500, lr=0.000303239, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=42922 epoch 026: 1345 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=460139, ups=1.06, wpb=432593, bsz=16449.2, num_updates=43500, lr=0.000303239, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=42922 epoch 026: 1345 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=460139, ups=1.06, wpb=432593, bsz=16449.2, num_updates=43500, lr=0.000303239, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=42922 epoch 026: 1345 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=460139, ups=1.06, wpb=432593, bsz=16449.2, num_updates=43500, lr=0.000303239, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=42922 epoch 026: 1345 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=460139, ups=1.06, wpb=432593, bsz=16449.2, num_updates=43500, lr=0.000303239, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=42922 epoch 026: 1345 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=460139, ups=1.06, wpb=432593, bsz=16449.2, num_updates=43500, lr=0.000303239, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=42922 epoch 026: 1345 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=460139, ups=1.06, wpb=432593, bsz=16449.2, num_updates=43500, lr=0.000303239, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=42922 epoch 026: 1345 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=460139, ups=1.06, wpb=432593, bsz=16449.2, num_updates=43500, lr=0.000303239, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=42922 epoch 026: 1345 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=460139, ups=1.06, wpb=432593, bsz=16449.2, num_updates=43500, lr=0.000303239, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=42922 epoch 026: 1345 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=460139, ups=1.06, wpb=432593, bsz=16449.2, num_updates=43500, lr=0.000303239, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=42922 epoch 026: 1345 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=460139, ups=1.06, wpb=432593, bsz=16449.2, num_updates=43500, lr=0.000303239, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=42922 epoch 026: 1345 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=460139, ups=1.06, wpb=432593, bsz=16449.2, num_updates=43500, lr=0.000303239, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=42922 epoch 026: 1345 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=460139, ups=1.06, wpb=432593, bsz=16449.2, num_updates=43500, lr=0.000303239, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=42922 epoch 026: 1445 / 1689 loss=3.993, nll_loss=2.356, ppl=5.12, wps=462285, ups=1.06, wpb=435376, bsz=16765.8, num_updates=43600, lr=0.000302891, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=43016 epoch 026: 1445 / 1689 loss=3.993, nll_loss=2.356, ppl=5.12, wps=462285, ups=1.06, wpb=435376, bsz=16765.8, num_updates=43600, lr=0.000302891, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=43016 epoch 026: 1445 / 1689 loss=3.993, nll_loss=2.356, ppl=5.12, wps=462285, ups=1.06, wpb=435376, bsz=16765.8, num_updates=43600, lr=0.000302891, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=43016 epoch 026: 1445 / 1689 loss=3.993, nll_loss=2.356, ppl=5.12, wps=462285, ups=1.06, wpb=435376, bsz=16765.8, num_updates=43600, lr=0.000302891, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=43016 epoch 026: 1445 / 1689 loss=3.993, nll_loss=2.356, ppl=5.12, wps=462285, ups=1.06, wpb=435376, bsz=16765.8, num_updates=43600, lr=0.000302891, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=43016 epoch 026: 1445 / 1689 loss=3.993, nll_loss=2.356, ppl=5.12, wps=462285, ups=1.06, wpb=435376, bsz=16765.8, num_updates=43600, lr=0.000302891, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=43016 epoch 026: 1445 / 1689 loss=3.993, nll_loss=2.356, ppl=5.12, wps=462285, ups=1.06, wpb=435376, bsz=16765.8, num_updates=43600, lr=0.000302891, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=43016 epoch 026: 1445 / 1689 loss=3.993, nll_loss=2.356, ppl=5.12, wps=462285, ups=1.06, wpb=435376, bsz=16765.8, num_updates=43600, lr=0.000302891, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=43016 epoch 026: 1445 / 1689 loss=3.993, nll_loss=2.356, ppl=5.12, wps=462285, ups=1.06, wpb=435376, bsz=16765.8, num_updates=43600, lr=0.000302891, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=43016 epoch 026: 1445 / 1689 loss=3.993, nll_loss=2.356, ppl=5.12, wps=462285, ups=1.06, wpb=435376, bsz=16765.8, num_updates=43600, lr=0.000302891, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=43016 epoch 026: 1445 / 1689 loss=3.993, nll_loss=2.356, ppl=5.12, wps=462285, ups=1.06, wpb=435376, bsz=16765.8, num_updates=43600, lr=0.000302891, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=43016 epoch 026: 1445 / 1689 loss=3.993, nll_loss=2.356, ppl=5.12, wps=462285, ups=1.06, wpb=435376, bsz=16765.8, num_updates=43600, lr=0.000302891, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=43016 epoch 026: 1445 / 1689 loss=3.993, nll_loss=2.356, ppl=5.12, wps=462285, ups=1.06, wpb=435376, bsz=16765.8, num_updates=43600, lr=0.000302891, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=43016 epoch 026: 1445 / 1689 loss=3.993, nll_loss=2.356, ppl=5.12, wps=462285, ups=1.06, wpb=435376, bsz=16765.8, num_updates=43600, lr=0.000302891, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=43016 epoch 026: 1445 / 1689 loss=3.993, nll_loss=2.356, ppl=5.12, wps=462285, ups=1.06, wpb=435376, bsz=16765.8, num_updates=43600, lr=0.000302891, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=43016 epoch 026: 1445 / 1689 loss=3.993, nll_loss=2.356, ppl=5.12, wps=462285, ups=1.06, wpb=435376, bsz=16765.8, num_updates=43600, lr=0.000302891, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=43016 epoch 026: 1445 / 1689 loss=3.993, nll_loss=2.356, ppl=5.12, wps=462285, ups=1.06, wpb=435376, bsz=16765.8, num_updates=43600, lr=0.000302891, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=43016 epoch 026: 1445 / 1689 loss=3.993, nll_loss=2.356, ppl=5.12, wps=462285, ups=1.06, wpb=435376, bsz=16765.8, num_updates=43600, lr=0.000302891, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=43016 epoch 026: 1445 / 1689 loss=3.993, nll_loss=2.356, ppl=5.12, wps=462285, ups=1.06, wpb=435376, bsz=16765.8, num_updates=43600, lr=0.000302891, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=43016 epoch 026: 1445 / 1689 loss=3.993, nll_loss=2.356, ppl=5.12, wps=462285, ups=1.06, wpb=435376, bsz=16765.8, num_updates=43600, lr=0.000302891, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=43016 epoch 026: 1445 / 1689 loss=3.993, nll_loss=2.356, ppl=5.12, wps=462285, ups=1.06, wpb=435376, bsz=16765.8, num_updates=43600, lr=0.000302891, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=43016 epoch 026: 1445 / 1689 loss=3.993, nll_loss=2.356, ppl=5.12, wps=462285, ups=1.06, wpb=435376, bsz=16765.8, num_updates=43600, lr=0.000302891, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=43016 epoch 026: 1445 / 1689 loss=3.993, nll_loss=2.356, ppl=5.12, wps=462285, ups=1.06, wpb=435376, bsz=16765.8, num_updates=43600, lr=0.000302891, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=43016 epoch 026: 1445 / 1689 loss=3.993, nll_loss=2.356, ppl=5.12, wps=462285, ups=1.06, wpb=435376, bsz=16765.8, num_updates=43600, lr=0.000302891, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=43016 epoch 026: 1445 / 1689 loss=3.993, nll_loss=2.356, ppl=5.12, wps=462285, ups=1.06, wpb=435376, bsz=16765.8, num_updates=43600, lr=0.000302891, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=43016 epoch 026: 1445 / 1689 loss=3.993, nll_loss=2.356, ppl=5.12, wps=462285, ups=1.06, wpb=435376, bsz=16765.8, num_updates=43600, lr=0.000302891, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=43016 epoch 026: 1545 / 1689 loss=3.994, nll_loss=2.356, ppl=5.12, wps=461041, ups=1.06, wpb=433009, bsz=16430.8, num_updates=43700, lr=0.000302545, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=43110 epoch 026: 1545 / 1689 loss=3.994, nll_loss=2.356, ppl=5.12, wps=461041, ups=1.06, wpb=433009, bsz=16430.8, num_updates=43700, lr=0.000302545, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=43110 epoch 026: 1545 / 1689 loss=3.994, nll_loss=2.356, ppl=5.12, wps=461041, ups=1.06, wpb=433009, bsz=16430.8, num_updates=43700, lr=0.000302545, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=43110 epoch 026: 1545 / 1689 loss=3.994, nll_loss=2.356, ppl=5.12, wps=461041, ups=1.06, wpb=433009, bsz=16430.8, num_updates=43700, lr=0.000302545, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=43110 epoch 026: 1545 / 1689 loss=3.994, nll_loss=2.356, ppl=5.12, wps=461041, ups=1.06, wpb=433009, bsz=16430.8, num_updates=43700, lr=0.000302545, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=43110 epoch 026: 1545 / 1689 loss=3.994, nll_loss=2.356, ppl=5.12, wps=461041, ups=1.06, wpb=433009, bsz=16430.8, num_updates=43700, lr=0.000302545, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=43110 epoch 026: 1545 / 1689 loss=3.994, nll_loss=2.356, ppl=5.12, wps=461041, ups=1.06, wpb=433009, bsz=16430.8, num_updates=43700, lr=0.000302545, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=43110 epoch 026: 1545 / 1689 loss=3.994, nll_loss=2.356, ppl=5.12, wps=461041, ups=1.06, wpb=433009, bsz=16430.8, num_updates=43700, lr=0.000302545, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=43110 epoch 026: 1545 / 1689 loss=3.994, nll_loss=2.356, ppl=5.12, wps=461041, ups=1.06, wpb=433009, bsz=16430.8, num_updates=43700, lr=0.000302545, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=43110 epoch 026: 1545 / 1689 loss=3.994, nll_loss=2.356, ppl=5.12, wps=461041, ups=1.06, wpb=433009, bsz=16430.8, num_updates=43700, lr=0.000302545, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=43110 epoch 026: 1545 / 1689 loss=3.994, nll_loss=2.356, ppl=5.12, wps=461041, ups=1.06, wpb=433009, bsz=16430.8, num_updates=43700, lr=0.000302545, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=43110 epoch 026: 1545 / 1689 loss=3.994, nll_loss=2.356, ppl=5.12, wps=461041, ups=1.06, wpb=433009, bsz=16430.8, num_updates=43700, lr=0.000302545, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=43110 epoch 026: 1545 / 1689 loss=3.994, nll_loss=2.356, ppl=5.12, wps=461041, ups=1.06, wpb=433009, bsz=16430.8, num_updates=43700, lr=0.000302545, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=43110 epoch 026: 1545 / 1689 loss=3.994, nll_loss=2.356, ppl=5.12, wps=461041, ups=1.06, wpb=433009, bsz=16430.8, num_updates=43700, lr=0.000302545, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=43110 epoch 026: 1545 / 1689 loss=3.994, nll_loss=2.356, ppl=5.12, wps=461041, ups=1.06, wpb=433009, bsz=16430.8, num_updates=43700, lr=0.000302545, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=43110 epoch 026: 1545 / 1689 loss=3.994, nll_loss=2.356, ppl=5.12, wps=461041, ups=1.06, wpb=433009, bsz=16430.8, num_updates=43700, lr=0.000302545, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=43110 epoch 026: 1545 / 1689 loss=3.994, nll_loss=2.356, ppl=5.12, wps=461041, ups=1.06, wpb=433009, bsz=16430.8, num_updates=43700, lr=0.000302545, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=43110 epoch 026: 1545 / 1689 loss=3.994, nll_loss=2.356, ppl=5.12, wps=461041, ups=1.06, wpb=433009, bsz=16430.8, num_updates=43700, lr=0.000302545, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=43110 epoch 026: 1545 / 1689 loss=3.994, nll_loss=2.356, ppl=5.12, wps=461041, ups=1.06, wpb=433009, bsz=16430.8, num_updates=43700, lr=0.000302545, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=43110 epoch 026: 1545 / 1689 loss=3.994, nll_loss=2.356, ppl=5.12, wps=461041, ups=1.06, wpb=433009, bsz=16430.8, num_updates=43700, lr=0.000302545, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=43110 epoch 026: 1545 / 1689 loss=3.994, nll_loss=2.356, ppl=5.12, wps=461041, ups=1.06, wpb=433009, bsz=16430.8, num_updates=43700, lr=0.000302545, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=43110 epoch 026: 1545 / 1689 loss=3.994, nll_loss=2.356, ppl=5.12, wps=461041, ups=1.06, wpb=433009, bsz=16430.8, num_updates=43700, lr=0.000302545, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=43110 epoch 026: 1545 / 1689 loss=3.994, nll_loss=2.356, ppl=5.12, wps=461041, ups=1.06, wpb=433009, bsz=16430.8, num_updates=43700, lr=0.000302545, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=43110 epoch 026: 1545 / 1689 loss=3.994, nll_loss=2.356, ppl=5.12, wps=461041, ups=1.06, wpb=433009, bsz=16430.8, num_updates=43700, lr=0.000302545, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=43110 epoch 026: 1545 / 1689 loss=3.994, nll_loss=2.356, ppl=5.12, wps=461041, ups=1.06, wpb=433009, bsz=16430.8, num_updates=43700, lr=0.000302545, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=43110 epoch 026: 1545 / 1689 loss=3.994, nll_loss=2.356, ppl=5.12, wps=461041, ups=1.06, wpb=433009, bsz=16430.8, num_updates=43700, lr=0.000302545, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.3, wall=43110 epoch 026: 1645 / 1689 loss=3.985, nll_loss=2.347, ppl=5.09, wps=457826, ups=1.06, wpb=433250, bsz=16377.1, num_updates=43800, lr=0.000302199, gnorm=0.2, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=43205 epoch 026: 1645 / 1689 loss=3.985, nll_loss=2.347, ppl=5.09, wps=457826, ups=1.06, wpb=433250, bsz=16377.1, num_updates=43800, lr=0.000302199, gnorm=0.2, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=43205 epoch 026: 1645 / 1689 loss=3.985, nll_loss=2.347, ppl=5.09, wps=457826, ups=1.06, wpb=433250, bsz=16377.1, num_updates=43800, lr=0.000302199, gnorm=0.2, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=43205 epoch 026: 1645 / 1689 loss=3.985, nll_loss=2.347, ppl=5.09, wps=457826, ups=1.06, wpb=433250, bsz=16377.1, num_updates=43800, lr=0.000302199, gnorm=0.2, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=43205 epoch 026: 1645 / 1689 loss=3.985, nll_loss=2.347, ppl=5.09, wps=457826, ups=1.06, wpb=433250, bsz=16377.1, num_updates=43800, lr=0.000302199, gnorm=0.2, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=43205 epoch 026: 1645 / 1689 loss=3.985, nll_loss=2.347, ppl=5.09, wps=457826, ups=1.06, wpb=433250, bsz=16377.1, num_updates=43800, lr=0.000302199, gnorm=0.2, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=43205 epoch 026: 1645 / 1689 loss=3.985, nll_loss=2.347, ppl=5.09, wps=457826, ups=1.06, wpb=433250, bsz=16377.1, num_updates=43800, lr=0.000302199, gnorm=0.2, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=43205 epoch 026: 1645 / 1689 loss=3.985, nll_loss=2.347, ppl=5.09, wps=457826, ups=1.06, wpb=433250, bsz=16377.1, num_updates=43800, lr=0.000302199, gnorm=0.2, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=43205 epoch 026: 1645 / 1689 loss=3.985, nll_loss=2.347, ppl=5.09, wps=457826, ups=1.06, wpb=433250, bsz=16377.1, num_updates=43800, lr=0.000302199, gnorm=0.2, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=43205 epoch 026: 1645 / 1689 loss=3.985, nll_loss=2.347, ppl=5.09, wps=457826, ups=1.06, wpb=433250, bsz=16377.1, num_updates=43800, lr=0.000302199, gnorm=0.2, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=43205 epoch 026: 1645 / 1689 loss=3.985, nll_loss=2.347, ppl=5.09, wps=457826, ups=1.06, wpb=433250, bsz=16377.1, num_updates=43800, lr=0.000302199, gnorm=0.2, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=43205 epoch 026: 1645 / 1689 loss=3.985, nll_loss=2.347, ppl=5.09, wps=457826, ups=1.06, wpb=433250, bsz=16377.1, num_updates=43800, lr=0.000302199, gnorm=0.2, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=43205 epoch 026: 1645 / 1689 loss=3.985, nll_loss=2.347, ppl=5.09, wps=457826, ups=1.06, wpb=433250, bsz=16377.1, num_updates=43800, lr=0.000302199, gnorm=0.2, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=43205 epoch 026: 1645 / 1689 loss=3.985, nll_loss=2.347, ppl=5.09, wps=457826, ups=1.06, wpb=433250, bsz=16377.1, num_updates=43800, lr=0.000302199, gnorm=0.2, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=43205 epoch 026: 1645 / 1689 loss=3.985, nll_loss=2.347, ppl=5.09, wps=457826, ups=1.06, wpb=433250, bsz=16377.1, num_updates=43800, lr=0.000302199, gnorm=0.2, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=43205 epoch 026: 1645 / 1689 loss=3.985, nll_loss=2.347, ppl=5.09, wps=457826, ups=1.06, wpb=433250, bsz=16377.1, num_updates=43800, lr=0.000302199, gnorm=0.2, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=43205 epoch 026: 1645 / 1689 loss=3.985, nll_loss=2.347, ppl=5.09, wps=457826, ups=1.06, wpb=433250, bsz=16377.1, num_updates=43800, lr=0.000302199, gnorm=0.2, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=43205 epoch 026: 1645 / 1689 loss=3.985, nll_loss=2.347, ppl=5.09, wps=457826, ups=1.06, wpb=433250, bsz=16377.1, num_updates=43800, lr=0.000302199, gnorm=0.2, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=43205 epoch 026: 1645 / 1689 loss=3.985, nll_loss=2.347, ppl=5.09, wps=457826, ups=1.06, wpb=433250, bsz=16377.1, num_updates=43800, lr=0.000302199, gnorm=0.2, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=43205 epoch 026: 1645 / 1689 loss=3.985, nll_loss=2.347, ppl=5.09, wps=457826, ups=1.06, wpb=433250, bsz=16377.1, num_updates=43800, lr=0.000302199, gnorm=0.2, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=43205 epoch 026: 1645 / 1689 loss=3.985, nll_loss=2.347, ppl=5.09, wps=457826, ups=1.06, wpb=433250, bsz=16377.1, num_updates=43800, lr=0.000302199, gnorm=0.2, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=43205 epoch 026: 1645 / 1689 loss=3.985, nll_loss=2.347, ppl=5.09, wps=457826, ups=1.06, wpb=433250, bsz=16377.1, num_updates=43800, lr=0.000302199, gnorm=0.2, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=43205 epoch 026: 1645 / 1689 loss=3.985, nll_loss=2.347, ppl=5.09, wps=457826, ups=1.06, wpb=433250, bsz=16377.1, num_updates=43800, lr=0.000302199, gnorm=0.2, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=43205 epoch 026: 1645 / 1689 loss=3.985, nll_loss=2.347, ppl=5.09, wps=457826, ups=1.06, wpb=433250, bsz=16377.1, num_updates=43800, lr=0.000302199, gnorm=0.2, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=43205 epoch 026: 1645 / 1689 loss=3.985, nll_loss=2.347, ppl=5.09, wps=457826, ups=1.06, wpb=433250, bsz=16377.1, num_updates=43800, lr=0.000302199, gnorm=0.2, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=43205 epoch 026: 1645 / 1689 loss=3.985, nll_loss=2.347, ppl=5.09, wps=457826, ups=1.06, wpb=433250, bsz=16377.1, num_updates=43800, lr=0.000302199, gnorm=0.2, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=43205 end of epoch 26 (average epoch stats below) epoch 026 | loss 3.982 | nll_loss 2.343 | ppl 5.07 | wps 455941 | ups 1.05 | wpb 433530 | bsz 16505 | num_updates 43844 | lr 0.000302047 | gnorm 0.214 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 19.2 | wall 43246 epoch 026 | loss 3.982 | nll_loss 2.343 | ppl 5.07 | wps 455941 | ups 1.05 | wpb 433530 | bsz 16505 | num_updates 43844 | lr 0.000302047 | gnorm 0.214 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 19.2 | wall 43246 epoch 026 | loss 3.982 | nll_loss 2.343 | ppl 5.07 | wps 455941 | ups 1.05 | wpb 433530 | bsz 16505 | num_updates 43844 | lr 0.000302047 | gnorm 0.214 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 19.2 | wall 43246 epoch 026 | loss 3.982 | nll_loss 2.343 | ppl 5.07 | wps 455941 | ups 1.05 | wpb 433530 | bsz 16505 | num_updates 43844 | lr 0.000302047 | gnorm 0.214 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 19.2 | wall 43246 epoch 026 | loss 3.982 | nll_loss 2.343 | ppl 5.07 | wps 455941 | ups 1.05 | wpb 433530 | bsz 16505 | num_updates 43844 | lr 0.000302047 | gnorm 0.214 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 19.2 | wall 43246 epoch 026 | loss 3.982 | nll_loss 2.343 | ppl 5.07 | wps 455941 | ups 1.05 | wpb 433530 | bsz 16505 | num_updates 43844 | lr 0.000302047 | gnorm 0.214 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 19.2 | wall 43246 epoch 026 | loss 3.982 | nll_loss 2.343 | ppl 5.07 | wps 455941 | ups 1.05 | wpb 433530 | bsz 16505 | num_updates 43844 | lr 0.000302047 | gnorm 0.214 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 19.2 | wall 43246 epoch 026 | loss 3.982 | nll_loss 2.343 | ppl 5.07 | wps 455941 | ups 1.05 | wpb 433530 | bsz 16505 | num_updates 43844 | lr 0.000302047 | gnorm 0.214 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 19.2 | wall 43246 epoch 026 | loss 3.982 | nll_loss 2.343 | ppl 5.07 | wps 455941 | ups 1.05 | wpb 433530 | bsz 16505 | num_updates 43844 | lr 0.000302047 | gnorm 0.214 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 19.2 | wall 43246 epoch 026 | loss 3.982 | nll_loss 2.343 | ppl 5.07 | wps 455941 | ups 1.05 | wpb 433530 | bsz 16505 | num_updates 43844 | lr 0.000302047 | gnorm 0.214 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 19.2 | wall 43246 epoch 026 | loss 3.982 | nll_loss 2.343 | ppl 5.07 | wps 455941 | ups 1.05 | wpb 433530 | bsz 16505 | num_updates 43844 | lr 0.000302047 | gnorm 0.214 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 19.2 | wall 43246 epoch 026 | loss 3.982 | nll_loss 2.343 | ppl 5.07 | wps 455941 | ups 1.05 | wpb 433530 | bsz 16505 | num_updates 43844 | lr 0.000302047 | gnorm 0.214 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 19.2 | wall 43246 epoch 026 | loss 3.982 | nll_loss 2.343 | ppl 5.07 | wps 455941 | ups 1.05 | wpb 433530 | bsz 16505 | num_updates 43844 | lr 0.000302047 | gnorm 0.214 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 19.2 | wall 43246 epoch 026 | loss 3.982 | nll_loss 2.343 | ppl 5.07 | wps 455941 | ups 1.05 | wpb 433530 | bsz 16505 | num_updates 43844 | lr 0.000302047 | gnorm 0.214 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 19.2 | wall 43246 epoch 026 | loss 3.982 | nll_loss 2.343 | ppl 5.07 | wps 455941 | ups 1.05 | wpb 433530 | bsz 16505 | num_updates 43844 | lr 0.000302047 | gnorm 0.214 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 19.2 | wall 43246 epoch 026 | loss 3.982 | nll_loss 2.343 | ppl 5.07 | wps 455941 | ups 1.05 | wpb 433530 | bsz 16505 | num_updates 43844 | lr 0.000302047 | gnorm 0.214 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 19.2 | wall 43246 epoch 026 | loss 3.982 | nll_loss 2.343 | ppl 5.07 | wps 455941 | ups 1.05 | wpb 433530 | bsz 16505 | num_updates 43844 | lr 0.000302047 | gnorm 0.214 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 19.2 | wall 43246 epoch 026 | loss 3.982 | nll_loss 2.343 | ppl 5.07 | wps 455941 | ups 1.05 | wpb 433530 | bsz 16505 | num_updates 43844 | lr 0.000302047 | gnorm 0.214 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 19.2 | wall 43246 epoch 026 | loss 3.982 | nll_loss 2.343 | ppl 5.07 | wps 455941 | ups 1.05 | wpb 433530 | bsz 16505 | num_updates 43844 | lr 0.000302047 | gnorm 0.214 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 19.2 | wall 43246 epoch 026 | loss 3.982 | nll_loss 2.343 | ppl 5.07 | wps 455941 | ups 1.05 | wpb 433530 | bsz 16505 | num_updates 43844 | lr 0.000302047 | gnorm 0.214 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 19.2 | wall 43246 epoch 026 | loss 3.982 | nll_loss 2.343 | ppl 5.07 | wps 455941 | ups 1.05 | wpb 433530 | bsz 16505 | num_updates 43844 | lr 0.000302047 | gnorm 0.214 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 19.2 | wall 43246 epoch 026 | loss 3.982 | nll_loss 2.343 | ppl 5.07 | wps 455941 | ups 1.05 | wpb 433530 | bsz 16505 | num_updates 43844 | lr 0.000302047 | gnorm 0.214 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 19.2 | wall 43246 epoch 026 | loss 3.982 | nll_loss 2.343 | ppl 5.07 | wps 455941 | ups 1.05 | wpb 433530 | bsz 16505 | num_updates 43844 | lr 0.000302047 | gnorm 0.214 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 19.2 | wall 43246 epoch 026 | loss 3.982 | nll_loss 2.343 | ppl 5.07 | wps 455941 | ups 1.05 | wpb 433530 | bsz 16505 | num_updates 43844 | lr 0.000302047 | gnorm 0.214 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 19.2 | wall 43246 epoch 026 | loss 3.982 | nll_loss 2.343 | ppl 5.07 | wps 455941 | ups 1.05 | wpb 433530 | bsz 16505 | num_updates 43844 | lr 0.000302047 | gnorm 0.214 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 19.2 | wall 43246 epoch 026 | loss 3.982 | nll_loss 2.343 | ppl 5.07 | wps 455941 | ups 1.05 | wpb 433530 | bsz 16505 | num_updates 43844 | lr 0.000302047 | gnorm 0.214 | clip 0 | loss_scale 1 | train_wall 1566 | gb_free 19.2 | wall 43246 Start iterating over samples epoch 027: 57 / 1689 loss=3.966, nll_loss=2.324, ppl=5.01, wps=450810, ups=1.05, wpb=430248, bsz=16047.4, num_updates=43900, lr=0.000301855, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=43300 epoch 027: 57 / 1689 loss=3.966, nll_loss=2.324, ppl=5.01, wps=450810, ups=1.05, wpb=430248, bsz=16047.4, num_updates=43900, lr=0.000301855, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=43300 epoch 027: 57 / 1689 loss=3.966, nll_loss=2.324, ppl=5.01, wps=450810, ups=1.05, wpb=430248, bsz=16047.4, num_updates=43900, lr=0.000301855, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=43300 epoch 027: 57 / 1689 loss=3.966, nll_loss=2.324, ppl=5.01, wps=450810, ups=1.05, wpb=430248, bsz=16047.4, num_updates=43900, lr=0.000301855, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=43300 epoch 027: 57 / 1689 loss=3.966, nll_loss=2.324, ppl=5.01, wps=450810, ups=1.05, wpb=430248, bsz=16047.4, num_updates=43900, lr=0.000301855, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=43300 epoch 027: 57 / 1689 loss=3.966, nll_loss=2.324, ppl=5.01, wps=450810, ups=1.05, wpb=430248, bsz=16047.4, num_updates=43900, lr=0.000301855, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=43300 epoch 027: 57 / 1689 loss=3.966, nll_loss=2.324, ppl=5.01, wps=450810, ups=1.05, wpb=430248, bsz=16047.4, num_updates=43900, lr=0.000301855, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=43300 epoch 027: 57 / 1689 loss=3.966, nll_loss=2.324, ppl=5.01, wps=450810, ups=1.05, wpb=430248, bsz=16047.4, num_updates=43900, lr=0.000301855, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=43300 epoch 027: 57 / 1689 loss=3.966, nll_loss=2.324, ppl=5.01, wps=450810, ups=1.05, wpb=430248, bsz=16047.4, num_updates=43900, lr=0.000301855, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=43300 epoch 027: 57 / 1689 loss=3.966, nll_loss=2.324, ppl=5.01, wps=450810, ups=1.05, wpb=430248, bsz=16047.4, num_updates=43900, lr=0.000301855, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=43300 epoch 027: 57 / 1689 loss=3.966, nll_loss=2.324, ppl=5.01, wps=450810, ups=1.05, wpb=430248, bsz=16047.4, num_updates=43900, lr=0.000301855, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=43300 epoch 027: 57 / 1689 loss=3.966, nll_loss=2.324, ppl=5.01, wps=450810, ups=1.05, wpb=430248, bsz=16047.4, num_updates=43900, lr=0.000301855, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=43300 epoch 027: 57 / 1689 loss=3.966, nll_loss=2.324, ppl=5.01, wps=450810, ups=1.05, wpb=430248, bsz=16047.4, num_updates=43900, lr=0.000301855, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=43300 epoch 027: 57 / 1689 loss=3.966, nll_loss=2.324, ppl=5.01, wps=450810, ups=1.05, wpb=430248, bsz=16047.4, num_updates=43900, lr=0.000301855, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=43300 epoch 027: 57 / 1689 loss=3.966, nll_loss=2.324, ppl=5.01, wps=450810, ups=1.05, wpb=430248, bsz=16047.4, num_updates=43900, lr=0.000301855, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=43300 epoch 027: 57 / 1689 loss=3.966, nll_loss=2.324, ppl=5.01, wps=450810, ups=1.05, wpb=430248, bsz=16047.4, num_updates=43900, lr=0.000301855, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=43300 epoch 027: 57 / 1689 loss=3.966, nll_loss=2.324, ppl=5.01, wps=450810, ups=1.05, wpb=430248, bsz=16047.4, num_updates=43900, lr=0.000301855, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=43300 epoch 027: 57 / 1689 loss=3.966, nll_loss=2.324, ppl=5.01, wps=450810, ups=1.05, wpb=430248, bsz=16047.4, num_updates=43900, lr=0.000301855, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=43300 epoch 027: 57 / 1689 loss=3.966, nll_loss=2.324, ppl=5.01, wps=450810, ups=1.05, wpb=430248, bsz=16047.4, num_updates=43900, lr=0.000301855, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=43300 epoch 027: 57 / 1689 loss=3.966, nll_loss=2.324, ppl=5.01, wps=450810, ups=1.05, wpb=430248, bsz=16047.4, num_updates=43900, lr=0.000301855, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=43300 epoch 027: 57 / 1689 loss=3.966, nll_loss=2.324, ppl=5.01, wps=450810, ups=1.05, wpb=430248, bsz=16047.4, num_updates=43900, lr=0.000301855, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=43300 epoch 027: 57 / 1689 loss=3.966, nll_loss=2.324, ppl=5.01, wps=450810, ups=1.05, wpb=430248, bsz=16047.4, num_updates=43900, lr=0.000301855, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=43300 epoch 027: 57 / 1689 loss=3.966, nll_loss=2.324, ppl=5.01, wps=450810, ups=1.05, wpb=430248, bsz=16047.4, num_updates=43900, lr=0.000301855, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=43300 epoch 027: 57 / 1689 loss=3.966, nll_loss=2.324, ppl=5.01, wps=450810, ups=1.05, wpb=430248, bsz=16047.4, num_updates=43900, lr=0.000301855, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=43300 epoch 027: 57 / 1689 loss=3.966, nll_loss=2.324, ppl=5.01, wps=450810, ups=1.05, wpb=430248, bsz=16047.4, num_updates=43900, lr=0.000301855, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=43300 epoch 027: 57 / 1689 loss=3.966, nll_loss=2.324, ppl=5.01, wps=450810, ups=1.05, wpb=430248, bsz=16047.4, num_updates=43900, lr=0.000301855, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=43300 epoch 027: 57 / 1689 loss=3.966, nll_loss=2.324, ppl=5.01, wps=450810, ups=1.05, wpb=430248, bsz=16047.4, num_updates=43900, lr=0.000301855, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=43300 epoch 027: 157 / 1689 loss=3.965, nll_loss=2.322, ppl=5, wps=457621, ups=1.05, wpb=433954, bsz=16704.1, num_updates=44000, lr=0.000301511, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=43395 epoch 027: 157 / 1689 loss=3.965, nll_loss=2.322, ppl=5, wps=457621, ups=1.05, wpb=433954, bsz=16704.1, num_updates=44000, lr=0.000301511, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=43395 epoch 027: 157 / 1689 loss=3.965, nll_loss=2.322, ppl=5, wps=457621, ups=1.05, wpb=433954, bsz=16704.1, num_updates=44000, lr=0.000301511, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=43395 epoch 027: 157 / 1689 loss=3.965, nll_loss=2.322, ppl=5, wps=457621, ups=1.05, wpb=433954, bsz=16704.1, num_updates=44000, lr=0.000301511, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=43395 epoch 027: 157 / 1689 loss=3.965, nll_loss=2.322, ppl=5, wps=457621, ups=1.05, wpb=433954, bsz=16704.1, num_updates=44000, lr=0.000301511, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=43395 epoch 027: 157 / 1689 loss=3.965, nll_loss=2.322, ppl=5, wps=457621, ups=1.05, wpb=433954, bsz=16704.1, num_updates=44000, lr=0.000301511, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=43395 epoch 027: 157 / 1689 loss=3.965, nll_loss=2.322, ppl=5, wps=457621, ups=1.05, wpb=433954, bsz=16704.1, num_updates=44000, lr=0.000301511, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=43395 epoch 027: 157 / 1689 loss=3.965, nll_loss=2.322, ppl=5, wps=457621, ups=1.05, wpb=433954, bsz=16704.1, num_updates=44000, lr=0.000301511, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=43395 epoch 027: 157 / 1689 loss=3.965, nll_loss=2.322, ppl=5, wps=457621, ups=1.05, wpb=433954, bsz=16704.1, num_updates=44000, lr=0.000301511, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=43395 epoch 027: 157 / 1689 loss=3.965, nll_loss=2.322, ppl=5, wps=457621, ups=1.05, wpb=433954, bsz=16704.1, num_updates=44000, lr=0.000301511, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=43395 epoch 027: 157 / 1689 loss=3.965, nll_loss=2.322, ppl=5, wps=457621, ups=1.05, wpb=433954, bsz=16704.1, num_updates=44000, lr=0.000301511, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=43395 epoch 027: 157 / 1689 loss=3.965, nll_loss=2.322, ppl=5, wps=457621, ups=1.05, wpb=433954, bsz=16704.1, num_updates=44000, lr=0.000301511, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=43395 epoch 027: 157 / 1689 loss=3.965, nll_loss=2.322, ppl=5, wps=457621, ups=1.05, wpb=433954, bsz=16704.1, num_updates=44000, lr=0.000301511, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=43395 epoch 027: 157 / 1689 loss=3.965, nll_loss=2.322, ppl=5, wps=457621, ups=1.05, wpb=433954, bsz=16704.1, num_updates=44000, lr=0.000301511, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=43395 epoch 027: 157 / 1689 loss=3.965, nll_loss=2.322, ppl=5, wps=457621, ups=1.05, wpb=433954, bsz=16704.1, num_updates=44000, lr=0.000301511, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=43395 epoch 027: 157 / 1689 loss=3.965, nll_loss=2.322, ppl=5, wps=457621, ups=1.05, wpb=433954, bsz=16704.1, num_updates=44000, lr=0.000301511, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=43395 epoch 027: 157 / 1689 loss=3.965, nll_loss=2.322, ppl=5, wps=457621, ups=1.05, wpb=433954, bsz=16704.1, num_updates=44000, lr=0.000301511, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=43395 epoch 027: 157 / 1689 loss=3.965, nll_loss=2.322, ppl=5, wps=457621, ups=1.05, wpb=433954, bsz=16704.1, num_updates=44000, lr=0.000301511, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=43395 epoch 027: 157 / 1689 loss=3.965, nll_loss=2.322, ppl=5, wps=457621, ups=1.05, wpb=433954, bsz=16704.1, num_updates=44000, lr=0.000301511, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=43395 epoch 027: 157 / 1689 loss=3.965, nll_loss=2.322, ppl=5, wps=457621, ups=1.05, wpb=433954, bsz=16704.1, num_updates=44000, lr=0.000301511, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=43395 epoch 027: 157 / 1689 loss=3.965, nll_loss=2.322, ppl=5, wps=457621, ups=1.05, wpb=433954, bsz=16704.1, num_updates=44000, lr=0.000301511, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=43395 epoch 027: 157 / 1689 loss=3.965, nll_loss=2.322, ppl=5, wps=457621, ups=1.05, wpb=433954, bsz=16704.1, num_updates=44000, lr=0.000301511, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=43395 epoch 027: 157 / 1689 loss=3.965, nll_loss=2.322, ppl=5, wps=457621, ups=1.05, wpb=433954, bsz=16704.1, num_updates=44000, lr=0.000301511, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=43395 epoch 027: 157 / 1689 loss=3.965, nll_loss=2.322, ppl=5, wps=457621, ups=1.05, wpb=433954, bsz=16704.1, num_updates=44000, lr=0.000301511, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=43395 epoch 027: 157 / 1689 loss=3.965, nll_loss=2.322, ppl=5, wps=457621, ups=1.05, wpb=433954, bsz=16704.1, num_updates=44000, lr=0.000301511, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=43395 epoch 027: 157 / 1689 loss=3.965, nll_loss=2.322, ppl=5, wps=457621, ups=1.05, wpb=433954, bsz=16704.1, num_updates=44000, lr=0.000301511, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=43395 epoch 027: 157 / 1689 loss=3.965, nll_loss=2.322, ppl=5, wps=457621, ups=1.05, wpb=433954, bsz=16704.1, num_updates=44000, lr=0.000301511, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=43395 begin validation on "valid" subset epoch 027 | valid on 'valid' subset | loss 4.247 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.245 epoch 027 | valid on 'valid' subset | loss 4.247 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.245 epoch 027 | valid on 'valid' subset | loss 4.247 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.245 epoch 027 | valid on 'valid' subset | loss 4.247 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.245 epoch 027 | valid on 'valid' subset | loss 4.247 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.245 epoch 027 | valid on 'valid' subset | loss 4.247 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.245 epoch 027 | valid on 'valid' subset | loss 4.247 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.245 epoch 027 | valid on 'valid' subset | loss 4.247 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.245 epoch 027 | valid on 'valid' subset | loss 4.247 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.245 epoch 027 | valid on 'valid' subset | loss 4.247 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.245 epoch 027 | valid on 'valid' subset | loss 4.247 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.245 epoch 027 | valid on 'valid' subset | loss 4.247 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.245 epoch 027 | valid on 'valid' subset | loss 4.247 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.245 epoch 027 | valid on 'valid' subset | loss 4.247 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.245 epoch 027 | valid on 'valid' subset | loss 4.247 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.245 epoch 027 | valid on 'valid' subset | loss 4.247 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.245 epoch 027 | valid on 'valid' subset | loss 4.247 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.245 epoch 027 | valid on 'valid' subset | loss 4.247 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.245 epoch 027 | valid on 'valid' subset | loss 4.247 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.245 epoch 027 | valid on 'valid' subset | loss 4.247 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.245 epoch 027 | valid on 'valid' subset | loss 4.247 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.245 epoch 027 | valid on 'valid' subset | loss 4.247 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.245 epoch 027 | valid on 'valid' subset | loss 4.247 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.245 epoch 027 | valid on 'valid' subset | loss 4.247 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.245 epoch 027 | valid on 'valid' subset | loss 4.247 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.245 epoch 027 | valid on 'valid' subset | loss 4.247 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.245 epoch 027 | valid on 'valid' subset | loss 4.247 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 44000 | best_loss 4.245 epoch 027: 257 / 1689 loss=3.956, nll_loss=2.312, ppl=4.97, wps=378003, ups=0.87, wpb=433854, bsz=16072.9, num_updates=44100, lr=0.000301169, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=98, gb_free=18.8, wall=43510 epoch 027: 257 / 1689 loss=3.956, nll_loss=2.312, ppl=4.97, wps=378003, ups=0.87, wpb=433854, bsz=16072.9, num_updates=44100, lr=0.000301169, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=98, gb_free=18.8, wall=43510 epoch 027: 257 / 1689 loss=3.956, nll_loss=2.312, ppl=4.97, wps=378003, ups=0.87, wpb=433854, bsz=16072.9, num_updates=44100, lr=0.000301169, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=98, gb_free=18.8, wall=43510 epoch 027: 257 / 1689 loss=3.956, nll_loss=2.312, ppl=4.97, wps=378003, ups=0.87, wpb=433854, bsz=16072.9, num_updates=44100, lr=0.000301169, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=98, gb_free=18.8, wall=43510 epoch 027: 257 / 1689 loss=3.956, nll_loss=2.312, ppl=4.97, wps=378003, ups=0.87, wpb=433854, bsz=16072.9, num_updates=44100, lr=0.000301169, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=98, gb_free=18.8, wall=43510 epoch 027: 257 / 1689 loss=3.956, nll_loss=2.312, ppl=4.97, wps=378003, ups=0.87, wpb=433854, bsz=16072.9, num_updates=44100, lr=0.000301169, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=98, gb_free=18.8, wall=43510 epoch 027: 257 / 1689 loss=3.956, nll_loss=2.312, ppl=4.97, wps=378003, ups=0.87, wpb=433854, bsz=16072.9, num_updates=44100, lr=0.000301169, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=98, gb_free=18.8, wall=43510 epoch 027: 257 / 1689 loss=3.956, nll_loss=2.312, ppl=4.97, wps=378003, ups=0.87, wpb=433854, bsz=16072.9, num_updates=44100, lr=0.000301169, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=98, gb_free=18.8, wall=43510 epoch 027: 257 / 1689 loss=3.956, nll_loss=2.312, ppl=4.97, wps=378003, ups=0.87, wpb=433854, bsz=16072.9, num_updates=44100, lr=0.000301169, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=98, gb_free=18.8, wall=43510 epoch 027: 257 / 1689 loss=3.956, nll_loss=2.312, ppl=4.97, wps=378003, ups=0.87, wpb=433854, bsz=16072.9, num_updates=44100, lr=0.000301169, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=98, gb_free=18.8, wall=43510 epoch 027: 257 / 1689 loss=3.956, nll_loss=2.312, ppl=4.97, wps=378003, ups=0.87, wpb=433854, bsz=16072.9, num_updates=44100, lr=0.000301169, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=98, gb_free=18.8, wall=43510 epoch 027: 257 / 1689 loss=3.956, nll_loss=2.312, ppl=4.97, wps=378003, ups=0.87, wpb=433854, bsz=16072.9, num_updates=44100, lr=0.000301169, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=98, gb_free=18.8, wall=43510 epoch 027: 257 / 1689 loss=3.956, nll_loss=2.312, ppl=4.97, wps=378003, ups=0.87, wpb=433854, bsz=16072.9, num_updates=44100, lr=0.000301169, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=98, gb_free=18.8, wall=43510 epoch 027: 257 / 1689 loss=3.956, nll_loss=2.312, ppl=4.97, wps=378003, ups=0.87, wpb=433854, bsz=16072.9, num_updates=44100, lr=0.000301169, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=98, gb_free=18.8, wall=43510 epoch 027: 257 / 1689 loss=3.956, nll_loss=2.312, ppl=4.97, wps=378003, ups=0.87, wpb=433854, bsz=16072.9, num_updates=44100, lr=0.000301169, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=98, gb_free=18.8, wall=43510 epoch 027: 257 / 1689 loss=3.956, nll_loss=2.312, ppl=4.97, wps=378003, ups=0.87, wpb=433854, bsz=16072.9, num_updates=44100, lr=0.000301169, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=98, gb_free=18.8, wall=43510 epoch 027: 257 / 1689 loss=3.956, nll_loss=2.312, ppl=4.97, wps=378003, ups=0.87, wpb=433854, bsz=16072.9, num_updates=44100, lr=0.000301169, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=98, gb_free=18.8, wall=43510 epoch 027: 257 / 1689 loss=3.956, nll_loss=2.312, ppl=4.97, wps=378003, ups=0.87, wpb=433854, bsz=16072.9, num_updates=44100, lr=0.000301169, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=98, gb_free=18.8, wall=43510 epoch 027: 257 / 1689 loss=3.956, nll_loss=2.312, ppl=4.97, wps=378003, ups=0.87, wpb=433854, bsz=16072.9, num_updates=44100, lr=0.000301169, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=98, gb_free=18.8, wall=43510 epoch 027: 257 / 1689 loss=3.956, nll_loss=2.312, ppl=4.97, wps=378003, ups=0.87, wpb=433854, bsz=16072.9, num_updates=44100, lr=0.000301169, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=98, gb_free=18.8, wall=43510 epoch 027: 257 / 1689 loss=3.956, nll_loss=2.312, ppl=4.97, wps=378003, ups=0.87, wpb=433854, bsz=16072.9, num_updates=44100, lr=0.000301169, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=98, gb_free=18.8, wall=43510 epoch 027: 257 / 1689 loss=3.956, nll_loss=2.312, ppl=4.97, wps=378003, ups=0.87, wpb=433854, bsz=16072.9, num_updates=44100, lr=0.000301169, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=98, gb_free=18.8, wall=43510 epoch 027: 257 / 1689 loss=3.956, nll_loss=2.312, ppl=4.97, wps=378003, ups=0.87, wpb=433854, bsz=16072.9, num_updates=44100, lr=0.000301169, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=98, gb_free=18.8, wall=43510 epoch 027: 257 / 1689 loss=3.956, nll_loss=2.312, ppl=4.97, wps=378003, ups=0.87, wpb=433854, bsz=16072.9, num_updates=44100, lr=0.000301169, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=98, gb_free=18.8, wall=43510 epoch 027: 257 / 1689 loss=3.956, nll_loss=2.312, ppl=4.97, wps=378003, ups=0.87, wpb=433854, bsz=16072.9, num_updates=44100, lr=0.000301169, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=98, gb_free=18.8, wall=43510 epoch 027: 257 / 1689 loss=3.956, nll_loss=2.312, ppl=4.97, wps=378003, ups=0.87, wpb=433854, bsz=16072.9, num_updates=44100, lr=0.000301169, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=98, gb_free=18.8, wall=43510 epoch 027: 257 / 1689 loss=3.956, nll_loss=2.312, ppl=4.97, wps=378003, ups=0.87, wpb=433854, bsz=16072.9, num_updates=44100, lr=0.000301169, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=98, gb_free=18.8, wall=43510 epoch 027: 357 / 1689 loss=3.958, nll_loss=2.315, ppl=4.98, wps=460116, ups=1.06, wpb=434549, bsz=16439.1, num_updates=44200, lr=0.000300828, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=43604 epoch 027: 357 / 1689 loss=3.958, nll_loss=2.315, ppl=4.98, wps=460116, ups=1.06, wpb=434549, bsz=16439.1, num_updates=44200, lr=0.000300828, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=43604 epoch 027: 357 / 1689 loss=3.958, nll_loss=2.315, ppl=4.98, wps=460116, ups=1.06, wpb=434549, bsz=16439.1, num_updates=44200, lr=0.000300828, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=43604 epoch 027: 357 / 1689 loss=3.958, nll_loss=2.315, ppl=4.98, wps=460116, ups=1.06, wpb=434549, bsz=16439.1, num_updates=44200, lr=0.000300828, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=43604 epoch 027: 357 / 1689 loss=3.958, nll_loss=2.315, ppl=4.98, wps=460116, ups=1.06, wpb=434549, bsz=16439.1, num_updates=44200, lr=0.000300828, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=43604 epoch 027: 357 / 1689 loss=3.958, nll_loss=2.315, ppl=4.98, wps=460116, ups=1.06, wpb=434549, bsz=16439.1, num_updates=44200, lr=0.000300828, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=43604 epoch 027: 357 / 1689 loss=3.958, nll_loss=2.315, ppl=4.98, wps=460116, ups=1.06, wpb=434549, bsz=16439.1, num_updates=44200, lr=0.000300828, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=43604 epoch 027: 357 / 1689 loss=3.958, nll_loss=2.315, ppl=4.98, wps=460116, ups=1.06, wpb=434549, bsz=16439.1, num_updates=44200, lr=0.000300828, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=43604 epoch 027: 357 / 1689 loss=3.958, nll_loss=2.315, ppl=4.98, wps=460116, ups=1.06, wpb=434549, bsz=16439.1, num_updates=44200, lr=0.000300828, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=43604 epoch 027: 357 / 1689 loss=3.958, nll_loss=2.315, ppl=4.98, wps=460116, ups=1.06, wpb=434549, bsz=16439.1, num_updates=44200, lr=0.000300828, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=43604 epoch 027: 357 / 1689 loss=3.958, nll_loss=2.315, ppl=4.98, wps=460116, ups=1.06, wpb=434549, bsz=16439.1, num_updates=44200, lr=0.000300828, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=43604 epoch 027: 357 / 1689 loss=3.958, nll_loss=2.315, ppl=4.98, wps=460116, ups=1.06, wpb=434549, bsz=16439.1, num_updates=44200, lr=0.000300828, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=43604 epoch 027: 357 / 1689 loss=3.958, nll_loss=2.315, ppl=4.98, wps=460116, ups=1.06, wpb=434549, bsz=16439.1, num_updates=44200, lr=0.000300828, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=43604 epoch 027: 357 / 1689 loss=3.958, nll_loss=2.315, ppl=4.98, wps=460116, ups=1.06, wpb=434549, bsz=16439.1, num_updates=44200, lr=0.000300828, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=43604 epoch 027: 357 / 1689 loss=3.958, nll_loss=2.315, ppl=4.98, wps=460116, ups=1.06, wpb=434549, bsz=16439.1, num_updates=44200, lr=0.000300828, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=43604 epoch 027: 357 / 1689 loss=3.958, nll_loss=2.315, ppl=4.98, wps=460116, ups=1.06, wpb=434549, bsz=16439.1, num_updates=44200, lr=0.000300828, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=43604 epoch 027: 357 / 1689 loss=3.958, nll_loss=2.315, ppl=4.98, wps=460116, ups=1.06, wpb=434549, bsz=16439.1, num_updates=44200, lr=0.000300828, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=43604 epoch 027: 357 / 1689 loss=3.958, nll_loss=2.315, ppl=4.98, wps=460116, ups=1.06, wpb=434549, bsz=16439.1, num_updates=44200, lr=0.000300828, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=43604 epoch 027: 357 / 1689 loss=3.958, nll_loss=2.315, ppl=4.98, wps=460116, ups=1.06, wpb=434549, bsz=16439.1, num_updates=44200, lr=0.000300828, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=43604 epoch 027: 357 / 1689 loss=3.958, nll_loss=2.315, ppl=4.98, wps=460116, ups=1.06, wpb=434549, bsz=16439.1, num_updates=44200, lr=0.000300828, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=43604 epoch 027: 357 / 1689 loss=3.958, nll_loss=2.315, ppl=4.98, wps=460116, ups=1.06, wpb=434549, bsz=16439.1, num_updates=44200, lr=0.000300828, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=43604 epoch 027: 357 / 1689 loss=3.958, nll_loss=2.315, ppl=4.98, wps=460116, ups=1.06, wpb=434549, bsz=16439.1, num_updates=44200, lr=0.000300828, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=43604 epoch 027: 357 / 1689 loss=3.958, nll_loss=2.315, ppl=4.98, wps=460116, ups=1.06, wpb=434549, bsz=16439.1, num_updates=44200, lr=0.000300828, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=43604 epoch 027: 357 / 1689 loss=3.958, nll_loss=2.315, ppl=4.98, wps=460116, ups=1.06, wpb=434549, bsz=16439.1, num_updates=44200, lr=0.000300828, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=43604 epoch 027: 357 / 1689 loss=3.958, nll_loss=2.315, ppl=4.98, wps=460116, ups=1.06, wpb=434549, bsz=16439.1, num_updates=44200, lr=0.000300828, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=43604 epoch 027: 357 / 1689 loss=3.958, nll_loss=2.315, ppl=4.98, wps=460116, ups=1.06, wpb=434549, bsz=16439.1, num_updates=44200, lr=0.000300828, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=43604 epoch 027: 357 / 1689 loss=3.958, nll_loss=2.315, ppl=4.98, wps=460116, ups=1.06, wpb=434549, bsz=16439.1, num_updates=44200, lr=0.000300828, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=43604 epoch 027: 457 / 1689 loss=3.967, nll_loss=2.325, ppl=5.01, wps=459451, ups=1.06, wpb=432554, bsz=16429.4, num_updates=44300, lr=0.000300489, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=43698 epoch 027: 457 / 1689 loss=3.967, nll_loss=2.325, ppl=5.01, wps=459451, ups=1.06, wpb=432554, bsz=16429.4, num_updates=44300, lr=0.000300489, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=43698 epoch 027: 457 / 1689 loss=3.967, nll_loss=2.325, ppl=5.01, wps=459451, ups=1.06, wpb=432554, bsz=16429.4, num_updates=44300, lr=0.000300489, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=43698 epoch 027: 457 / 1689 loss=3.967, nll_loss=2.325, ppl=5.01, wps=459451, ups=1.06, wpb=432554, bsz=16429.4, num_updates=44300, lr=0.000300489, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=43698 epoch 027: 457 / 1689 loss=3.967, nll_loss=2.325, ppl=5.01, wps=459451, ups=1.06, wpb=432554, bsz=16429.4, num_updates=44300, lr=0.000300489, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=43698 epoch 027: 457 / 1689 loss=3.967, nll_loss=2.325, ppl=5.01, wps=459451, ups=1.06, wpb=432554, bsz=16429.4, num_updates=44300, lr=0.000300489, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=43698 epoch 027: 457 / 1689 loss=3.967, nll_loss=2.325, ppl=5.01, wps=459451, ups=1.06, wpb=432554, bsz=16429.4, num_updates=44300, lr=0.000300489, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=43698 epoch 027: 457 / 1689 loss=3.967, nll_loss=2.325, ppl=5.01, wps=459451, ups=1.06, wpb=432554, bsz=16429.4, num_updates=44300, lr=0.000300489, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=43698 epoch 027: 457 / 1689 loss=3.967, nll_loss=2.325, ppl=5.01, wps=459451, ups=1.06, wpb=432554, bsz=16429.4, num_updates=44300, lr=0.000300489, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=43698 epoch 027: 457 / 1689 loss=3.967, nll_loss=2.325, ppl=5.01, wps=459451, ups=1.06, wpb=432554, bsz=16429.4, num_updates=44300, lr=0.000300489, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=43698 epoch 027: 457 / 1689 loss=3.967, nll_loss=2.325, ppl=5.01, wps=459451, ups=1.06, wpb=432554, bsz=16429.4, num_updates=44300, lr=0.000300489, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=43698 epoch 027: 457 / 1689 loss=3.967, nll_loss=2.325, ppl=5.01, wps=459451, ups=1.06, wpb=432554, bsz=16429.4, num_updates=44300, lr=0.000300489, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=43698 epoch 027: 457 / 1689 loss=3.967, nll_loss=2.325, ppl=5.01, wps=459451, ups=1.06, wpb=432554, bsz=16429.4, num_updates=44300, lr=0.000300489, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=43698 epoch 027: 457 / 1689 loss=3.967, nll_loss=2.325, ppl=5.01, wps=459451, ups=1.06, wpb=432554, bsz=16429.4, num_updates=44300, lr=0.000300489, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=43698 epoch 027: 457 / 1689 loss=3.967, nll_loss=2.325, ppl=5.01, wps=459451, ups=1.06, wpb=432554, bsz=16429.4, num_updates=44300, lr=0.000300489, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=43698 epoch 027: 457 / 1689 loss=3.967, nll_loss=2.325, ppl=5.01, wps=459451, ups=1.06, wpb=432554, bsz=16429.4, num_updates=44300, lr=0.000300489, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=43698 epoch 027: 457 / 1689 loss=3.967, nll_loss=2.325, ppl=5.01, wps=459451, ups=1.06, wpb=432554, bsz=16429.4, num_updates=44300, lr=0.000300489, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=43698 epoch 027: 457 / 1689 loss=3.967, nll_loss=2.325, ppl=5.01, wps=459451, ups=1.06, wpb=432554, bsz=16429.4, num_updates=44300, lr=0.000300489, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=43698 epoch 027: 457 / 1689 loss=3.967, nll_loss=2.325, ppl=5.01, wps=459451, ups=1.06, wpb=432554, bsz=16429.4, num_updates=44300, lr=0.000300489, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=43698 epoch 027: 457 / 1689 loss=3.967, nll_loss=2.325, ppl=5.01, wps=459451, ups=1.06, wpb=432554, bsz=16429.4, num_updates=44300, lr=0.000300489, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=43698 epoch 027: 457 / 1689 loss=3.967, nll_loss=2.325, ppl=5.01, wps=459451, ups=1.06, wpb=432554, bsz=16429.4, num_updates=44300, lr=0.000300489, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=43698 epoch 027: 457 / 1689 loss=3.967, nll_loss=2.325, ppl=5.01, wps=459451, ups=1.06, wpb=432554, bsz=16429.4, num_updates=44300, lr=0.000300489, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=43698 epoch 027: 457 / 1689 loss=3.967, nll_loss=2.325, ppl=5.01, wps=459451, ups=1.06, wpb=432554, bsz=16429.4, num_updates=44300, lr=0.000300489, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=43698 epoch 027: 457 / 1689 loss=3.967, nll_loss=2.325, ppl=5.01, wps=459451, ups=1.06, wpb=432554, bsz=16429.4, num_updates=44300, lr=0.000300489, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=43698 epoch 027: 457 / 1689 loss=3.967, nll_loss=2.325, ppl=5.01, wps=459451, ups=1.06, wpb=432554, bsz=16429.4, num_updates=44300, lr=0.000300489, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=43698 epoch 027: 457 / 1689 loss=3.967, nll_loss=2.325, ppl=5.01, wps=459451, ups=1.06, wpb=432554, bsz=16429.4, num_updates=44300, lr=0.000300489, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=43698 epoch 027: 457 / 1689 loss=3.967, nll_loss=2.325, ppl=5.01, wps=459451, ups=1.06, wpb=432554, bsz=16429.4, num_updates=44300, lr=0.000300489, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=43698 epoch 027: 557 / 1689 loss=3.967, nll_loss=2.325, ppl=5.01, wps=460426, ups=1.06, wpb=433055, bsz=16391, num_updates=44400, lr=0.00030015, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=43792 epoch 027: 557 / 1689 loss=3.967, nll_loss=2.325, ppl=5.01, wps=460426, ups=1.06, wpb=433055, bsz=16391, num_updates=44400, lr=0.00030015, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=43792 epoch 027: 557 / 1689 loss=3.967, nll_loss=2.325, ppl=5.01, wps=460426, ups=1.06, wpb=433055, bsz=16391, num_updates=44400, lr=0.00030015, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=43792 epoch 027: 557 / 1689 loss=3.967, nll_loss=2.325, ppl=5.01, wps=460426, ups=1.06, wpb=433055, bsz=16391, num_updates=44400, lr=0.00030015, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=43792 epoch 027: 557 / 1689 loss=3.967, nll_loss=2.325, ppl=5.01, wps=460426, ups=1.06, wpb=433055, bsz=16391, num_updates=44400, lr=0.00030015, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=43792 epoch 027: 557 / 1689 loss=3.967, nll_loss=2.325, ppl=5.01, wps=460426, ups=1.06, wpb=433055, bsz=16391, num_updates=44400, lr=0.00030015, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=43792 epoch 027: 557 / 1689 loss=3.967, nll_loss=2.325, ppl=5.01, wps=460426, ups=1.06, wpb=433055, bsz=16391, num_updates=44400, lr=0.00030015, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=43792 epoch 027: 557 / 1689 loss=3.967, nll_loss=2.325, ppl=5.01, wps=460426, ups=1.06, wpb=433055, bsz=16391, num_updates=44400, lr=0.00030015, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=43792 epoch 027: 557 / 1689 loss=3.967, nll_loss=2.325, ppl=5.01, wps=460426, ups=1.06, wpb=433055, bsz=16391, num_updates=44400, lr=0.00030015, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=43792 epoch 027: 557 / 1689 loss=3.967, nll_loss=2.325, ppl=5.01, wps=460426, ups=1.06, wpb=433055, bsz=16391, num_updates=44400, lr=0.00030015, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=43792 epoch 027: 557 / 1689 loss=3.967, nll_loss=2.325, ppl=5.01, wps=460426, ups=1.06, wpb=433055, bsz=16391, num_updates=44400, lr=0.00030015, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=43792 epoch 027: 557 / 1689 loss=3.967, nll_loss=2.325, ppl=5.01, wps=460426, ups=1.06, wpb=433055, bsz=16391, num_updates=44400, lr=0.00030015, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=43792 epoch 027: 557 / 1689 loss=3.967, nll_loss=2.325, ppl=5.01, wps=460426, ups=1.06, wpb=433055, bsz=16391, num_updates=44400, lr=0.00030015, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=43792 epoch 027: 557 / 1689 loss=3.967, nll_loss=2.325, ppl=5.01, wps=460426, ups=1.06, wpb=433055, bsz=16391, num_updates=44400, lr=0.00030015, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=43792 epoch 027: 557 / 1689 loss=3.967, nll_loss=2.325, ppl=5.01, wps=460426, ups=1.06, wpb=433055, bsz=16391, num_updates=44400, lr=0.00030015, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=43792 epoch 027: 557 / 1689 loss=3.967, nll_loss=2.325, ppl=5.01, wps=460426, ups=1.06, wpb=433055, bsz=16391, num_updates=44400, lr=0.00030015, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=43792 epoch 027: 557 / 1689 loss=3.967, nll_loss=2.325, ppl=5.01, wps=460426, ups=1.06, wpb=433055, bsz=16391, num_updates=44400, lr=0.00030015, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=43792 epoch 027: 557 / 1689 loss=3.967, nll_loss=2.325, ppl=5.01, wps=460426, ups=1.06, wpb=433055, bsz=16391, num_updates=44400, lr=0.00030015, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=43792 epoch 027: 557 / 1689 loss=3.967, nll_loss=2.325, ppl=5.01, wps=460426, ups=1.06, wpb=433055, bsz=16391, num_updates=44400, lr=0.00030015, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=43792 epoch 027: 557 / 1689 loss=3.967, nll_loss=2.325, ppl=5.01, wps=460426, ups=1.06, wpb=433055, bsz=16391, num_updates=44400, lr=0.00030015, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=43792 epoch 027: 557 / 1689 loss=3.967, nll_loss=2.325, ppl=5.01, wps=460426, ups=1.06, wpb=433055, bsz=16391, num_updates=44400, lr=0.00030015, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=43792 epoch 027: 557 / 1689 loss=3.967, nll_loss=2.325, ppl=5.01, wps=460426, ups=1.06, wpb=433055, bsz=16391, num_updates=44400, lr=0.00030015, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=43792 epoch 027: 557 / 1689 loss=3.967, nll_loss=2.325, ppl=5.01, wps=460426, ups=1.06, wpb=433055, bsz=16391, num_updates=44400, lr=0.00030015, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=43792 epoch 027: 557 / 1689 loss=3.967, nll_loss=2.325, ppl=5.01, wps=460426, ups=1.06, wpb=433055, bsz=16391, num_updates=44400, lr=0.00030015, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=43792 epoch 027: 557 / 1689 loss=3.967, nll_loss=2.325, ppl=5.01, wps=460426, ups=1.06, wpb=433055, bsz=16391, num_updates=44400, lr=0.00030015, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=43792 epoch 027: 557 / 1689 loss=3.967, nll_loss=2.325, ppl=5.01, wps=460426, ups=1.06, wpb=433055, bsz=16391, num_updates=44400, lr=0.00030015, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=43792 epoch 027: 557 / 1689 loss=3.967, nll_loss=2.325, ppl=5.01, wps=460426, ups=1.06, wpb=433055, bsz=16391, num_updates=44400, lr=0.00030015, gnorm=0.207, clip=0, loss_scale=1, train_wall=92, gb_free=19.5, wall=43792 epoch 027: 657 / 1689 loss=3.986, nll_loss=2.347, ppl=5.09, wps=456078, ups=1.05, wpb=435174, bsz=16442.2, num_updates=44500, lr=0.000299813, gnorm=0.222, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=43888 epoch 027: 657 / 1689 loss=3.986, nll_loss=2.347, ppl=5.09, wps=456078, ups=1.05, wpb=435174, bsz=16442.2, num_updates=44500, lr=0.000299813, gnorm=0.222, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=43888 epoch 027: 657 / 1689 loss=3.986, nll_loss=2.347, ppl=5.09, wps=456078, ups=1.05, wpb=435174, bsz=16442.2, num_updates=44500, lr=0.000299813, gnorm=0.222, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=43888 epoch 027: 657 / 1689 loss=3.986, nll_loss=2.347, ppl=5.09, wps=456078, ups=1.05, wpb=435174, bsz=16442.2, num_updates=44500, lr=0.000299813, gnorm=0.222, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=43888 epoch 027: 657 / 1689 loss=3.986, nll_loss=2.347, ppl=5.09, wps=456078, ups=1.05, wpb=435174, bsz=16442.2, num_updates=44500, lr=0.000299813, gnorm=0.222, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=43888 epoch 027: 657 / 1689 loss=3.986, nll_loss=2.347, ppl=5.09, wps=456078, ups=1.05, wpb=435174, bsz=16442.2, num_updates=44500, lr=0.000299813, gnorm=0.222, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=43888 epoch 027: 657 / 1689 loss=3.986, nll_loss=2.347, ppl=5.09, wps=456078, ups=1.05, wpb=435174, bsz=16442.2, num_updates=44500, lr=0.000299813, gnorm=0.222, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=43888 epoch 027: 657 / 1689 loss=3.986, nll_loss=2.347, ppl=5.09, wps=456078, ups=1.05, wpb=435174, bsz=16442.2, num_updates=44500, lr=0.000299813, gnorm=0.222, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=43888 epoch 027: 657 / 1689 loss=3.986, nll_loss=2.347, ppl=5.09, wps=456078, ups=1.05, wpb=435174, bsz=16442.2, num_updates=44500, lr=0.000299813, gnorm=0.222, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=43888 epoch 027: 657 / 1689 loss=3.986, nll_loss=2.347, ppl=5.09, wps=456078, ups=1.05, wpb=435174, bsz=16442.2, num_updates=44500, lr=0.000299813, gnorm=0.222, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=43888 epoch 027: 657 / 1689 loss=3.986, nll_loss=2.347, ppl=5.09, wps=456078, ups=1.05, wpb=435174, bsz=16442.2, num_updates=44500, lr=0.000299813, gnorm=0.222, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=43888 epoch 027: 657 / 1689 loss=3.986, nll_loss=2.347, ppl=5.09, wps=456078, ups=1.05, wpb=435174, bsz=16442.2, num_updates=44500, lr=0.000299813, gnorm=0.222, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=43888 epoch 027: 657 / 1689 loss=3.986, nll_loss=2.347, ppl=5.09, wps=456078, ups=1.05, wpb=435174, bsz=16442.2, num_updates=44500, lr=0.000299813, gnorm=0.222, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=43888 epoch 027: 657 / 1689 loss=3.986, nll_loss=2.347, ppl=5.09, wps=456078, ups=1.05, wpb=435174, bsz=16442.2, num_updates=44500, lr=0.000299813, gnorm=0.222, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=43888 epoch 027: 657 / 1689 loss=3.986, nll_loss=2.347, ppl=5.09, wps=456078, ups=1.05, wpb=435174, bsz=16442.2, num_updates=44500, lr=0.000299813, gnorm=0.222, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=43888 epoch 027: 657 / 1689 loss=3.986, nll_loss=2.347, ppl=5.09, wps=456078, ups=1.05, wpb=435174, bsz=16442.2, num_updates=44500, lr=0.000299813, gnorm=0.222, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=43888 epoch 027: 657 / 1689 loss=3.986, nll_loss=2.347, ppl=5.09, wps=456078, ups=1.05, wpb=435174, bsz=16442.2, num_updates=44500, lr=0.000299813, gnorm=0.222, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=43888 epoch 027: 657 / 1689 loss=3.986, nll_loss=2.347, ppl=5.09, wps=456078, ups=1.05, wpb=435174, bsz=16442.2, num_updates=44500, lr=0.000299813, gnorm=0.222, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=43888 epoch 027: 657 / 1689 loss=3.986, nll_loss=2.347, ppl=5.09, wps=456078, ups=1.05, wpb=435174, bsz=16442.2, num_updates=44500, lr=0.000299813, gnorm=0.222, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=43888 epoch 027: 657 / 1689 loss=3.986, nll_loss=2.347, ppl=5.09, wps=456078, ups=1.05, wpb=435174, bsz=16442.2, num_updates=44500, lr=0.000299813, gnorm=0.222, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=43888 epoch 027: 657 / 1689 loss=3.986, nll_loss=2.347, ppl=5.09, wps=456078, ups=1.05, wpb=435174, bsz=16442.2, num_updates=44500, lr=0.000299813, gnorm=0.222, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=43888 epoch 027: 657 / 1689 loss=3.986, nll_loss=2.347, ppl=5.09, wps=456078, ups=1.05, wpb=435174, bsz=16442.2, num_updates=44500, lr=0.000299813, gnorm=0.222, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=43888 epoch 027: 657 / 1689 loss=3.986, nll_loss=2.347, ppl=5.09, wps=456078, ups=1.05, wpb=435174, bsz=16442.2, num_updates=44500, lr=0.000299813, gnorm=0.222, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=43888 epoch 027: 657 / 1689 loss=3.986, nll_loss=2.347, ppl=5.09, wps=456078, ups=1.05, wpb=435174, bsz=16442.2, num_updates=44500, lr=0.000299813, gnorm=0.222, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=43888 epoch 027: 657 / 1689 loss=3.986, nll_loss=2.347, ppl=5.09, wps=456078, ups=1.05, wpb=435174, bsz=16442.2, num_updates=44500, lr=0.000299813, gnorm=0.222, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=43888 epoch 027: 657 / 1689 loss=3.986, nll_loss=2.347, ppl=5.09, wps=456078, ups=1.05, wpb=435174, bsz=16442.2, num_updates=44500, lr=0.000299813, gnorm=0.222, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=43888 epoch 027: 657 / 1689 loss=3.986, nll_loss=2.347, ppl=5.09, wps=456078, ups=1.05, wpb=435174, bsz=16442.2, num_updates=44500, lr=0.000299813, gnorm=0.222, clip=0, loss_scale=1, train_wall=94, gb_free=19.1, wall=43888 epoch 027: 758 / 1689 loss=3.984, nll_loss=2.345, ppl=5.08, wps=455878, ups=1.05, wpb=434826, bsz=16601, num_updates=44600, lr=0.000299476, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.6, wall=43983 epoch 027: 758 / 1689 loss=3.984, nll_loss=2.345, ppl=5.08, wps=455878, ups=1.05, wpb=434826, bsz=16601, num_updates=44600, lr=0.000299476, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.6, wall=43983 epoch 027: 758 / 1689 loss=3.984, nll_loss=2.345, ppl=5.08, wps=455878, ups=1.05, wpb=434826, bsz=16601, num_updates=44600, lr=0.000299476, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.6, wall=43983 epoch 027: 758 / 1689 loss=3.984, nll_loss=2.345, ppl=5.08, wps=455878, ups=1.05, wpb=434826, bsz=16601, num_updates=44600, lr=0.000299476, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.6, wall=43983 epoch 027: 758 / 1689 loss=3.984, nll_loss=2.345, ppl=5.08, wps=455878, ups=1.05, wpb=434826, bsz=16601, num_updates=44600, lr=0.000299476, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.6, wall=43983 epoch 027: 758 / 1689 loss=3.984, nll_loss=2.345, ppl=5.08, wps=455878, ups=1.05, wpb=434826, bsz=16601, num_updates=44600, lr=0.000299476, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.6, wall=43983 epoch 027: 758 / 1689 loss=3.984, nll_loss=2.345, ppl=5.08, wps=455878, ups=1.05, wpb=434826, bsz=16601, num_updates=44600, lr=0.000299476, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.6, wall=43983 epoch 027: 758 / 1689 loss=3.984, nll_loss=2.345, ppl=5.08, wps=455878, ups=1.05, wpb=434826, bsz=16601, num_updates=44600, lr=0.000299476, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.6, wall=43983 epoch 027: 758 / 1689 loss=3.984, nll_loss=2.345, ppl=5.08, wps=455878, ups=1.05, wpb=434826, bsz=16601, num_updates=44600, lr=0.000299476, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.6, wall=43983 epoch 027: 758 / 1689 loss=3.984, nll_loss=2.345, ppl=5.08, wps=455878, ups=1.05, wpb=434826, bsz=16601, num_updates=44600, lr=0.000299476, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.6, wall=43983 epoch 027: 758 / 1689 loss=3.984, nll_loss=2.345, ppl=5.08, wps=455878, ups=1.05, wpb=434826, bsz=16601, num_updates=44600, lr=0.000299476, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.6, wall=43983 epoch 027: 758 / 1689 loss=3.984, nll_loss=2.345, ppl=5.08, wps=455878, ups=1.05, wpb=434826, bsz=16601, num_updates=44600, lr=0.000299476, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.6, wall=43983 epoch 027: 758 / 1689 loss=3.984, nll_loss=2.345, ppl=5.08, wps=455878, ups=1.05, wpb=434826, bsz=16601, num_updates=44600, lr=0.000299476, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.6, wall=43983 epoch 027: 758 / 1689 loss=3.984, nll_loss=2.345, ppl=5.08, wps=455878, ups=1.05, wpb=434826, bsz=16601, num_updates=44600, lr=0.000299476, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.6, wall=43983 epoch 027: 758 / 1689 loss=3.984, nll_loss=2.345, ppl=5.08, wps=455878, ups=1.05, wpb=434826, bsz=16601, num_updates=44600, lr=0.000299476, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.6, wall=43983 epoch 027: 758 / 1689 loss=3.984, nll_loss=2.345, ppl=5.08, wps=455878, ups=1.05, wpb=434826, bsz=16601, num_updates=44600, lr=0.000299476, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.6, wall=43983 epoch 027: 758 / 1689 loss=3.984, nll_loss=2.345, ppl=5.08, wps=455878, ups=1.05, wpb=434826, bsz=16601, num_updates=44600, lr=0.000299476, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.6, wall=43983 epoch 027: 758 / 1689 loss=3.984, nll_loss=2.345, ppl=5.08, wps=455878, ups=1.05, wpb=434826, bsz=16601, num_updates=44600, lr=0.000299476, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.6, wall=43983 epoch 027: 758 / 1689 loss=3.984, nll_loss=2.345, ppl=5.08, wps=455878, ups=1.05, wpb=434826, bsz=16601, num_updates=44600, lr=0.000299476, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.6, wall=43983 epoch 027: 758 / 1689 loss=3.984, nll_loss=2.345, ppl=5.08, wps=455878, ups=1.05, wpb=434826, bsz=16601, num_updates=44600, lr=0.000299476, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.6, wall=43983 epoch 027: 758 / 1689 loss=3.984, nll_loss=2.345, ppl=5.08, wps=455878, ups=1.05, wpb=434826, bsz=16601, num_updates=44600, lr=0.000299476, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.6, wall=43983 epoch 027: 758 / 1689 loss=3.984, nll_loss=2.345, ppl=5.08, wps=455878, ups=1.05, wpb=434826, bsz=16601, num_updates=44600, lr=0.000299476, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.6, wall=43983 epoch 027: 758 / 1689 loss=3.984, nll_loss=2.345, ppl=5.08, wps=455878, ups=1.05, wpb=434826, bsz=16601, num_updates=44600, lr=0.000299476, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.6, wall=43983 epoch 027: 758 / 1689 loss=3.984, nll_loss=2.345, ppl=5.08, wps=455878, ups=1.05, wpb=434826, bsz=16601, num_updates=44600, lr=0.000299476, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.6, wall=43983 epoch 027: 758 / 1689 loss=3.984, nll_loss=2.345, ppl=5.08, wps=455878, ups=1.05, wpb=434826, bsz=16601, num_updates=44600, lr=0.000299476, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.6, wall=43983 epoch 027: 758 / 1689 loss=3.984, nll_loss=2.345, ppl=5.08, wps=455878, ups=1.05, wpb=434826, bsz=16601, num_updates=44600, lr=0.000299476, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.6, wall=43983 epoch 027: 758 / 1689 loss=3.984, nll_loss=2.345, ppl=5.08, wps=455878, ups=1.05, wpb=434826, bsz=16601, num_updates=44600, lr=0.000299476, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.6, wall=43983 epoch 027: 858 / 1689 loss=3.989, nll_loss=2.35, ppl=5.1, wps=459225, ups=1.05, wpb=435389, bsz=16710.4, num_updates=44700, lr=0.000299141, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=44078 epoch 027: 858 / 1689 loss=3.989, nll_loss=2.35, ppl=5.1, wps=459225, ups=1.05, wpb=435389, bsz=16710.4, num_updates=44700, lr=0.000299141, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=44078 epoch 027: 858 / 1689 loss=3.989, nll_loss=2.35, ppl=5.1, wps=459225, ups=1.05, wpb=435389, bsz=16710.4, num_updates=44700, lr=0.000299141, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=44078 epoch 027: 858 / 1689 loss=3.989, nll_loss=2.35, ppl=5.1, wps=459225, ups=1.05, wpb=435389, bsz=16710.4, num_updates=44700, lr=0.000299141, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=44078 epoch 027: 858 / 1689 loss=3.989, nll_loss=2.35, ppl=5.1, wps=459225, ups=1.05, wpb=435389, bsz=16710.4, num_updates=44700, lr=0.000299141, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=44078 epoch 027: 858 / 1689 loss=3.989, nll_loss=2.35, ppl=5.1, wps=459225, ups=1.05, wpb=435389, bsz=16710.4, num_updates=44700, lr=0.000299141, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=44078 epoch 027: 858 / 1689 loss=3.989, nll_loss=2.35, ppl=5.1, wps=459225, ups=1.05, wpb=435389, bsz=16710.4, num_updates=44700, lr=0.000299141, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=44078 epoch 027: 858 / 1689 loss=3.989, nll_loss=2.35, ppl=5.1, wps=459225, ups=1.05, wpb=435389, bsz=16710.4, num_updates=44700, lr=0.000299141, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=44078 epoch 027: 858 / 1689 loss=3.989, nll_loss=2.35, ppl=5.1, wps=459225, ups=1.05, wpb=435389, bsz=16710.4, num_updates=44700, lr=0.000299141, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=44078 epoch 027: 858 / 1689 loss=3.989, nll_loss=2.35, ppl=5.1, wps=459225, ups=1.05, wpb=435389, bsz=16710.4, num_updates=44700, lr=0.000299141, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=44078 epoch 027: 858 / 1689 loss=3.989, nll_loss=2.35, ppl=5.1, wps=459225, ups=1.05, wpb=435389, bsz=16710.4, num_updates=44700, lr=0.000299141, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=44078 epoch 027: 858 / 1689 loss=3.989, nll_loss=2.35, ppl=5.1, wps=459225, ups=1.05, wpb=435389, bsz=16710.4, num_updates=44700, lr=0.000299141, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=44078 epoch 027: 858 / 1689 loss=3.989, nll_loss=2.35, ppl=5.1, wps=459225, ups=1.05, wpb=435389, bsz=16710.4, num_updates=44700, lr=0.000299141, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=44078 epoch 027: 858 / 1689 loss=3.989, nll_loss=2.35, ppl=5.1, wps=459225, ups=1.05, wpb=435389, bsz=16710.4, num_updates=44700, lr=0.000299141, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=44078 epoch 027: 858 / 1689 loss=3.989, nll_loss=2.35, ppl=5.1, wps=459225, ups=1.05, wpb=435389, bsz=16710.4, num_updates=44700, lr=0.000299141, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=44078 epoch 027: 858 / 1689 loss=3.989, nll_loss=2.35, ppl=5.1, wps=459225, ups=1.05, wpb=435389, bsz=16710.4, num_updates=44700, lr=0.000299141, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=44078 epoch 027: 858 / 1689 loss=3.989, nll_loss=2.35, ppl=5.1, wps=459225, ups=1.05, wpb=435389, bsz=16710.4, num_updates=44700, lr=0.000299141, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=44078 epoch 027: 858 / 1689 loss=3.989, nll_loss=2.35, ppl=5.1, wps=459225, ups=1.05, wpb=435389, bsz=16710.4, num_updates=44700, lr=0.000299141, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=44078 epoch 027: 858 / 1689 loss=3.989, nll_loss=2.35, ppl=5.1, wps=459225, ups=1.05, wpb=435389, bsz=16710.4, num_updates=44700, lr=0.000299141, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=44078 epoch 027: 858 / 1689 loss=3.989, nll_loss=2.35, ppl=5.1, wps=459225, ups=1.05, wpb=435389, bsz=16710.4, num_updates=44700, lr=0.000299141, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=44078 epoch 027: 858 / 1689 loss=3.989, nll_loss=2.35, ppl=5.1, wps=459225, ups=1.05, wpb=435389, bsz=16710.4, num_updates=44700, lr=0.000299141, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=44078 epoch 027: 858 / 1689 loss=3.989, nll_loss=2.35, ppl=5.1, wps=459225, ups=1.05, wpb=435389, bsz=16710.4, num_updates=44700, lr=0.000299141, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=44078 epoch 027: 858 / 1689 loss=3.989, nll_loss=2.35, ppl=5.1, wps=459225, ups=1.05, wpb=435389, bsz=16710.4, num_updates=44700, lr=0.000299141, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=44078 epoch 027: 858 / 1689 loss=3.989, nll_loss=2.35, ppl=5.1, wps=459225, ups=1.05, wpb=435389, bsz=16710.4, num_updates=44700, lr=0.000299141, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=44078 epoch 027: 858 / 1689 loss=3.989, nll_loss=2.35, ppl=5.1, wps=459225, ups=1.05, wpb=435389, bsz=16710.4, num_updates=44700, lr=0.000299141, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=44078 epoch 027: 858 / 1689 loss=3.989, nll_loss=2.35, ppl=5.1, wps=459225, ups=1.05, wpb=435389, bsz=16710.4, num_updates=44700, lr=0.000299141, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=44078 epoch 027: 858 / 1689 loss=3.989, nll_loss=2.35, ppl=5.1, wps=459225, ups=1.05, wpb=435389, bsz=16710.4, num_updates=44700, lr=0.000299141, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=44078 epoch 027: 958 / 1689 loss=3.969, nll_loss=2.328, ppl=5.02, wps=455144, ups=1.05, wpb=431491, bsz=16282.8, num_updates=44800, lr=0.000298807, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44173 epoch 027: 958 / 1689 loss=3.969, nll_loss=2.328, ppl=5.02, wps=455144, ups=1.05, wpb=431491, bsz=16282.8, num_updates=44800, lr=0.000298807, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44173 epoch 027: 958 / 1689 loss=3.969, nll_loss=2.328, ppl=5.02, wps=455144, ups=1.05, wpb=431491, bsz=16282.8, num_updates=44800, lr=0.000298807, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44173 epoch 027: 958 / 1689 loss=3.969, nll_loss=2.328, ppl=5.02, wps=455144, ups=1.05, wpb=431491, bsz=16282.8, num_updates=44800, lr=0.000298807, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44173 epoch 027: 958 / 1689 loss=3.969, nll_loss=2.328, ppl=5.02, wps=455144, ups=1.05, wpb=431491, bsz=16282.8, num_updates=44800, lr=0.000298807, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44173 epoch 027: 958 / 1689 loss=3.969, nll_loss=2.328, ppl=5.02, wps=455144, ups=1.05, wpb=431491, bsz=16282.8, num_updates=44800, lr=0.000298807, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44173 epoch 027: 958 / 1689 loss=3.969, nll_loss=2.328, ppl=5.02, wps=455144, ups=1.05, wpb=431491, bsz=16282.8, num_updates=44800, lr=0.000298807, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44173 epoch 027: 958 / 1689 loss=3.969, nll_loss=2.328, ppl=5.02, wps=455144, ups=1.05, wpb=431491, bsz=16282.8, num_updates=44800, lr=0.000298807, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44173 epoch 027: 958 / 1689 loss=3.969, nll_loss=2.328, ppl=5.02, wps=455144, ups=1.05, wpb=431491, bsz=16282.8, num_updates=44800, lr=0.000298807, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44173 epoch 027: 958 / 1689 loss=3.969, nll_loss=2.328, ppl=5.02, wps=455144, ups=1.05, wpb=431491, bsz=16282.8, num_updates=44800, lr=0.000298807, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44173 epoch 027: 958 / 1689 loss=3.969, nll_loss=2.328, ppl=5.02, wps=455144, ups=1.05, wpb=431491, bsz=16282.8, num_updates=44800, lr=0.000298807, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44173 epoch 027: 958 / 1689 loss=3.969, nll_loss=2.328, ppl=5.02, wps=455144, ups=1.05, wpb=431491, bsz=16282.8, num_updates=44800, lr=0.000298807, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44173 epoch 027: 958 / 1689 loss=3.969, nll_loss=2.328, ppl=5.02, wps=455144, ups=1.05, wpb=431491, bsz=16282.8, num_updates=44800, lr=0.000298807, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44173 epoch 027: 958 / 1689 loss=3.969, nll_loss=2.328, ppl=5.02, wps=455144, ups=1.05, wpb=431491, bsz=16282.8, num_updates=44800, lr=0.000298807, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44173 epoch 027: 958 / 1689 loss=3.969, nll_loss=2.328, ppl=5.02, wps=455144, ups=1.05, wpb=431491, bsz=16282.8, num_updates=44800, lr=0.000298807, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44173 epoch 027: 958 / 1689 loss=3.969, nll_loss=2.328, ppl=5.02, wps=455144, ups=1.05, wpb=431491, bsz=16282.8, num_updates=44800, lr=0.000298807, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44173 epoch 027: 958 / 1689 loss=3.969, nll_loss=2.328, ppl=5.02, wps=455144, ups=1.05, wpb=431491, bsz=16282.8, num_updates=44800, lr=0.000298807, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44173 epoch 027: 958 / 1689 loss=3.969, nll_loss=2.328, ppl=5.02, wps=455144, ups=1.05, wpb=431491, bsz=16282.8, num_updates=44800, lr=0.000298807, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44173 epoch 027: 958 / 1689 loss=3.969, nll_loss=2.328, ppl=5.02, wps=455144, ups=1.05, wpb=431491, bsz=16282.8, num_updates=44800, lr=0.000298807, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44173 epoch 027: 958 / 1689 loss=3.969, nll_loss=2.328, ppl=5.02, wps=455144, ups=1.05, wpb=431491, bsz=16282.8, num_updates=44800, lr=0.000298807, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44173 epoch 027: 958 / 1689 loss=3.969, nll_loss=2.328, ppl=5.02, wps=455144, ups=1.05, wpb=431491, bsz=16282.8, num_updates=44800, lr=0.000298807, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44173 epoch 027: 958 / 1689 loss=3.969, nll_loss=2.328, ppl=5.02, wps=455144, ups=1.05, wpb=431491, bsz=16282.8, num_updates=44800, lr=0.000298807, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44173 epoch 027: 958 / 1689 loss=3.969, nll_loss=2.328, ppl=5.02, wps=455144, ups=1.05, wpb=431491, bsz=16282.8, num_updates=44800, lr=0.000298807, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44173 epoch 027: 958 / 1689 loss=3.969, nll_loss=2.328, ppl=5.02, wps=455144, ups=1.05, wpb=431491, bsz=16282.8, num_updates=44800, lr=0.000298807, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44173 epoch 027: 958 / 1689 loss=3.969, nll_loss=2.328, ppl=5.02, wps=455144, ups=1.05, wpb=431491, bsz=16282.8, num_updates=44800, lr=0.000298807, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44173 epoch 027: 958 / 1689 loss=3.969, nll_loss=2.328, ppl=5.02, wps=455144, ups=1.05, wpb=431491, bsz=16282.8, num_updates=44800, lr=0.000298807, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44173 epoch 027: 958 / 1689 loss=3.969, nll_loss=2.328, ppl=5.02, wps=455144, ups=1.05, wpb=431491, bsz=16282.8, num_updates=44800, lr=0.000298807, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44173 epoch 027: 1058 / 1689 loss=3.982, nll_loss=2.343, ppl=5.07, wps=457956, ups=1.06, wpb=433912, bsz=16891.6, num_updates=44900, lr=0.000298474, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=44267 epoch 027: 1058 / 1689 loss=3.982, nll_loss=2.343, ppl=5.07, wps=457956, ups=1.06, wpb=433912, bsz=16891.6, num_updates=44900, lr=0.000298474, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=44267 epoch 027: 1058 / 1689 loss=3.982, nll_loss=2.343, ppl=5.07, wps=457956, ups=1.06, wpb=433912, bsz=16891.6, num_updates=44900, lr=0.000298474, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=44267 epoch 027: 1058 / 1689 loss=3.982, nll_loss=2.343, ppl=5.07, wps=457956, ups=1.06, wpb=433912, bsz=16891.6, num_updates=44900, lr=0.000298474, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=44267 epoch 027: 1058 / 1689 loss=3.982, nll_loss=2.343, ppl=5.07, wps=457956, ups=1.06, wpb=433912, bsz=16891.6, num_updates=44900, lr=0.000298474, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=44267 epoch 027: 1058 / 1689 loss=3.982, nll_loss=2.343, ppl=5.07, wps=457956, ups=1.06, wpb=433912, bsz=16891.6, num_updates=44900, lr=0.000298474, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=44267 epoch 027: 1058 / 1689 loss=3.982, nll_loss=2.343, ppl=5.07, wps=457956, ups=1.06, wpb=433912, bsz=16891.6, num_updates=44900, lr=0.000298474, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=44267 epoch 027: 1058 / 1689 loss=3.982, nll_loss=2.343, ppl=5.07, wps=457956, ups=1.06, wpb=433912, bsz=16891.6, num_updates=44900, lr=0.000298474, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=44267 epoch 027: 1058 / 1689 loss=3.982, nll_loss=2.343, ppl=5.07, wps=457956, ups=1.06, wpb=433912, bsz=16891.6, num_updates=44900, lr=0.000298474, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=44267 epoch 027: 1058 / 1689 loss=3.982, nll_loss=2.343, ppl=5.07, wps=457956, ups=1.06, wpb=433912, bsz=16891.6, num_updates=44900, lr=0.000298474, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=44267 epoch 027: 1058 / 1689 loss=3.982, nll_loss=2.343, ppl=5.07, wps=457956, ups=1.06, wpb=433912, bsz=16891.6, num_updates=44900, lr=0.000298474, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=44267 epoch 027: 1058 / 1689 loss=3.982, nll_loss=2.343, ppl=5.07, wps=457956, ups=1.06, wpb=433912, bsz=16891.6, num_updates=44900, lr=0.000298474, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=44267 epoch 027: 1058 / 1689 loss=3.982, nll_loss=2.343, ppl=5.07, wps=457956, ups=1.06, wpb=433912, bsz=16891.6, num_updates=44900, lr=0.000298474, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=44267 epoch 027: 1058 / 1689 loss=3.982, nll_loss=2.343, ppl=5.07, wps=457956, ups=1.06, wpb=433912, bsz=16891.6, num_updates=44900, lr=0.000298474, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=44267 epoch 027: 1058 / 1689 loss=3.982, nll_loss=2.343, ppl=5.07, wps=457956, ups=1.06, wpb=433912, bsz=16891.6, num_updates=44900, lr=0.000298474, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=44267 epoch 027: 1058 / 1689 loss=3.982, nll_loss=2.343, ppl=5.07, wps=457956, ups=1.06, wpb=433912, bsz=16891.6, num_updates=44900, lr=0.000298474, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=44267 epoch 027: 1058 / 1689 loss=3.982, nll_loss=2.343, ppl=5.07, wps=457956, ups=1.06, wpb=433912, bsz=16891.6, num_updates=44900, lr=0.000298474, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=44267 epoch 027: 1058 / 1689 loss=3.982, nll_loss=2.343, ppl=5.07, wps=457956, ups=1.06, wpb=433912, bsz=16891.6, num_updates=44900, lr=0.000298474, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=44267 epoch 027: 1058 / 1689 loss=3.982, nll_loss=2.343, ppl=5.07, wps=457956, ups=1.06, wpb=433912, bsz=16891.6, num_updates=44900, lr=0.000298474, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=44267 epoch 027: 1058 / 1689 loss=3.982, nll_loss=2.343, ppl=5.07, wps=457956, ups=1.06, wpb=433912, bsz=16891.6, num_updates=44900, lr=0.000298474, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=44267 epoch 027: 1058 / 1689 loss=3.982, nll_loss=2.343, ppl=5.07, wps=457956, ups=1.06, wpb=433912, bsz=16891.6, num_updates=44900, lr=0.000298474, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=44267 epoch 027: 1058 / 1689 loss=3.982, nll_loss=2.343, ppl=5.07, wps=457956, ups=1.06, wpb=433912, bsz=16891.6, num_updates=44900, lr=0.000298474, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=44267 epoch 027: 1058 / 1689 loss=3.982, nll_loss=2.343, ppl=5.07, wps=457956, ups=1.06, wpb=433912, bsz=16891.6, num_updates=44900, lr=0.000298474, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=44267 epoch 027: 1058 / 1689 loss=3.982, nll_loss=2.343, ppl=5.07, wps=457956, ups=1.06, wpb=433912, bsz=16891.6, num_updates=44900, lr=0.000298474, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=44267 epoch 027: 1058 / 1689 loss=3.982, nll_loss=2.343, ppl=5.07, wps=457956, ups=1.06, wpb=433912, bsz=16891.6, num_updates=44900, lr=0.000298474, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=44267 epoch 027: 1058 / 1689 loss=3.982, nll_loss=2.343, ppl=5.07, wps=457956, ups=1.06, wpb=433912, bsz=16891.6, num_updates=44900, lr=0.000298474, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=44267 epoch 027: 1058 / 1689 loss=3.982, nll_loss=2.343, ppl=5.07, wps=457956, ups=1.06, wpb=433912, bsz=16891.6, num_updates=44900, lr=0.000298474, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=44267 epoch 027: 1158 / 1689 loss=3.995, nll_loss=2.357, ppl=5.12, wps=456435, ups=1.05, wpb=433889, bsz=16953.6, num_updates=45000, lr=0.000298142, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=44363 epoch 027: 1158 / 1689 loss=3.995, nll_loss=2.357, ppl=5.12, wps=456435, ups=1.05, wpb=433889, bsz=16953.6, num_updates=45000, lr=0.000298142, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=44363 epoch 027: 1158 / 1689 loss=3.995, nll_loss=2.357, ppl=5.12, wps=456435, ups=1.05, wpb=433889, bsz=16953.6, num_updates=45000, lr=0.000298142, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=44363 epoch 027: 1158 / 1689 loss=3.995, nll_loss=2.357, ppl=5.12, wps=456435, ups=1.05, wpb=433889, bsz=16953.6, num_updates=45000, lr=0.000298142, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=44363 epoch 027: 1158 / 1689 loss=3.995, nll_loss=2.357, ppl=5.12, wps=456435, ups=1.05, wpb=433889, bsz=16953.6, num_updates=45000, lr=0.000298142, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=44363 epoch 027: 1158 / 1689 loss=3.995, nll_loss=2.357, ppl=5.12, wps=456435, ups=1.05, wpb=433889, bsz=16953.6, num_updates=45000, lr=0.000298142, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=44363 epoch 027: 1158 / 1689 loss=3.995, nll_loss=2.357, ppl=5.12, wps=456435, ups=1.05, wpb=433889, bsz=16953.6, num_updates=45000, lr=0.000298142, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=44363 epoch 027: 1158 / 1689 loss=3.995, nll_loss=2.357, ppl=5.12, wps=456435, ups=1.05, wpb=433889, bsz=16953.6, num_updates=45000, lr=0.000298142, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=44363 epoch 027: 1158 / 1689 loss=3.995, nll_loss=2.357, ppl=5.12, wps=456435, ups=1.05, wpb=433889, bsz=16953.6, num_updates=45000, lr=0.000298142, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=44363 epoch 027: 1158 / 1689 loss=3.995, nll_loss=2.357, ppl=5.12, wps=456435, ups=1.05, wpb=433889, bsz=16953.6, num_updates=45000, lr=0.000298142, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=44363 epoch 027: 1158 / 1689 loss=3.995, nll_loss=2.357, ppl=5.12, wps=456435, ups=1.05, wpb=433889, bsz=16953.6, num_updates=45000, lr=0.000298142, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=44363 epoch 027: 1158 / 1689 loss=3.995, nll_loss=2.357, ppl=5.12, wps=456435, ups=1.05, wpb=433889, bsz=16953.6, num_updates=45000, lr=0.000298142, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=44363 epoch 027: 1158 / 1689 loss=3.995, nll_loss=2.357, ppl=5.12, wps=456435, ups=1.05, wpb=433889, bsz=16953.6, num_updates=45000, lr=0.000298142, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=44363 epoch 027: 1158 / 1689 loss=3.995, nll_loss=2.357, ppl=5.12, wps=456435, ups=1.05, wpb=433889, bsz=16953.6, num_updates=45000, lr=0.000298142, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=44363 epoch 027: 1158 / 1689 loss=3.995, nll_loss=2.357, ppl=5.12, wps=456435, ups=1.05, wpb=433889, bsz=16953.6, num_updates=45000, lr=0.000298142, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=44363 epoch 027: 1158 / 1689 loss=3.995, nll_loss=2.357, ppl=5.12, wps=456435, ups=1.05, wpb=433889, bsz=16953.6, num_updates=45000, lr=0.000298142, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=44363 epoch 027: 1158 / 1689 loss=3.995, nll_loss=2.357, ppl=5.12, wps=456435, ups=1.05, wpb=433889, bsz=16953.6, num_updates=45000, lr=0.000298142, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=44363 epoch 027: 1158 / 1689 loss=3.995, nll_loss=2.357, ppl=5.12, wps=456435, ups=1.05, wpb=433889, bsz=16953.6, num_updates=45000, lr=0.000298142, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=44363 epoch 027: 1158 / 1689 loss=3.995, nll_loss=2.357, ppl=5.12, wps=456435, ups=1.05, wpb=433889, bsz=16953.6, num_updates=45000, lr=0.000298142, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=44363 epoch 027: 1158 / 1689 loss=3.995, nll_loss=2.357, ppl=5.12, wps=456435, ups=1.05, wpb=433889, bsz=16953.6, num_updates=45000, lr=0.000298142, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=44363 epoch 027: 1158 / 1689 loss=3.995, nll_loss=2.357, ppl=5.12, wps=456435, ups=1.05, wpb=433889, bsz=16953.6, num_updates=45000, lr=0.000298142, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=44363 epoch 027: 1158 / 1689 loss=3.995, nll_loss=2.357, ppl=5.12, wps=456435, ups=1.05, wpb=433889, bsz=16953.6, num_updates=45000, lr=0.000298142, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=44363 epoch 027: 1158 / 1689 loss=3.995, nll_loss=2.357, ppl=5.12, wps=456435, ups=1.05, wpb=433889, bsz=16953.6, num_updates=45000, lr=0.000298142, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=44363 epoch 027: 1158 / 1689 loss=3.995, nll_loss=2.357, ppl=5.12, wps=456435, ups=1.05, wpb=433889, bsz=16953.6, num_updates=45000, lr=0.000298142, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=44363 epoch 027: 1158 / 1689 loss=3.995, nll_loss=2.357, ppl=5.12, wps=456435, ups=1.05, wpb=433889, bsz=16953.6, num_updates=45000, lr=0.000298142, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=44363 epoch 027: 1158 / 1689 loss=3.995, nll_loss=2.357, ppl=5.12, wps=456435, ups=1.05, wpb=433889, bsz=16953.6, num_updates=45000, lr=0.000298142, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=44363 epoch 027: 1158 / 1689 loss=3.995, nll_loss=2.357, ppl=5.12, wps=456435, ups=1.05, wpb=433889, bsz=16953.6, num_updates=45000, lr=0.000298142, gnorm=0.224, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=44363 begin validation on "valid" subset epoch 027 | valid on 'valid' subset | loss 4.246 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.245 epoch 027 | valid on 'valid' subset | loss 4.246 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.245 epoch 027 | valid on 'valid' subset | loss 4.246 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.245 epoch 027 | valid on 'valid' subset | loss 4.246 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.245 epoch 027 | valid on 'valid' subset | loss 4.246 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.245 epoch 027 | valid on 'valid' subset | loss 4.246 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.245 epoch 027 | valid on 'valid' subset | loss 4.246 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.245 epoch 027 | valid on 'valid' subset | loss 4.246 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.245 epoch 027 | valid on 'valid' subset | loss 4.246 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.245 epoch 027 | valid on 'valid' subset | loss 4.246 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.245 epoch 027 | valid on 'valid' subset | loss 4.246 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.245 epoch 027 | valid on 'valid' subset | loss 4.246 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.245 epoch 027 | valid on 'valid' subset | loss 4.246 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.245 epoch 027 | valid on 'valid' subset | loss 4.246 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.245 epoch 027 | valid on 'valid' subset | loss 4.246 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.245 epoch 027 | valid on 'valid' subset | loss 4.246 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.245 epoch 027 | valid on 'valid' subset | loss 4.246 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.245 epoch 027 | valid on 'valid' subset | loss 4.246 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.245 epoch 027 | valid on 'valid' subset | loss 4.246 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.245 epoch 027 | valid on 'valid' subset | loss 4.246 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.245 epoch 027 | valid on 'valid' subset | loss 4.246 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.245 epoch 027 | valid on 'valid' subset | loss 4.246 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.245 epoch 027 | valid on 'valid' subset | loss 4.246 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.245 epoch 027 | valid on 'valid' subset | loss 4.246 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.245 epoch 027 | valid on 'valid' subset | loss 4.246 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.245 epoch 027 | valid on 'valid' subset | loss 4.246 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.245 epoch 027 | valid on 'valid' subset | loss 4.246 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 45000 | best_loss 4.245 epoch 027: 1258 / 1689 loss=3.979, nll_loss=2.34, ppl=5.06, wps=402938, ups=0.92, wpb=436565, bsz=16412.4, num_updates=45100, lr=0.000297812, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=44471 epoch 027: 1258 / 1689 loss=3.979, nll_loss=2.34, ppl=5.06, wps=402938, ups=0.92, wpb=436565, bsz=16412.4, num_updates=45100, lr=0.000297812, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=44471 epoch 027: 1258 / 1689 loss=3.979, nll_loss=2.34, ppl=5.06, wps=402938, ups=0.92, wpb=436565, bsz=16412.4, num_updates=45100, lr=0.000297812, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=44471 epoch 027: 1258 / 1689 loss=3.979, nll_loss=2.34, ppl=5.06, wps=402938, ups=0.92, wpb=436565, bsz=16412.4, num_updates=45100, lr=0.000297812, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=44471 epoch 027: 1258 / 1689 loss=3.979, nll_loss=2.34, ppl=5.06, wps=402938, ups=0.92, wpb=436565, bsz=16412.4, num_updates=45100, lr=0.000297812, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=44471 epoch 027: 1258 / 1689 loss=3.979, nll_loss=2.34, ppl=5.06, wps=402938, ups=0.92, wpb=436565, bsz=16412.4, num_updates=45100, lr=0.000297812, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=44471 epoch 027: 1258 / 1689 loss=3.979, nll_loss=2.34, ppl=5.06, wps=402938, ups=0.92, wpb=436565, bsz=16412.4, num_updates=45100, lr=0.000297812, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=44471 epoch 027: 1258 / 1689 loss=3.979, nll_loss=2.34, ppl=5.06, wps=402938, ups=0.92, wpb=436565, bsz=16412.4, num_updates=45100, lr=0.000297812, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=44471 epoch 027: 1258 / 1689 loss=3.979, nll_loss=2.34, ppl=5.06, wps=402938, ups=0.92, wpb=436565, bsz=16412.4, num_updates=45100, lr=0.000297812, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=44471 epoch 027: 1258 / 1689 loss=3.979, nll_loss=2.34, ppl=5.06, wps=402938, ups=0.92, wpb=436565, bsz=16412.4, num_updates=45100, lr=0.000297812, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=44471 epoch 027: 1258 / 1689 loss=3.979, nll_loss=2.34, ppl=5.06, wps=402938, ups=0.92, wpb=436565, bsz=16412.4, num_updates=45100, lr=0.000297812, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=44471 epoch 027: 1258 / 1689 loss=3.979, nll_loss=2.34, ppl=5.06, wps=402938, ups=0.92, wpb=436565, bsz=16412.4, num_updates=45100, lr=0.000297812, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=44471 epoch 027: 1258 / 1689 loss=3.979, nll_loss=2.34, ppl=5.06, wps=402938, ups=0.92, wpb=436565, bsz=16412.4, num_updates=45100, lr=0.000297812, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=44471 epoch 027: 1258 / 1689 loss=3.979, nll_loss=2.34, ppl=5.06, wps=402938, ups=0.92, wpb=436565, bsz=16412.4, num_updates=45100, lr=0.000297812, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=44471 epoch 027: 1258 / 1689 loss=3.979, nll_loss=2.34, ppl=5.06, wps=402938, ups=0.92, wpb=436565, bsz=16412.4, num_updates=45100, lr=0.000297812, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=44471 epoch 027: 1258 / 1689 loss=3.979, nll_loss=2.34, ppl=5.06, wps=402938, ups=0.92, wpb=436565, bsz=16412.4, num_updates=45100, lr=0.000297812, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=44471 epoch 027: 1258 / 1689 loss=3.979, nll_loss=2.34, ppl=5.06, wps=402938, ups=0.92, wpb=436565, bsz=16412.4, num_updates=45100, lr=0.000297812, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=44471 epoch 027: 1258 / 1689 loss=3.979, nll_loss=2.34, ppl=5.06, wps=402938, ups=0.92, wpb=436565, bsz=16412.4, num_updates=45100, lr=0.000297812, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=44471 epoch 027: 1258 / 1689 loss=3.979, nll_loss=2.34, ppl=5.06, wps=402938, ups=0.92, wpb=436565, bsz=16412.4, num_updates=45100, lr=0.000297812, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=44471 epoch 027: 1258 / 1689 loss=3.979, nll_loss=2.34, ppl=5.06, wps=402938, ups=0.92, wpb=436565, bsz=16412.4, num_updates=45100, lr=0.000297812, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=44471 epoch 027: 1258 / 1689 loss=3.979, nll_loss=2.34, ppl=5.06, wps=402938, ups=0.92, wpb=436565, bsz=16412.4, num_updates=45100, lr=0.000297812, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=44471 epoch 027: 1258 / 1689 loss=3.979, nll_loss=2.34, ppl=5.06, wps=402938, ups=0.92, wpb=436565, bsz=16412.4, num_updates=45100, lr=0.000297812, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=44471 epoch 027: 1258 / 1689 loss=3.979, nll_loss=2.34, ppl=5.06, wps=402938, ups=0.92, wpb=436565, bsz=16412.4, num_updates=45100, lr=0.000297812, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=44471 epoch 027: 1258 / 1689 loss=3.979, nll_loss=2.34, ppl=5.06, wps=402938, ups=0.92, wpb=436565, bsz=16412.4, num_updates=45100, lr=0.000297812, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=44471 epoch 027: 1258 / 1689 loss=3.979, nll_loss=2.34, ppl=5.06, wps=402938, ups=0.92, wpb=436565, bsz=16412.4, num_updates=45100, lr=0.000297812, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=44471 epoch 027: 1258 / 1689 loss=3.979, nll_loss=2.34, ppl=5.06, wps=402938, ups=0.92, wpb=436565, bsz=16412.4, num_updates=45100, lr=0.000297812, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=44471 epoch 027: 1258 / 1689 loss=3.979, nll_loss=2.34, ppl=5.06, wps=402938, ups=0.92, wpb=436565, bsz=16412.4, num_updates=45100, lr=0.000297812, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=44471 epoch 027: 1359 / 1689 loss=3.982, nll_loss=2.344, ppl=5.08, wps=456713, ups=1.05, wpb=433319, bsz=16403.8, num_updates=45200, lr=0.000297482, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=44566 epoch 027: 1359 / 1689 loss=3.982, nll_loss=2.344, ppl=5.08, wps=456713, ups=1.05, wpb=433319, bsz=16403.8, num_updates=45200, lr=0.000297482, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=44566 epoch 027: 1359 / 1689 loss=3.982, nll_loss=2.344, ppl=5.08, wps=456713, ups=1.05, wpb=433319, bsz=16403.8, num_updates=45200, lr=0.000297482, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=44566 epoch 027: 1359 / 1689 loss=3.982, nll_loss=2.344, ppl=5.08, wps=456713, ups=1.05, wpb=433319, bsz=16403.8, num_updates=45200, lr=0.000297482, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=44566 epoch 027: 1359 / 1689 loss=3.982, nll_loss=2.344, ppl=5.08, wps=456713, ups=1.05, wpb=433319, bsz=16403.8, num_updates=45200, lr=0.000297482, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=44566 epoch 027: 1359 / 1689 loss=3.982, nll_loss=2.344, ppl=5.08, wps=456713, ups=1.05, wpb=433319, bsz=16403.8, num_updates=45200, lr=0.000297482, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=44566 epoch 027: 1359 / 1689 loss=3.982, nll_loss=2.344, ppl=5.08, wps=456713, ups=1.05, wpb=433319, bsz=16403.8, num_updates=45200, lr=0.000297482, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=44566 epoch 027: 1359 / 1689 loss=3.982, nll_loss=2.344, ppl=5.08, wps=456713, ups=1.05, wpb=433319, bsz=16403.8, num_updates=45200, lr=0.000297482, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=44566 epoch 027: 1359 / 1689 loss=3.982, nll_loss=2.344, ppl=5.08, wps=456713, ups=1.05, wpb=433319, bsz=16403.8, num_updates=45200, lr=0.000297482, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=44566 epoch 027: 1359 / 1689 loss=3.982, nll_loss=2.344, ppl=5.08, wps=456713, ups=1.05, wpb=433319, bsz=16403.8, num_updates=45200, lr=0.000297482, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=44566 epoch 027: 1359 / 1689 loss=3.982, nll_loss=2.344, ppl=5.08, wps=456713, ups=1.05, wpb=433319, bsz=16403.8, num_updates=45200, lr=0.000297482, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=44566 epoch 027: 1359 / 1689 loss=3.982, nll_loss=2.344, ppl=5.08, wps=456713, ups=1.05, wpb=433319, bsz=16403.8, num_updates=45200, lr=0.000297482, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=44566 epoch 027: 1359 / 1689 loss=3.982, nll_loss=2.344, ppl=5.08, wps=456713, ups=1.05, wpb=433319, bsz=16403.8, num_updates=45200, lr=0.000297482, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=44566 epoch 027: 1359 / 1689 loss=3.982, nll_loss=2.344, ppl=5.08, wps=456713, ups=1.05, wpb=433319, bsz=16403.8, num_updates=45200, lr=0.000297482, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=44566 epoch 027: 1359 / 1689 loss=3.982, nll_loss=2.344, ppl=5.08, wps=456713, ups=1.05, wpb=433319, bsz=16403.8, num_updates=45200, lr=0.000297482, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=44566 epoch 027: 1359 / 1689 loss=3.982, nll_loss=2.344, ppl=5.08, wps=456713, ups=1.05, wpb=433319, bsz=16403.8, num_updates=45200, lr=0.000297482, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=44566 epoch 027: 1359 / 1689 loss=3.982, nll_loss=2.344, ppl=5.08, wps=456713, ups=1.05, wpb=433319, bsz=16403.8, num_updates=45200, lr=0.000297482, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=44566 epoch 027: 1359 / 1689 loss=3.982, nll_loss=2.344, ppl=5.08, wps=456713, ups=1.05, wpb=433319, bsz=16403.8, num_updates=45200, lr=0.000297482, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=44566 epoch 027: 1359 / 1689 loss=3.982, nll_loss=2.344, ppl=5.08, wps=456713, ups=1.05, wpb=433319, bsz=16403.8, num_updates=45200, lr=0.000297482, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=44566 epoch 027: 1359 / 1689 loss=3.982, nll_loss=2.344, ppl=5.08, wps=456713, ups=1.05, wpb=433319, bsz=16403.8, num_updates=45200, lr=0.000297482, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=44566 epoch 027: 1359 / 1689 loss=3.982, nll_loss=2.344, ppl=5.08, wps=456713, ups=1.05, wpb=433319, bsz=16403.8, num_updates=45200, lr=0.000297482, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=44566 epoch 027: 1359 / 1689 loss=3.982, nll_loss=2.344, ppl=5.08, wps=456713, ups=1.05, wpb=433319, bsz=16403.8, num_updates=45200, lr=0.000297482, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=44566 epoch 027: 1359 / 1689 loss=3.982, nll_loss=2.344, ppl=5.08, wps=456713, ups=1.05, wpb=433319, bsz=16403.8, num_updates=45200, lr=0.000297482, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=44566 epoch 027: 1359 / 1689 loss=3.982, nll_loss=2.344, ppl=5.08, wps=456713, ups=1.05, wpb=433319, bsz=16403.8, num_updates=45200, lr=0.000297482, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=44566 epoch 027: 1359 / 1689 loss=3.982, nll_loss=2.344, ppl=5.08, wps=456713, ups=1.05, wpb=433319, bsz=16403.8, num_updates=45200, lr=0.000297482, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=44566 epoch 027: 1359 / 1689 loss=3.982, nll_loss=2.344, ppl=5.08, wps=456713, ups=1.05, wpb=433319, bsz=16403.8, num_updates=45200, lr=0.000297482, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=44566 epoch 027: 1359 / 1689 loss=3.982, nll_loss=2.344, ppl=5.08, wps=456713, ups=1.05, wpb=433319, bsz=16403.8, num_updates=45200, lr=0.000297482, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=44566 epoch 027: 1459 / 1689 loss=3.972, nll_loss=2.333, ppl=5.04, wps=459765, ups=1.06, wpb=431776, bsz=16502.4, num_updates=45300, lr=0.000297154, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=44660 epoch 027: 1459 / 1689 loss=3.972, nll_loss=2.333, ppl=5.04, wps=459765, ups=1.06, wpb=431776, bsz=16502.4, num_updates=45300, lr=0.000297154, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=44660 epoch 027: 1459 / 1689 loss=3.972, nll_loss=2.333, ppl=5.04, wps=459765, ups=1.06, wpb=431776, bsz=16502.4, num_updates=45300, lr=0.000297154, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=44660 epoch 027: 1459 / 1689 loss=3.972, nll_loss=2.333, ppl=5.04, wps=459765, ups=1.06, wpb=431776, bsz=16502.4, num_updates=45300, lr=0.000297154, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=44660 epoch 027: 1459 / 1689 loss=3.972, nll_loss=2.333, ppl=5.04, wps=459765, ups=1.06, wpb=431776, bsz=16502.4, num_updates=45300, lr=0.000297154, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=44660 epoch 027: 1459 / 1689 loss=3.972, nll_loss=2.333, ppl=5.04, wps=459765, ups=1.06, wpb=431776, bsz=16502.4, num_updates=45300, lr=0.000297154, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=44660 epoch 027: 1459 / 1689 loss=3.972, nll_loss=2.333, ppl=5.04, wps=459765, ups=1.06, wpb=431776, bsz=16502.4, num_updates=45300, lr=0.000297154, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=44660 epoch 027: 1459 / 1689 loss=3.972, nll_loss=2.333, ppl=5.04, wps=459765, ups=1.06, wpb=431776, bsz=16502.4, num_updates=45300, lr=0.000297154, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=44660 epoch 027: 1459 / 1689 loss=3.972, nll_loss=2.333, ppl=5.04, wps=459765, ups=1.06, wpb=431776, bsz=16502.4, num_updates=45300, lr=0.000297154, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=44660 epoch 027: 1459 / 1689 loss=3.972, nll_loss=2.333, ppl=5.04, wps=459765, ups=1.06, wpb=431776, bsz=16502.4, num_updates=45300, lr=0.000297154, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=44660 epoch 027: 1459 / 1689 loss=3.972, nll_loss=2.333, ppl=5.04, wps=459765, ups=1.06, wpb=431776, bsz=16502.4, num_updates=45300, lr=0.000297154, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=44660 epoch 027: 1459 / 1689 loss=3.972, nll_loss=2.333, ppl=5.04, wps=459765, ups=1.06, wpb=431776, bsz=16502.4, num_updates=45300, lr=0.000297154, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=44660 epoch 027: 1459 / 1689 loss=3.972, nll_loss=2.333, ppl=5.04, wps=459765, ups=1.06, wpb=431776, bsz=16502.4, num_updates=45300, lr=0.000297154, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=44660 epoch 027: 1459 / 1689 loss=3.972, nll_loss=2.333, ppl=5.04, wps=459765, ups=1.06, wpb=431776, bsz=16502.4, num_updates=45300, lr=0.000297154, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=44660 epoch 027: 1459 / 1689 loss=3.972, nll_loss=2.333, ppl=5.04, wps=459765, ups=1.06, wpb=431776, bsz=16502.4, num_updates=45300, lr=0.000297154, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=44660 epoch 027: 1459 / 1689 loss=3.972, nll_loss=2.333, ppl=5.04, wps=459765, ups=1.06, wpb=431776, bsz=16502.4, num_updates=45300, lr=0.000297154, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=44660 epoch 027: 1459 / 1689 loss=3.972, nll_loss=2.333, ppl=5.04, wps=459765, ups=1.06, wpb=431776, bsz=16502.4, num_updates=45300, lr=0.000297154, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=44660 epoch 027: 1459 / 1689 loss=3.972, nll_loss=2.333, ppl=5.04, wps=459765, ups=1.06, wpb=431776, bsz=16502.4, num_updates=45300, lr=0.000297154, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=44660 epoch 027: 1459 / 1689 loss=3.972, nll_loss=2.333, ppl=5.04, wps=459765, ups=1.06, wpb=431776, bsz=16502.4, num_updates=45300, lr=0.000297154, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=44660 epoch 027: 1459 / 1689 loss=3.972, nll_loss=2.333, ppl=5.04, wps=459765, ups=1.06, wpb=431776, bsz=16502.4, num_updates=45300, lr=0.000297154, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=44660 epoch 027: 1459 / 1689 loss=3.972, nll_loss=2.333, ppl=5.04, wps=459765, ups=1.06, wpb=431776, bsz=16502.4, num_updates=45300, lr=0.000297154, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=44660 epoch 027: 1459 / 1689 loss=3.972, nll_loss=2.333, ppl=5.04, wps=459765, ups=1.06, wpb=431776, bsz=16502.4, num_updates=45300, lr=0.000297154, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=44660 epoch 027: 1459 / 1689 loss=3.972, nll_loss=2.333, ppl=5.04, wps=459765, ups=1.06, wpb=431776, bsz=16502.4, num_updates=45300, lr=0.000297154, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=44660 epoch 027: 1459 / 1689 loss=3.972, nll_loss=2.333, ppl=5.04, wps=459765, ups=1.06, wpb=431776, bsz=16502.4, num_updates=45300, lr=0.000297154, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=44660 epoch 027: 1459 / 1689 loss=3.972, nll_loss=2.333, ppl=5.04, wps=459765, ups=1.06, wpb=431776, bsz=16502.4, num_updates=45300, lr=0.000297154, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=44660 epoch 027: 1459 / 1689 loss=3.972, nll_loss=2.333, ppl=5.04, wps=459765, ups=1.06, wpb=431776, bsz=16502.4, num_updates=45300, lr=0.000297154, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=44660 epoch 027: 1459 / 1689 loss=3.972, nll_loss=2.333, ppl=5.04, wps=459765, ups=1.06, wpb=431776, bsz=16502.4, num_updates=45300, lr=0.000297154, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=44660 epoch 027: 1559 / 1689 loss=3.986, nll_loss=2.348, ppl=5.09, wps=462053, ups=1.07, wpb=432203, bsz=16428.3, num_updates=45400, lr=0.000296826, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=44753 epoch 027: 1559 / 1689 loss=3.986, nll_loss=2.348, ppl=5.09, wps=462053, ups=1.07, wpb=432203, bsz=16428.3, num_updates=45400, lr=0.000296826, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=44753 epoch 027: 1559 / 1689 loss=3.986, nll_loss=2.348, ppl=5.09, wps=462053, ups=1.07, wpb=432203, bsz=16428.3, num_updates=45400, lr=0.000296826, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=44753 epoch 027: 1559 / 1689 loss=3.986, nll_loss=2.348, ppl=5.09, wps=462053, ups=1.07, wpb=432203, bsz=16428.3, num_updates=45400, lr=0.000296826, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=44753 epoch 027: 1559 / 1689 loss=3.986, nll_loss=2.348, ppl=5.09, wps=462053, ups=1.07, wpb=432203, bsz=16428.3, num_updates=45400, lr=0.000296826, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=44753 epoch 027: 1559 / 1689 loss=3.986, nll_loss=2.348, ppl=5.09, wps=462053, ups=1.07, wpb=432203, bsz=16428.3, num_updates=45400, lr=0.000296826, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=44753 epoch 027: 1559 / 1689 loss=3.986, nll_loss=2.348, ppl=5.09, wps=462053, ups=1.07, wpb=432203, bsz=16428.3, num_updates=45400, lr=0.000296826, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=44753 epoch 027: 1559 / 1689 loss=3.986, nll_loss=2.348, ppl=5.09, wps=462053, ups=1.07, wpb=432203, bsz=16428.3, num_updates=45400, lr=0.000296826, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=44753 epoch 027: 1559 / 1689 loss=3.986, nll_loss=2.348, ppl=5.09, wps=462053, ups=1.07, wpb=432203, bsz=16428.3, num_updates=45400, lr=0.000296826, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=44753 epoch 027: 1559 / 1689 loss=3.986, nll_loss=2.348, ppl=5.09, wps=462053, ups=1.07, wpb=432203, bsz=16428.3, num_updates=45400, lr=0.000296826, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=44753 epoch 027: 1559 / 1689 loss=3.986, nll_loss=2.348, ppl=5.09, wps=462053, ups=1.07, wpb=432203, bsz=16428.3, num_updates=45400, lr=0.000296826, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=44753 epoch 027: 1559 / 1689 loss=3.986, nll_loss=2.348, ppl=5.09, wps=462053, ups=1.07, wpb=432203, bsz=16428.3, num_updates=45400, lr=0.000296826, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=44753 epoch 027: 1559 / 1689 loss=3.986, nll_loss=2.348, ppl=5.09, wps=462053, ups=1.07, wpb=432203, bsz=16428.3, num_updates=45400, lr=0.000296826, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=44753 epoch 027: 1559 / 1689 loss=3.986, nll_loss=2.348, ppl=5.09, wps=462053, ups=1.07, wpb=432203, bsz=16428.3, num_updates=45400, lr=0.000296826, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=44753 epoch 027: 1559 / 1689 loss=3.986, nll_loss=2.348, ppl=5.09, wps=462053, ups=1.07, wpb=432203, bsz=16428.3, num_updates=45400, lr=0.000296826, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=44753 epoch 027: 1559 / 1689 loss=3.986, nll_loss=2.348, ppl=5.09, wps=462053, ups=1.07, wpb=432203, bsz=16428.3, num_updates=45400, lr=0.000296826, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=44753 epoch 027: 1559 / 1689 loss=3.986, nll_loss=2.348, ppl=5.09, wps=462053, ups=1.07, wpb=432203, bsz=16428.3, num_updates=45400, lr=0.000296826, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=44753 epoch 027: 1559 / 1689 loss=3.986, nll_loss=2.348, ppl=5.09, wps=462053, ups=1.07, wpb=432203, bsz=16428.3, num_updates=45400, lr=0.000296826, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=44753 epoch 027: 1559 / 1689 loss=3.986, nll_loss=2.348, ppl=5.09, wps=462053, ups=1.07, wpb=432203, bsz=16428.3, num_updates=45400, lr=0.000296826, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=44753 epoch 027: 1559 / 1689 loss=3.986, nll_loss=2.348, ppl=5.09, wps=462053, ups=1.07, wpb=432203, bsz=16428.3, num_updates=45400, lr=0.000296826, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=44753 epoch 027: 1559 / 1689 loss=3.986, nll_loss=2.348, ppl=5.09, wps=462053, ups=1.07, wpb=432203, bsz=16428.3, num_updates=45400, lr=0.000296826, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=44753 epoch 027: 1559 / 1689 loss=3.986, nll_loss=2.348, ppl=5.09, wps=462053, ups=1.07, wpb=432203, bsz=16428.3, num_updates=45400, lr=0.000296826, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=44753 epoch 027: 1559 / 1689 loss=3.986, nll_loss=2.348, ppl=5.09, wps=462053, ups=1.07, wpb=432203, bsz=16428.3, num_updates=45400, lr=0.000296826, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=44753 epoch 027: 1559 / 1689 loss=3.986, nll_loss=2.348, ppl=5.09, wps=462053, ups=1.07, wpb=432203, bsz=16428.3, num_updates=45400, lr=0.000296826, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=44753 epoch 027: 1559 / 1689 loss=3.986, nll_loss=2.348, ppl=5.09, wps=462053, ups=1.07, wpb=432203, bsz=16428.3, num_updates=45400, lr=0.000296826, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=44753 epoch 027: 1559 / 1689 loss=3.986, nll_loss=2.348, ppl=5.09, wps=462053, ups=1.07, wpb=432203, bsz=16428.3, num_updates=45400, lr=0.000296826, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=44753 epoch 027: 1559 / 1689 loss=3.986, nll_loss=2.348, ppl=5.09, wps=462053, ups=1.07, wpb=432203, bsz=16428.3, num_updates=45400, lr=0.000296826, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.4, wall=44753 epoch 027: 1659 / 1689 loss=3.996, nll_loss=2.359, ppl=5.13, wps=461133, ups=1.07, wpb=432680, bsz=16673.2, num_updates=45500, lr=0.0002965, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44847 epoch 027: 1659 / 1689 loss=3.996, nll_loss=2.359, ppl=5.13, wps=461133, ups=1.07, wpb=432680, bsz=16673.2, num_updates=45500, lr=0.0002965, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44847 epoch 027: 1659 / 1689 loss=3.996, nll_loss=2.359, ppl=5.13, wps=461133, ups=1.07, wpb=432680, bsz=16673.2, num_updates=45500, lr=0.0002965, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44847 epoch 027: 1659 / 1689 loss=3.996, nll_loss=2.359, ppl=5.13, wps=461133, ups=1.07, wpb=432680, bsz=16673.2, num_updates=45500, lr=0.0002965, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44847 epoch 027: 1659 / 1689 loss=3.996, nll_loss=2.359, ppl=5.13, wps=461133, ups=1.07, wpb=432680, bsz=16673.2, num_updates=45500, lr=0.0002965, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44847 epoch 027: 1659 / 1689 loss=3.996, nll_loss=2.359, ppl=5.13, wps=461133, ups=1.07, wpb=432680, bsz=16673.2, num_updates=45500, lr=0.0002965, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44847 epoch 027: 1659 / 1689 loss=3.996, nll_loss=2.359, ppl=5.13, wps=461133, ups=1.07, wpb=432680, bsz=16673.2, num_updates=45500, lr=0.0002965, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44847 epoch 027: 1659 / 1689 loss=3.996, nll_loss=2.359, ppl=5.13, wps=461133, ups=1.07, wpb=432680, bsz=16673.2, num_updates=45500, lr=0.0002965, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44847 epoch 027: 1659 / 1689 loss=3.996, nll_loss=2.359, ppl=5.13, wps=461133, ups=1.07, wpb=432680, bsz=16673.2, num_updates=45500, lr=0.0002965, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44847 epoch 027: 1659 / 1689 loss=3.996, nll_loss=2.359, ppl=5.13, wps=461133, ups=1.07, wpb=432680, bsz=16673.2, num_updates=45500, lr=0.0002965, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44847 epoch 027: 1659 / 1689 loss=3.996, nll_loss=2.359, ppl=5.13, wps=461133, ups=1.07, wpb=432680, bsz=16673.2, num_updates=45500, lr=0.0002965, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44847 epoch 027: 1659 / 1689 loss=3.996, nll_loss=2.359, ppl=5.13, wps=461133, ups=1.07, wpb=432680, bsz=16673.2, num_updates=45500, lr=0.0002965, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44847 epoch 027: 1659 / 1689 loss=3.996, nll_loss=2.359, ppl=5.13, wps=461133, ups=1.07, wpb=432680, bsz=16673.2, num_updates=45500, lr=0.0002965, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44847 epoch 027: 1659 / 1689 loss=3.996, nll_loss=2.359, ppl=5.13, wps=461133, ups=1.07, wpb=432680, bsz=16673.2, num_updates=45500, lr=0.0002965, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44847 epoch 027: 1659 / 1689 loss=3.996, nll_loss=2.359, ppl=5.13, wps=461133, ups=1.07, wpb=432680, bsz=16673.2, num_updates=45500, lr=0.0002965, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44847 epoch 027: 1659 / 1689 loss=3.996, nll_loss=2.359, ppl=5.13, wps=461133, ups=1.07, wpb=432680, bsz=16673.2, num_updates=45500, lr=0.0002965, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44847 epoch 027: 1659 / 1689 loss=3.996, nll_loss=2.359, ppl=5.13, wps=461133, ups=1.07, wpb=432680, bsz=16673.2, num_updates=45500, lr=0.0002965, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44847 epoch 027: 1659 / 1689 loss=3.996, nll_loss=2.359, ppl=5.13, wps=461133, ups=1.07, wpb=432680, bsz=16673.2, num_updates=45500, lr=0.0002965, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44847 epoch 027: 1659 / 1689 loss=3.996, nll_loss=2.359, ppl=5.13, wps=461133, ups=1.07, wpb=432680, bsz=16673.2, num_updates=45500, lr=0.0002965, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44847 epoch 027: 1659 / 1689 loss=3.996, nll_loss=2.359, ppl=5.13, wps=461133, ups=1.07, wpb=432680, bsz=16673.2, num_updates=45500, lr=0.0002965, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44847 epoch 027: 1659 / 1689 loss=3.996, nll_loss=2.359, ppl=5.13, wps=461133, ups=1.07, wpb=432680, bsz=16673.2, num_updates=45500, lr=0.0002965, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44847 epoch 027: 1659 / 1689 loss=3.996, nll_loss=2.359, ppl=5.13, wps=461133, ups=1.07, wpb=432680, bsz=16673.2, num_updates=45500, lr=0.0002965, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44847 epoch 027: 1659 / 1689 loss=3.996, nll_loss=2.359, ppl=5.13, wps=461133, ups=1.07, wpb=432680, bsz=16673.2, num_updates=45500, lr=0.0002965, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44847 epoch 027: 1659 / 1689 loss=3.996, nll_loss=2.359, ppl=5.13, wps=461133, ups=1.07, wpb=432680, bsz=16673.2, num_updates=45500, lr=0.0002965, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44847 epoch 027: 1659 / 1689 loss=3.996, nll_loss=2.359, ppl=5.13, wps=461133, ups=1.07, wpb=432680, bsz=16673.2, num_updates=45500, lr=0.0002965, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44847 epoch 027: 1659 / 1689 loss=3.996, nll_loss=2.359, ppl=5.13, wps=461133, ups=1.07, wpb=432680, bsz=16673.2, num_updates=45500, lr=0.0002965, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44847 epoch 027: 1659 / 1689 loss=3.996, nll_loss=2.359, ppl=5.13, wps=461133, ups=1.07, wpb=432680, bsz=16673.2, num_updates=45500, lr=0.0002965, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44847 end of epoch 27 (average epoch stats below) epoch 027 | loss 3.976 | nll_loss 2.336 | ppl 5.05 | wps 448781 | ups 1.04 | wpb 433518 | bsz 16504.1 | num_updates 45530 | lr 0.000296402 | gnorm 0.216 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 20.6 | wall 44875 epoch 027 | loss 3.976 | nll_loss 2.336 | ppl 5.05 | wps 448781 | ups 1.04 | wpb 433518 | bsz 16504.1 | num_updates 45530 | lr 0.000296402 | gnorm 0.216 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 20.6 | wall 44875 epoch 027 | loss 3.976 | nll_loss 2.336 | ppl 5.05 | wps 448781 | ups 1.04 | wpb 433518 | bsz 16504.1 | num_updates 45530 | lr 0.000296402 | gnorm 0.216 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 20.6 | wall 44875 epoch 027 | loss 3.976 | nll_loss 2.336 | ppl 5.05 | wps 448781 | ups 1.04 | wpb 433518 | bsz 16504.1 | num_updates 45530 | lr 0.000296402 | gnorm 0.216 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 20.6 | wall 44875 epoch 027 | loss 3.976 | nll_loss 2.336 | ppl 5.05 | wps 448781 | ups 1.04 | wpb 433518 | bsz 16504.1 | num_updates 45530 | lr 0.000296402 | gnorm 0.216 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 20.6 | wall 44875 epoch 027 | loss 3.976 | nll_loss 2.336 | ppl 5.05 | wps 448781 | ups 1.04 | wpb 433518 | bsz 16504.1 | num_updates 45530 | lr 0.000296402 | gnorm 0.216 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 20.6 | wall 44875 epoch 027 | loss 3.976 | nll_loss 2.336 | ppl 5.05 | wps 448781 | ups 1.04 | wpb 433518 | bsz 16504.1 | num_updates 45530 | lr 0.000296402 | gnorm 0.216 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 20.6 | wall 44875 epoch 027 | loss 3.976 | nll_loss 2.336 | ppl 5.05 | wps 448781 | ups 1.04 | wpb 433518 | bsz 16504.1 | num_updates 45530 | lr 0.000296402 | gnorm 0.216 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 20.6 | wall 44875 epoch 027 | loss 3.976 | nll_loss 2.336 | ppl 5.05 | wps 448781 | ups 1.04 | wpb 433518 | bsz 16504.1 | num_updates 45530 | lr 0.000296402 | gnorm 0.216 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 20.6 | wall 44875 epoch 027 | loss 3.976 | nll_loss 2.336 | ppl 5.05 | wps 448781 | ups 1.04 | wpb 433518 | bsz 16504.1 | num_updates 45530 | lr 0.000296402 | gnorm 0.216 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 20.6 | wall 44875 epoch 027 | loss 3.976 | nll_loss 2.336 | ppl 5.05 | wps 448781 | ups 1.04 | wpb 433518 | bsz 16504.1 | num_updates 45530 | lr 0.000296402 | gnorm 0.216 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 20.6 | wall 44875 epoch 027 | loss 3.976 | nll_loss 2.336 | ppl 5.05 | wps 448781 | ups 1.04 | wpb 433518 | bsz 16504.1 | num_updates 45530 | lr 0.000296402 | gnorm 0.216 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 20.6 | wall 44875 epoch 027 | loss 3.976 | nll_loss 2.336 | ppl 5.05 | wps 448781 | ups 1.04 | wpb 433518 | bsz 16504.1 | num_updates 45530 | lr 0.000296402 | gnorm 0.216 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 20.6 | wall 44875 epoch 027 | loss 3.976 | nll_loss 2.336 | ppl 5.05 | wps 448781 | ups 1.04 | wpb 433518 | bsz 16504.1 | num_updates 45530 | lr 0.000296402 | gnorm 0.216 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 20.6 | wall 44875 epoch 027 | loss 3.976 | nll_loss 2.336 | ppl 5.05 | wps 448781 | ups 1.04 | wpb 433518 | bsz 16504.1 | num_updates 45530 | lr 0.000296402 | gnorm 0.216 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 20.6 | wall 44875 epoch 027 | loss 3.976 | nll_loss 2.336 | ppl 5.05 | wps 448781 | ups 1.04 | wpb 433518 | bsz 16504.1 | num_updates 45530 | lr 0.000296402 | gnorm 0.216 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 20.6 | wall 44875 epoch 027 | loss 3.976 | nll_loss 2.336 | ppl 5.05 | wps 448781 | ups 1.04 | wpb 433518 | bsz 16504.1 | num_updates 45530 | lr 0.000296402 | gnorm 0.216 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 20.6 | wall 44875 epoch 027 | loss 3.976 | nll_loss 2.336 | ppl 5.05 | wps 448781 | ups 1.04 | wpb 433518 | bsz 16504.1 | num_updates 45530 | lr 0.000296402 | gnorm 0.216 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 20.6 | wall 44875 epoch 027 | loss 3.976 | nll_loss 2.336 | ppl 5.05 | wps 448781 | ups 1.04 | wpb 433518 | bsz 16504.1 | num_updates 45530 | lr 0.000296402 | gnorm 0.216 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 20.6 | wall 44875 epoch 027 | loss 3.976 | nll_loss 2.336 | ppl 5.05 | wps 448781 | ups 1.04 | wpb 433518 | bsz 16504.1 | num_updates 45530 | lr 0.000296402 | gnorm 0.216 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 20.6 | wall 44875 epoch 027 | loss 3.976 | nll_loss 2.336 | ppl 5.05 | wps 448781 | ups 1.04 | wpb 433518 | bsz 16504.1 | num_updates 45530 | lr 0.000296402 | gnorm 0.216 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 20.6 | wall 44875 epoch 027 | loss 3.976 | nll_loss 2.336 | ppl 5.05 | wps 448781 | ups 1.04 | wpb 433518 | bsz 16504.1 | num_updates 45530 | lr 0.000296402 | gnorm 0.216 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 20.6 | wall 44875 epoch 027 | loss 3.976 | nll_loss 2.336 | ppl 5.05 | wps 448781 | ups 1.04 | wpb 433518 | bsz 16504.1 | num_updates 45530 | lr 0.000296402 | gnorm 0.216 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 20.6 | wall 44875 epoch 027 | loss 3.976 | nll_loss 2.336 | ppl 5.05 | wps 448781 | ups 1.04 | wpb 433518 | bsz 16504.1 | num_updates 45530 | lr 0.000296402 | gnorm 0.216 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 20.6 | wall 44875 epoch 027 | loss 3.976 | nll_loss 2.336 | ppl 5.05 | wps 448781 | ups 1.04 | wpb 433518 | bsz 16504.1 | num_updates 45530 | lr 0.000296402 | gnorm 0.216 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 20.6 | wall 44875 epoch 027 | loss 3.976 | nll_loss 2.336 | ppl 5.05 | wps 448781 | ups 1.04 | wpb 433518 | bsz 16504.1 | num_updates 45530 | lr 0.000296402 | gnorm 0.216 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 20.6 | wall 44875 epoch 027 | loss 3.976 | nll_loss 2.336 | ppl 5.05 | wps 448781 | ups 1.04 | wpb 433518 | bsz 16504.1 | num_updates 45530 | lr 0.000296402 | gnorm 0.216 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 20.6 | wall 44875 Start iterating over samples epoch 028: 70 / 1689 loss=3.965, nll_loss=2.323, ppl=5, wps=456366, ups=1.06, wpb=431036, bsz=16548.3, num_updates=45600, lr=0.000296174, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44942 epoch 028: 70 / 1689 loss=3.965, nll_loss=2.323, ppl=5, wps=456366, ups=1.06, wpb=431036, bsz=16548.3, num_updates=45600, lr=0.000296174, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44942 epoch 028: 70 / 1689 loss=3.965, nll_loss=2.323, ppl=5, wps=456366, ups=1.06, wpb=431036, bsz=16548.3, num_updates=45600, lr=0.000296174, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44942 epoch 028: 70 / 1689 loss=3.965, nll_loss=2.323, ppl=5, wps=456366, ups=1.06, wpb=431036, bsz=16548.3, num_updates=45600, lr=0.000296174, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44942 epoch 028: 70 / 1689 loss=3.965, nll_loss=2.323, ppl=5, wps=456366, ups=1.06, wpb=431036, bsz=16548.3, num_updates=45600, lr=0.000296174, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44942 epoch 028: 70 / 1689 loss=3.965, nll_loss=2.323, ppl=5, wps=456366, ups=1.06, wpb=431036, bsz=16548.3, num_updates=45600, lr=0.000296174, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44942 epoch 028: 70 / 1689 loss=3.965, nll_loss=2.323, ppl=5, wps=456366, ups=1.06, wpb=431036, bsz=16548.3, num_updates=45600, lr=0.000296174, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44942 epoch 028: 70 / 1689 loss=3.965, nll_loss=2.323, ppl=5, wps=456366, ups=1.06, wpb=431036, bsz=16548.3, num_updates=45600, lr=0.000296174, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44942 epoch 028: 70 / 1689 loss=3.965, nll_loss=2.323, ppl=5, wps=456366, ups=1.06, wpb=431036, bsz=16548.3, num_updates=45600, lr=0.000296174, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44942 epoch 028: 70 / 1689 loss=3.965, nll_loss=2.323, ppl=5, wps=456366, ups=1.06, wpb=431036, bsz=16548.3, num_updates=45600, lr=0.000296174, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44942 epoch 028: 70 / 1689 loss=3.965, nll_loss=2.323, ppl=5, wps=456366, ups=1.06, wpb=431036, bsz=16548.3, num_updates=45600, lr=0.000296174, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44942 epoch 028: 70 / 1689 loss=3.965, nll_loss=2.323, ppl=5, wps=456366, ups=1.06, wpb=431036, bsz=16548.3, num_updates=45600, lr=0.000296174, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44942 epoch 028: 70 / 1689 loss=3.965, nll_loss=2.323, ppl=5, wps=456366, ups=1.06, wpb=431036, bsz=16548.3, num_updates=45600, lr=0.000296174, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44942 epoch 028: 70 / 1689 loss=3.965, nll_loss=2.323, ppl=5, wps=456366, ups=1.06, wpb=431036, bsz=16548.3, num_updates=45600, lr=0.000296174, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44942 epoch 028: 70 / 1689 loss=3.965, nll_loss=2.323, ppl=5, wps=456366, ups=1.06, wpb=431036, bsz=16548.3, num_updates=45600, lr=0.000296174, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44942 epoch 028: 70 / 1689 loss=3.965, nll_loss=2.323, ppl=5, wps=456366, ups=1.06, wpb=431036, bsz=16548.3, num_updates=45600, lr=0.000296174, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44942 epoch 028: 70 / 1689 loss=3.965, nll_loss=2.323, ppl=5, wps=456366, ups=1.06, wpb=431036, bsz=16548.3, num_updates=45600, lr=0.000296174, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44942 epoch 028: 70 / 1689 loss=3.965, nll_loss=2.323, ppl=5, wps=456366, ups=1.06, wpb=431036, bsz=16548.3, num_updates=45600, lr=0.000296174, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44942 epoch 028: 70 / 1689 loss=3.965, nll_loss=2.323, ppl=5, wps=456366, ups=1.06, wpb=431036, bsz=16548.3, num_updates=45600, lr=0.000296174, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44942 epoch 028: 70 / 1689 loss=3.965, nll_loss=2.323, ppl=5, wps=456366, ups=1.06, wpb=431036, bsz=16548.3, num_updates=45600, lr=0.000296174, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44942 epoch 028: 70 / 1689 loss=3.965, nll_loss=2.323, ppl=5, wps=456366, ups=1.06, wpb=431036, bsz=16548.3, num_updates=45600, lr=0.000296174, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44942 epoch 028: 70 / 1689 loss=3.965, nll_loss=2.323, ppl=5, wps=456366, ups=1.06, wpb=431036, bsz=16548.3, num_updates=45600, lr=0.000296174, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44942 epoch 028: 70 / 1689 loss=3.965, nll_loss=2.323, ppl=5, wps=456366, ups=1.06, wpb=431036, bsz=16548.3, num_updates=45600, lr=0.000296174, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44942 epoch 028: 70 / 1689 loss=3.965, nll_loss=2.323, ppl=5, wps=456366, ups=1.06, wpb=431036, bsz=16548.3, num_updates=45600, lr=0.000296174, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44942 epoch 028: 70 / 1689 loss=3.965, nll_loss=2.323, ppl=5, wps=456366, ups=1.06, wpb=431036, bsz=16548.3, num_updates=45600, lr=0.000296174, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44942 epoch 028: 70 / 1689 loss=3.965, nll_loss=2.323, ppl=5, wps=456366, ups=1.06, wpb=431036, bsz=16548.3, num_updates=45600, lr=0.000296174, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44942 epoch 028: 70 / 1689 loss=3.965, nll_loss=2.323, ppl=5, wps=456366, ups=1.06, wpb=431036, bsz=16548.3, num_updates=45600, lr=0.000296174, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44942 epoch 028: 70 / 1689 loss=3.965, nll_loss=2.323, ppl=5, wps=456366, ups=1.06, wpb=431036, bsz=16548.3, num_updates=45600, lr=0.000296174, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=44942 epoch 028: 170 / 1689 loss=3.947, nll_loss=2.303, ppl=4.93, wps=460710, ups=1.06, wpb=434184, bsz=16512.2, num_updates=45700, lr=0.00029585, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=45036 epoch 028: 170 / 1689 loss=3.947, nll_loss=2.303, ppl=4.93, wps=460710, ups=1.06, wpb=434184, bsz=16512.2, num_updates=45700, lr=0.00029585, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=45036 epoch 028: 170 / 1689 loss=3.947, nll_loss=2.303, ppl=4.93, wps=460710, ups=1.06, wpb=434184, bsz=16512.2, num_updates=45700, lr=0.00029585, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=45036 epoch 028: 170 / 1689 loss=3.947, nll_loss=2.303, ppl=4.93, wps=460710, ups=1.06, wpb=434184, bsz=16512.2, num_updates=45700, lr=0.00029585, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=45036 epoch 028: 170 / 1689 loss=3.947, nll_loss=2.303, ppl=4.93, wps=460710, ups=1.06, wpb=434184, bsz=16512.2, num_updates=45700, lr=0.00029585, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=45036 epoch 028: 170 / 1689 loss=3.947, nll_loss=2.303, ppl=4.93, wps=460710, ups=1.06, wpb=434184, bsz=16512.2, num_updates=45700, lr=0.00029585, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=45036 epoch 028: 170 / 1689 loss=3.947, nll_loss=2.303, ppl=4.93, wps=460710, ups=1.06, wpb=434184, bsz=16512.2, num_updates=45700, lr=0.00029585, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=45036 epoch 028: 170 / 1689 loss=3.947, nll_loss=2.303, ppl=4.93, wps=460710, ups=1.06, wpb=434184, bsz=16512.2, num_updates=45700, lr=0.00029585, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=45036 epoch 028: 170 / 1689 loss=3.947, nll_loss=2.303, ppl=4.93, wps=460710, ups=1.06, wpb=434184, bsz=16512.2, num_updates=45700, lr=0.00029585, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=45036 epoch 028: 170 / 1689 loss=3.947, nll_loss=2.303, ppl=4.93, wps=460710, ups=1.06, wpb=434184, bsz=16512.2, num_updates=45700, lr=0.00029585, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=45036 epoch 028: 170 / 1689 loss=3.947, nll_loss=2.303, ppl=4.93, wps=460710, ups=1.06, wpb=434184, bsz=16512.2, num_updates=45700, lr=0.00029585, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=45036 epoch 028: 170 / 1689 loss=3.947, nll_loss=2.303, ppl=4.93, wps=460710, ups=1.06, wpb=434184, bsz=16512.2, num_updates=45700, lr=0.00029585, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=45036 epoch 028: 170 / 1689 loss=3.947, nll_loss=2.303, ppl=4.93, wps=460710, ups=1.06, wpb=434184, bsz=16512.2, num_updates=45700, lr=0.00029585, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=45036 epoch 028: 170 / 1689 loss=3.947, nll_loss=2.303, ppl=4.93, wps=460710, ups=1.06, wpb=434184, bsz=16512.2, num_updates=45700, lr=0.00029585, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=45036 epoch 028: 170 / 1689 loss=3.947, nll_loss=2.303, ppl=4.93, wps=460710, ups=1.06, wpb=434184, bsz=16512.2, num_updates=45700, lr=0.00029585, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=45036 epoch 028: 170 / 1689 loss=3.947, nll_loss=2.303, ppl=4.93, wps=460710, ups=1.06, wpb=434184, bsz=16512.2, num_updates=45700, lr=0.00029585, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=45036 epoch 028: 170 / 1689 loss=3.947, nll_loss=2.303, ppl=4.93, wps=460710, ups=1.06, wpb=434184, bsz=16512.2, num_updates=45700, lr=0.00029585, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=45036 epoch 028: 170 / 1689 loss=3.947, nll_loss=2.303, ppl=4.93, wps=460710, ups=1.06, wpb=434184, bsz=16512.2, num_updates=45700, lr=0.00029585, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=45036 epoch 028: 170 / 1689 loss=3.947, nll_loss=2.303, ppl=4.93, wps=460710, ups=1.06, wpb=434184, bsz=16512.2, num_updates=45700, lr=0.00029585, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=45036 epoch 028: 170 / 1689 loss=3.947, nll_loss=2.303, ppl=4.93, wps=460710, ups=1.06, wpb=434184, bsz=16512.2, num_updates=45700, lr=0.00029585, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=45036 epoch 028: 170 / 1689 loss=3.947, nll_loss=2.303, ppl=4.93, wps=460710, ups=1.06, wpb=434184, bsz=16512.2, num_updates=45700, lr=0.00029585, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=45036 epoch 028: 170 / 1689 loss=3.947, nll_loss=2.303, ppl=4.93, wps=460710, ups=1.06, wpb=434184, bsz=16512.2, num_updates=45700, lr=0.00029585, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=45036 epoch 028: 170 / 1689 loss=3.947, nll_loss=2.303, ppl=4.93, wps=460710, ups=1.06, wpb=434184, bsz=16512.2, num_updates=45700, lr=0.00029585, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=45036 epoch 028: 170 / 1689 loss=3.947, nll_loss=2.303, ppl=4.93, wps=460710, ups=1.06, wpb=434184, bsz=16512.2, num_updates=45700, lr=0.00029585, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=45036 epoch 028: 170 / 1689 loss=3.947, nll_loss=2.303, ppl=4.93, wps=460710, ups=1.06, wpb=434184, bsz=16512.2, num_updates=45700, lr=0.00029585, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=45036 epoch 028: 170 / 1689 loss=3.947, nll_loss=2.303, ppl=4.93, wps=460710, ups=1.06, wpb=434184, bsz=16512.2, num_updates=45700, lr=0.00029585, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=45036 epoch 028: 170 / 1689 loss=3.947, nll_loss=2.303, ppl=4.93, wps=460710, ups=1.06, wpb=434184, bsz=16512.2, num_updates=45700, lr=0.00029585, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=45036 epoch 028: 170 / 1689 loss=3.947, nll_loss=2.303, ppl=4.93, wps=460710, ups=1.06, wpb=434184, bsz=16512.2, num_updates=45700, lr=0.00029585, gnorm=0.215, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=45036 epoch 028: 270 / 1689 loss=3.957, nll_loss=2.314, ppl=4.97, wps=459788, ups=1.06, wpb=433170, bsz=16473.1, num_updates=45800, lr=0.000295527, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=45130 epoch 028: 270 / 1689 loss=3.957, nll_loss=2.314, ppl=4.97, wps=459788, ups=1.06, wpb=433170, bsz=16473.1, num_updates=45800, lr=0.000295527, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=45130 epoch 028: 270 / 1689 loss=3.957, nll_loss=2.314, ppl=4.97, wps=459788, ups=1.06, wpb=433170, bsz=16473.1, num_updates=45800, lr=0.000295527, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=45130 epoch 028: 270 / 1689 loss=3.957, nll_loss=2.314, ppl=4.97, wps=459788, ups=1.06, wpb=433170, bsz=16473.1, num_updates=45800, lr=0.000295527, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=45130 epoch 028: 270 / 1689 loss=3.957, nll_loss=2.314, ppl=4.97, wps=459788, ups=1.06, wpb=433170, bsz=16473.1, num_updates=45800, lr=0.000295527, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=45130 epoch 028: 270 / 1689 loss=3.957, nll_loss=2.314, ppl=4.97, wps=459788, ups=1.06, wpb=433170, bsz=16473.1, num_updates=45800, lr=0.000295527, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=45130 epoch 028: 270 / 1689 loss=3.957, nll_loss=2.314, ppl=4.97, wps=459788, ups=1.06, wpb=433170, bsz=16473.1, num_updates=45800, lr=0.000295527, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=45130 epoch 028: 270 / 1689 loss=3.957, nll_loss=2.314, ppl=4.97, wps=459788, ups=1.06, wpb=433170, bsz=16473.1, num_updates=45800, lr=0.000295527, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=45130 epoch 028: 270 / 1689 loss=3.957, nll_loss=2.314, ppl=4.97, wps=459788, ups=1.06, wpb=433170, bsz=16473.1, num_updates=45800, lr=0.000295527, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=45130 epoch 028: 270 / 1689 loss=3.957, nll_loss=2.314, ppl=4.97, wps=459788, ups=1.06, wpb=433170, bsz=16473.1, num_updates=45800, lr=0.000295527, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=45130 epoch 028: 270 / 1689 loss=3.957, nll_loss=2.314, ppl=4.97, wps=459788, ups=1.06, wpb=433170, bsz=16473.1, num_updates=45800, lr=0.000295527, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=45130 epoch 028: 270 / 1689 loss=3.957, nll_loss=2.314, ppl=4.97, wps=459788, ups=1.06, wpb=433170, bsz=16473.1, num_updates=45800, lr=0.000295527, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=45130 epoch 028: 270 / 1689 loss=3.957, nll_loss=2.314, ppl=4.97, wps=459788, ups=1.06, wpb=433170, bsz=16473.1, num_updates=45800, lr=0.000295527, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=45130 epoch 028: 270 / 1689 loss=3.957, nll_loss=2.314, ppl=4.97, wps=459788, ups=1.06, wpb=433170, bsz=16473.1, num_updates=45800, lr=0.000295527, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=45130 epoch 028: 270 / 1689 loss=3.957, nll_loss=2.314, ppl=4.97, wps=459788, ups=1.06, wpb=433170, bsz=16473.1, num_updates=45800, lr=0.000295527, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=45130 epoch 028: 270 / 1689 loss=3.957, nll_loss=2.314, ppl=4.97, wps=459788, ups=1.06, wpb=433170, bsz=16473.1, num_updates=45800, lr=0.000295527, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=45130 epoch 028: 270 / 1689 loss=3.957, nll_loss=2.314, ppl=4.97, wps=459788, ups=1.06, wpb=433170, bsz=16473.1, num_updates=45800, lr=0.000295527, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=45130 epoch 028: 270 / 1689 loss=3.957, nll_loss=2.314, ppl=4.97, wps=459788, ups=1.06, wpb=433170, bsz=16473.1, num_updates=45800, lr=0.000295527, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=45130 epoch 028: 270 / 1689 loss=3.957, nll_loss=2.314, ppl=4.97, wps=459788, ups=1.06, wpb=433170, bsz=16473.1, num_updates=45800, lr=0.000295527, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=45130 epoch 028: 270 / 1689 loss=3.957, nll_loss=2.314, ppl=4.97, wps=459788, ups=1.06, wpb=433170, bsz=16473.1, num_updates=45800, lr=0.000295527, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=45130 epoch 028: 270 / 1689 loss=3.957, nll_loss=2.314, ppl=4.97, wps=459788, ups=1.06, wpb=433170, bsz=16473.1, num_updates=45800, lr=0.000295527, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=45130 epoch 028: 270 / 1689 loss=3.957, nll_loss=2.314, ppl=4.97, wps=459788, ups=1.06, wpb=433170, bsz=16473.1, num_updates=45800, lr=0.000295527, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=45130 epoch 028: 270 / 1689 loss=3.957, nll_loss=2.314, ppl=4.97, wps=459788, ups=1.06, wpb=433170, bsz=16473.1, num_updates=45800, lr=0.000295527, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=45130 epoch 028: 270 / 1689 loss=3.957, nll_loss=2.314, ppl=4.97, wps=459788, ups=1.06, wpb=433170, bsz=16473.1, num_updates=45800, lr=0.000295527, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=45130 epoch 028: 270 / 1689 loss=3.957, nll_loss=2.314, ppl=4.97, wps=459788, ups=1.06, wpb=433170, bsz=16473.1, num_updates=45800, lr=0.000295527, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=45130 epoch 028: 270 / 1689 loss=3.957, nll_loss=2.314, ppl=4.97, wps=459788, ups=1.06, wpb=433170, bsz=16473.1, num_updates=45800, lr=0.000295527, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=45130 epoch 028: 270 / 1689 loss=3.957, nll_loss=2.314, ppl=4.97, wps=459788, ups=1.06, wpb=433170, bsz=16473.1, num_updates=45800, lr=0.000295527, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=45130 epoch 028: 270 / 1689 loss=3.957, nll_loss=2.314, ppl=4.97, wps=459788, ups=1.06, wpb=433170, bsz=16473.1, num_updates=45800, lr=0.000295527, gnorm=0.212, clip=0, loss_scale=1, train_wall=93, gb_free=19.7, wall=45130 epoch 028: 370 / 1689 loss=3.971, nll_loss=2.33, ppl=5.03, wps=461842, ups=1.06, wpb=433852, bsz=16564.8, num_updates=45900, lr=0.000295205, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=45224 epoch 028: 370 / 1689 loss=3.971, nll_loss=2.33, ppl=5.03, wps=461842, ups=1.06, wpb=433852, bsz=16564.8, num_updates=45900, lr=0.000295205, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=45224 epoch 028: 370 / 1689 loss=3.971, nll_loss=2.33, ppl=5.03, wps=461842, ups=1.06, wpb=433852, bsz=16564.8, num_updates=45900, lr=0.000295205, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=45224 epoch 028: 370 / 1689 loss=3.971, nll_loss=2.33, ppl=5.03, wps=461842, ups=1.06, wpb=433852, bsz=16564.8, num_updates=45900, lr=0.000295205, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=45224 epoch 028: 370 / 1689 loss=3.971, nll_loss=2.33, ppl=5.03, wps=461842, ups=1.06, wpb=433852, bsz=16564.8, num_updates=45900, lr=0.000295205, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=45224 epoch 028: 370 / 1689 loss=3.971, nll_loss=2.33, ppl=5.03, wps=461842, ups=1.06, wpb=433852, bsz=16564.8, num_updates=45900, lr=0.000295205, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=45224 epoch 028: 370 / 1689 loss=3.971, nll_loss=2.33, ppl=5.03, wps=461842, ups=1.06, wpb=433852, bsz=16564.8, num_updates=45900, lr=0.000295205, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=45224 epoch 028: 370 / 1689 loss=3.971, nll_loss=2.33, ppl=5.03, wps=461842, ups=1.06, wpb=433852, bsz=16564.8, num_updates=45900, lr=0.000295205, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=45224 epoch 028: 370 / 1689 loss=3.971, nll_loss=2.33, ppl=5.03, wps=461842, ups=1.06, wpb=433852, bsz=16564.8, num_updates=45900, lr=0.000295205, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=45224 epoch 028: 370 / 1689 loss=3.971, nll_loss=2.33, ppl=5.03, wps=461842, ups=1.06, wpb=433852, bsz=16564.8, num_updates=45900, lr=0.000295205, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=45224 epoch 028: 370 / 1689 loss=3.971, nll_loss=2.33, ppl=5.03, wps=461842, ups=1.06, wpb=433852, bsz=16564.8, num_updates=45900, lr=0.000295205, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=45224 epoch 028: 370 / 1689 loss=3.971, nll_loss=2.33, ppl=5.03, wps=461842, ups=1.06, wpb=433852, bsz=16564.8, num_updates=45900, lr=0.000295205, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=45224 epoch 028: 370 / 1689 loss=3.971, nll_loss=2.33, ppl=5.03, wps=461842, ups=1.06, wpb=433852, bsz=16564.8, num_updates=45900, lr=0.000295205, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=45224 epoch 028: 370 / 1689 loss=3.971, nll_loss=2.33, ppl=5.03, wps=461842, ups=1.06, wpb=433852, bsz=16564.8, num_updates=45900, lr=0.000295205, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=45224 epoch 028: 370 / 1689 loss=3.971, nll_loss=2.33, ppl=5.03, wps=461842, ups=1.06, wpb=433852, bsz=16564.8, num_updates=45900, lr=0.000295205, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=45224 epoch 028: 370 / 1689 loss=3.971, nll_loss=2.33, ppl=5.03, wps=461842, ups=1.06, wpb=433852, bsz=16564.8, num_updates=45900, lr=0.000295205, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=45224 epoch 028: 370 / 1689 loss=3.971, nll_loss=2.33, ppl=5.03, wps=461842, ups=1.06, wpb=433852, bsz=16564.8, num_updates=45900, lr=0.000295205, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=45224 epoch 028: 370 / 1689 loss=3.971, nll_loss=2.33, ppl=5.03, wps=461842, ups=1.06, wpb=433852, bsz=16564.8, num_updates=45900, lr=0.000295205, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=45224 epoch 028: 370 / 1689 loss=3.971, nll_loss=2.33, ppl=5.03, wps=461842, ups=1.06, wpb=433852, bsz=16564.8, num_updates=45900, lr=0.000295205, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=45224 epoch 028: 370 / 1689 loss=3.971, nll_loss=2.33, ppl=5.03, wps=461842, ups=1.06, wpb=433852, bsz=16564.8, num_updates=45900, lr=0.000295205, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=45224 epoch 028: 370 / 1689 loss=3.971, nll_loss=2.33, ppl=5.03, wps=461842, ups=1.06, wpb=433852, bsz=16564.8, num_updates=45900, lr=0.000295205, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=45224 epoch 028: 370 / 1689 loss=3.971, nll_loss=2.33, ppl=5.03, wps=461842, ups=1.06, wpb=433852, bsz=16564.8, num_updates=45900, lr=0.000295205, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=45224 epoch 028: 370 / 1689 loss=3.971, nll_loss=2.33, ppl=5.03, wps=461842, ups=1.06, wpb=433852, bsz=16564.8, num_updates=45900, lr=0.000295205, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=45224 epoch 028: 370 / 1689 loss=3.971, nll_loss=2.33, ppl=5.03, wps=461842, ups=1.06, wpb=433852, bsz=16564.8, num_updates=45900, lr=0.000295205, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=45224 epoch 028: 370 / 1689 loss=3.971, nll_loss=2.33, ppl=5.03, wps=461842, ups=1.06, wpb=433852, bsz=16564.8, num_updates=45900, lr=0.000295205, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=45224 epoch 028: 370 / 1689 loss=3.971, nll_loss=2.33, ppl=5.03, wps=461842, ups=1.06, wpb=433852, bsz=16564.8, num_updates=45900, lr=0.000295205, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=45224 epoch 028: 370 / 1689 loss=3.971, nll_loss=2.33, ppl=5.03, wps=461842, ups=1.06, wpb=433852, bsz=16564.8, num_updates=45900, lr=0.000295205, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=45224 epoch 028: 370 / 1689 loss=3.971, nll_loss=2.33, ppl=5.03, wps=461842, ups=1.06, wpb=433852, bsz=16564.8, num_updates=45900, lr=0.000295205, gnorm=0.22, clip=0, loss_scale=1, train_wall=92, gb_free=18.8, wall=45224 epoch 028: 470 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=460337, ups=1.06, wpb=433412, bsz=16642.5, num_updates=46000, lr=0.000294884, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=45318 epoch 028: 470 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=460337, ups=1.06, wpb=433412, bsz=16642.5, num_updates=46000, lr=0.000294884, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=45318 epoch 028: 470 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=460337, ups=1.06, wpb=433412, bsz=16642.5, num_updates=46000, lr=0.000294884, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=45318 epoch 028: 470 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=460337, ups=1.06, wpb=433412, bsz=16642.5, num_updates=46000, lr=0.000294884, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=45318 epoch 028: 470 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=460337, ups=1.06, wpb=433412, bsz=16642.5, num_updates=46000, lr=0.000294884, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=45318 epoch 028: 470 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=460337, ups=1.06, wpb=433412, bsz=16642.5, num_updates=46000, lr=0.000294884, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=45318 epoch 028: 470 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=460337, ups=1.06, wpb=433412, bsz=16642.5, num_updates=46000, lr=0.000294884, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=45318 epoch 028: 470 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=460337, ups=1.06, wpb=433412, bsz=16642.5, num_updates=46000, lr=0.000294884, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=45318 epoch 028: 470 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=460337, ups=1.06, wpb=433412, bsz=16642.5, num_updates=46000, lr=0.000294884, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=45318 epoch 028: 470 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=460337, ups=1.06, wpb=433412, bsz=16642.5, num_updates=46000, lr=0.000294884, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=45318 epoch 028: 470 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=460337, ups=1.06, wpb=433412, bsz=16642.5, num_updates=46000, lr=0.000294884, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=45318 epoch 028: 470 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=460337, ups=1.06, wpb=433412, bsz=16642.5, num_updates=46000, lr=0.000294884, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=45318 epoch 028: 470 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=460337, ups=1.06, wpb=433412, bsz=16642.5, num_updates=46000, lr=0.000294884, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=45318 epoch 028: 470 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=460337, ups=1.06, wpb=433412, bsz=16642.5, num_updates=46000, lr=0.000294884, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=45318 epoch 028: 470 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=460337, ups=1.06, wpb=433412, bsz=16642.5, num_updates=46000, lr=0.000294884, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=45318 epoch 028: 470 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=460337, ups=1.06, wpb=433412, bsz=16642.5, num_updates=46000, lr=0.000294884, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=45318 epoch 028: 470 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=460337, ups=1.06, wpb=433412, bsz=16642.5, num_updates=46000, lr=0.000294884, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=45318 epoch 028: 470 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=460337, ups=1.06, wpb=433412, bsz=16642.5, num_updates=46000, lr=0.000294884, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=45318 epoch 028: 470 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=460337, ups=1.06, wpb=433412, bsz=16642.5, num_updates=46000, lr=0.000294884, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=45318 epoch 028: 470 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=460337, ups=1.06, wpb=433412, bsz=16642.5, num_updates=46000, lr=0.000294884, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=45318 epoch 028: 470 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=460337, ups=1.06, wpb=433412, bsz=16642.5, num_updates=46000, lr=0.000294884, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=45318 epoch 028: 470 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=460337, ups=1.06, wpb=433412, bsz=16642.5, num_updates=46000, lr=0.000294884, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=45318 epoch 028: 470 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=460337, ups=1.06, wpb=433412, bsz=16642.5, num_updates=46000, lr=0.000294884, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=45318 epoch 028: 470 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=460337, ups=1.06, wpb=433412, bsz=16642.5, num_updates=46000, lr=0.000294884, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=45318 epoch 028: 470 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=460337, ups=1.06, wpb=433412, bsz=16642.5, num_updates=46000, lr=0.000294884, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=45318 epoch 028: 470 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=460337, ups=1.06, wpb=433412, bsz=16642.5, num_updates=46000, lr=0.000294884, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=45318 epoch 028: 470 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=460337, ups=1.06, wpb=433412, bsz=16642.5, num_updates=46000, lr=0.000294884, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=45318 epoch 028: 470 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=460337, ups=1.06, wpb=433412, bsz=16642.5, num_updates=46000, lr=0.000294884, gnorm=0.225, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=45318 begin validation on "valid" subset epoch 028 | valid on 'valid' subset | loss 4.251 | nll_loss 2.631 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.245 epoch 028 | valid on 'valid' subset | loss 4.251 | nll_loss 2.631 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.245 epoch 028 | valid on 'valid' subset | loss 4.251 | nll_loss 2.631 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.245 epoch 028 | valid on 'valid' subset | loss 4.251 | nll_loss 2.631 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.245 epoch 028 | valid on 'valid' subset | loss 4.251 | nll_loss 2.631 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.245 epoch 028 | valid on 'valid' subset | loss 4.251 | nll_loss 2.631 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.245 epoch 028 | valid on 'valid' subset | loss 4.251 | nll_loss 2.631 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.245 epoch 028 | valid on 'valid' subset | loss 4.251 | nll_loss 2.631 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.245 epoch 028 | valid on 'valid' subset | loss 4.251 | nll_loss 2.631 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.245 epoch 028 | valid on 'valid' subset | loss 4.251 | nll_loss 2.631 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.245 epoch 028 | valid on 'valid' subset | loss 4.251 | nll_loss 2.631 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.245 epoch 028 | valid on 'valid' subset | loss 4.251 | nll_loss 2.631 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.245 epoch 028 | valid on 'valid' subset | loss 4.251 | nll_loss 2.631 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.245 epoch 028 | valid on 'valid' subset | loss 4.251 | nll_loss 2.631 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.245 epoch 028 | valid on 'valid' subset | loss 4.251 | nll_loss 2.631 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.245 epoch 028 | valid on 'valid' subset | loss 4.251 | nll_loss 2.631 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.245 epoch 028 | valid on 'valid' subset | loss 4.251 | nll_loss 2.631 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.245 epoch 028 | valid on 'valid' subset | loss 4.251 | nll_loss 2.631 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.245 epoch 028 | valid on 'valid' subset | loss 4.251 | nll_loss 2.631 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.245 epoch 028 | valid on 'valid' subset | loss 4.251 | nll_loss 2.631 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.245 epoch 028 | valid on 'valid' subset | loss 4.251 | nll_loss 2.631 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.245 epoch 028 | valid on 'valid' subset | loss 4.251 | nll_loss 2.631 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.245 epoch 028 | valid on 'valid' subset | loss 4.251 | nll_loss 2.631 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.245 epoch 028 | valid on 'valid' subset | loss 4.251 | nll_loss 2.631 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.245 epoch 028 | valid on 'valid' subset | loss 4.251 | nll_loss 2.631 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.245 epoch 028 | valid on 'valid' subset | loss 4.251 | nll_loss 2.631 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.245 epoch 028 | valid on 'valid' subset | loss 4.251 | nll_loss 2.631 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.245 epoch 028 | valid on 'valid' subset | loss 4.251 | nll_loss 2.631 | ppl 6.19 | wps 0 | wpb 42662 | bsz 2032 | num_updates 46000 | best_loss 4.245 epoch 028: 571 / 1689 loss=3.969, nll_loss=2.328, ppl=5.02, wps=402568, ups=0.93, wpb=432694, bsz=16939, num_updates=46100, lr=0.000294564, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=95, gb_free=20, wall=45426 epoch 028: 571 / 1689 loss=3.969, nll_loss=2.328, ppl=5.02, wps=402568, ups=0.93, wpb=432694, bsz=16939, num_updates=46100, lr=0.000294564, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=95, gb_free=20, wall=45426 epoch 028: 571 / 1689 loss=3.969, nll_loss=2.328, ppl=5.02, wps=402568, ups=0.93, wpb=432694, bsz=16939, num_updates=46100, lr=0.000294564, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=95, gb_free=20, wall=45426 epoch 028: 571 / 1689 loss=3.969, nll_loss=2.328, ppl=5.02, wps=402568, ups=0.93, wpb=432694, bsz=16939, num_updates=46100, lr=0.000294564, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=95, gb_free=20, wall=45426 epoch 028: 571 / 1689 loss=3.969, nll_loss=2.328, ppl=5.02, wps=402568, ups=0.93, wpb=432694, bsz=16939, num_updates=46100, lr=0.000294564, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=95, gb_free=20, wall=45426 epoch 028: 571 / 1689 loss=3.969, nll_loss=2.328, ppl=5.02, wps=402568, ups=0.93, wpb=432694, bsz=16939, num_updates=46100, lr=0.000294564, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=95, gb_free=20, wall=45426 epoch 028: 571 / 1689 loss=3.969, nll_loss=2.328, ppl=5.02, wps=402568, ups=0.93, wpb=432694, bsz=16939, num_updates=46100, lr=0.000294564, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=95, gb_free=20, wall=45426 epoch 028: 571 / 1689 loss=3.969, nll_loss=2.328, ppl=5.02, wps=402568, ups=0.93, wpb=432694, bsz=16939, num_updates=46100, lr=0.000294564, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=95, gb_free=20, wall=45426 epoch 028: 571 / 1689 loss=3.969, nll_loss=2.328, ppl=5.02, wps=402568, ups=0.93, wpb=432694, bsz=16939, num_updates=46100, lr=0.000294564, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=95, gb_free=20, wall=45426 epoch 028: 571 / 1689 loss=3.969, nll_loss=2.328, ppl=5.02, wps=402568, ups=0.93, wpb=432694, bsz=16939, num_updates=46100, lr=0.000294564, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=95, gb_free=20, wall=45426 epoch 028: 571 / 1689 loss=3.969, nll_loss=2.328, ppl=5.02, wps=402568, ups=0.93, wpb=432694, bsz=16939, num_updates=46100, lr=0.000294564, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=95, gb_free=20, wall=45426 epoch 028: 571 / 1689 loss=3.969, nll_loss=2.328, ppl=5.02, wps=402568, ups=0.93, wpb=432694, bsz=16939, num_updates=46100, lr=0.000294564, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=95, gb_free=20, wall=45426 epoch 028: 571 / 1689 loss=3.969, nll_loss=2.328, ppl=5.02, wps=402568, ups=0.93, wpb=432694, bsz=16939, num_updates=46100, lr=0.000294564, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=95, gb_free=20, wall=45426 epoch 028: 571 / 1689 loss=3.969, nll_loss=2.328, ppl=5.02, wps=402568, ups=0.93, wpb=432694, bsz=16939, num_updates=46100, lr=0.000294564, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=95, gb_free=20, wall=45426 epoch 028: 571 / 1689 loss=3.969, nll_loss=2.328, ppl=5.02, wps=402568, ups=0.93, wpb=432694, bsz=16939, num_updates=46100, lr=0.000294564, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=95, gb_free=20, wall=45426 epoch 028: 571 / 1689 loss=3.969, nll_loss=2.328, ppl=5.02, wps=402568, ups=0.93, wpb=432694, bsz=16939, num_updates=46100, lr=0.000294564, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=95, gb_free=20, wall=45426 epoch 028: 571 / 1689 loss=3.969, nll_loss=2.328, ppl=5.02, wps=402568, ups=0.93, wpb=432694, bsz=16939, num_updates=46100, lr=0.000294564, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=95, gb_free=20, wall=45426 epoch 028: 571 / 1689 loss=3.969, nll_loss=2.328, ppl=5.02, wps=402568, ups=0.93, wpb=432694, bsz=16939, num_updates=46100, lr=0.000294564, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=95, gb_free=20, wall=45426 epoch 028: 571 / 1689 loss=3.969, nll_loss=2.328, ppl=5.02, wps=402568, ups=0.93, wpb=432694, bsz=16939, num_updates=46100, lr=0.000294564, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=95, gb_free=20, wall=45426 epoch 028: 571 / 1689 loss=3.969, nll_loss=2.328, ppl=5.02, wps=402568, ups=0.93, wpb=432694, bsz=16939, num_updates=46100, lr=0.000294564, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=95, gb_free=20, wall=45426 epoch 028: 571 / 1689 loss=3.969, nll_loss=2.328, ppl=5.02, wps=402568, ups=0.93, wpb=432694, bsz=16939, num_updates=46100, lr=0.000294564, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=95, gb_free=20, wall=45426 epoch 028: 571 / 1689 loss=3.969, nll_loss=2.328, ppl=5.02, wps=402568, ups=0.93, wpb=432694, bsz=16939, num_updates=46100, lr=0.000294564, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=95, gb_free=20, wall=45426 epoch 028: 571 / 1689 loss=3.969, nll_loss=2.328, ppl=5.02, wps=402568, ups=0.93, wpb=432694, bsz=16939, num_updates=46100, lr=0.000294564, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=95, gb_free=20, wall=45426 epoch 028: 571 / 1689 loss=3.969, nll_loss=2.328, ppl=5.02, wps=402568, ups=0.93, wpb=432694, bsz=16939, num_updates=46100, lr=0.000294564, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=95, gb_free=20, wall=45426 epoch 028: 571 / 1689 loss=3.969, nll_loss=2.328, ppl=5.02, wps=402568, ups=0.93, wpb=432694, bsz=16939, num_updates=46100, lr=0.000294564, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=95, gb_free=20, wall=45426 epoch 028: 571 / 1689 loss=3.969, nll_loss=2.328, ppl=5.02, wps=402568, ups=0.93, wpb=432694, bsz=16939, num_updates=46100, lr=0.000294564, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=95, gb_free=20, wall=45426 epoch 028: 571 / 1689 loss=3.969, nll_loss=2.328, ppl=5.02, wps=402568, ups=0.93, wpb=432694, bsz=16939, num_updates=46100, lr=0.000294564, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=95, gb_free=20, wall=45426 epoch 028: 571 / 1689 loss=3.969, nll_loss=2.328, ppl=5.02, wps=402568, ups=0.93, wpb=432694, bsz=16939, num_updates=46100, lr=0.000294564, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=95, gb_free=20, wall=45426 epoch 028: 671 / 1689 loss=3.967, nll_loss=2.326, ppl=5.01, wps=464169, ups=1.07, wpb=435411, bsz=16503.5, num_updates=46200, lr=0.000294245, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=45519 epoch 028: 671 / 1689 loss=3.967, nll_loss=2.326, ppl=5.01, wps=464169, ups=1.07, wpb=435411, bsz=16503.5, num_updates=46200, lr=0.000294245, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=45519 epoch 028: 671 / 1689 loss=3.967, nll_loss=2.326, ppl=5.01, wps=464169, ups=1.07, wpb=435411, bsz=16503.5, num_updates=46200, lr=0.000294245, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=45519 epoch 028: 671 / 1689 loss=3.967, nll_loss=2.326, ppl=5.01, wps=464169, ups=1.07, wpb=435411, bsz=16503.5, num_updates=46200, lr=0.000294245, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=45519 epoch 028: 671 / 1689 loss=3.967, nll_loss=2.326, ppl=5.01, wps=464169, ups=1.07, wpb=435411, bsz=16503.5, num_updates=46200, lr=0.000294245, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=45519 epoch 028: 671 / 1689 loss=3.967, nll_loss=2.326, ppl=5.01, wps=464169, ups=1.07, wpb=435411, bsz=16503.5, num_updates=46200, lr=0.000294245, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=45519 epoch 028: 671 / 1689 loss=3.967, nll_loss=2.326, ppl=5.01, wps=464169, ups=1.07, wpb=435411, bsz=16503.5, num_updates=46200, lr=0.000294245, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=45519 epoch 028: 671 / 1689 loss=3.967, nll_loss=2.326, ppl=5.01, wps=464169, ups=1.07, wpb=435411, bsz=16503.5, num_updates=46200, lr=0.000294245, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=45519 epoch 028: 671 / 1689 loss=3.967, nll_loss=2.326, ppl=5.01, wps=464169, ups=1.07, wpb=435411, bsz=16503.5, num_updates=46200, lr=0.000294245, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=45519 epoch 028: 671 / 1689 loss=3.967, nll_loss=2.326, ppl=5.01, wps=464169, ups=1.07, wpb=435411, bsz=16503.5, num_updates=46200, lr=0.000294245, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=45519 epoch 028: 671 / 1689 loss=3.967, nll_loss=2.326, ppl=5.01, wps=464169, ups=1.07, wpb=435411, bsz=16503.5, num_updates=46200, lr=0.000294245, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=45519 epoch 028: 671 / 1689 loss=3.967, nll_loss=2.326, ppl=5.01, wps=464169, ups=1.07, wpb=435411, bsz=16503.5, num_updates=46200, lr=0.000294245, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=45519 epoch 028: 671 / 1689 loss=3.967, nll_loss=2.326, ppl=5.01, wps=464169, ups=1.07, wpb=435411, bsz=16503.5, num_updates=46200, lr=0.000294245, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=45519 epoch 028: 671 / 1689 loss=3.967, nll_loss=2.326, ppl=5.01, wps=464169, ups=1.07, wpb=435411, bsz=16503.5, num_updates=46200, lr=0.000294245, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=45519 epoch 028: 671 / 1689 loss=3.967, nll_loss=2.326, ppl=5.01, wps=464169, ups=1.07, wpb=435411, bsz=16503.5, num_updates=46200, lr=0.000294245, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=45519 epoch 028: 671 / 1689 loss=3.967, nll_loss=2.326, ppl=5.01, wps=464169, ups=1.07, wpb=435411, bsz=16503.5, num_updates=46200, lr=0.000294245, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=45519 epoch 028: 671 / 1689 loss=3.967, nll_loss=2.326, ppl=5.01, wps=464169, ups=1.07, wpb=435411, bsz=16503.5, num_updates=46200, lr=0.000294245, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=45519 epoch 028: 671 / 1689 loss=3.967, nll_loss=2.326, ppl=5.01, wps=464169, ups=1.07, wpb=435411, bsz=16503.5, num_updates=46200, lr=0.000294245, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=45519 epoch 028: 671 / 1689 loss=3.967, nll_loss=2.326, ppl=5.01, wps=464169, ups=1.07, wpb=435411, bsz=16503.5, num_updates=46200, lr=0.000294245, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=45519 epoch 028: 671 / 1689 loss=3.967, nll_loss=2.326, ppl=5.01, wps=464169, ups=1.07, wpb=435411, bsz=16503.5, num_updates=46200, lr=0.000294245, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=45519 epoch 028: 671 / 1689 loss=3.967, nll_loss=2.326, ppl=5.01, wps=464169, ups=1.07, wpb=435411, bsz=16503.5, num_updates=46200, lr=0.000294245, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=45519 epoch 028: 671 / 1689 loss=3.967, nll_loss=2.326, ppl=5.01, wps=464169, ups=1.07, wpb=435411, bsz=16503.5, num_updates=46200, lr=0.000294245, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=45519 epoch 028: 671 / 1689 loss=3.967, nll_loss=2.326, ppl=5.01, wps=464169, ups=1.07, wpb=435411, bsz=16503.5, num_updates=46200, lr=0.000294245, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=45519 epoch 028: 671 / 1689 loss=3.967, nll_loss=2.326, ppl=5.01, wps=464169, ups=1.07, wpb=435411, bsz=16503.5, num_updates=46200, lr=0.000294245, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=45519 epoch 028: 671 / 1689 loss=3.967, nll_loss=2.326, ppl=5.01, wps=464169, ups=1.07, wpb=435411, bsz=16503.5, num_updates=46200, lr=0.000294245, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=45519 epoch 028: 671 / 1689 loss=3.967, nll_loss=2.326, ppl=5.01, wps=464169, ups=1.07, wpb=435411, bsz=16503.5, num_updates=46200, lr=0.000294245, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=45519 epoch 028: 671 / 1689 loss=3.967, nll_loss=2.326, ppl=5.01, wps=464169, ups=1.07, wpb=435411, bsz=16503.5, num_updates=46200, lr=0.000294245, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=45519 epoch 028: 671 / 1689 loss=3.967, nll_loss=2.326, ppl=5.01, wps=464169, ups=1.07, wpb=435411, bsz=16503.5, num_updates=46200, lr=0.000294245, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.6, wall=45519 epoch 028: 771 / 1689 loss=3.975, nll_loss=2.335, ppl=5.04, wps=461338, ups=1.07, wpb=432702, bsz=16453.2, num_updates=46300, lr=0.000293927, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=45613 epoch 028: 771 / 1689 loss=3.975, nll_loss=2.335, ppl=5.04, wps=461338, ups=1.07, wpb=432702, bsz=16453.2, num_updates=46300, lr=0.000293927, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=45613 epoch 028: 771 / 1689 loss=3.975, nll_loss=2.335, ppl=5.04, wps=461338, ups=1.07, wpb=432702, bsz=16453.2, num_updates=46300, lr=0.000293927, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=45613 epoch 028: 771 / 1689 loss=3.975, nll_loss=2.335, ppl=5.04, wps=461338, ups=1.07, wpb=432702, bsz=16453.2, num_updates=46300, lr=0.000293927, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=45613 epoch 028: 771 / 1689 loss=3.975, nll_loss=2.335, ppl=5.04, wps=461338, ups=1.07, wpb=432702, bsz=16453.2, num_updates=46300, lr=0.000293927, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=45613 epoch 028: 771 / 1689 loss=3.975, nll_loss=2.335, ppl=5.04, wps=461338, ups=1.07, wpb=432702, bsz=16453.2, num_updates=46300, lr=0.000293927, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=45613 epoch 028: 771 / 1689 loss=3.975, nll_loss=2.335, ppl=5.04, wps=461338, ups=1.07, wpb=432702, bsz=16453.2, num_updates=46300, lr=0.000293927, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=45613 epoch 028: 771 / 1689 loss=3.975, nll_loss=2.335, ppl=5.04, wps=461338, ups=1.07, wpb=432702, bsz=16453.2, num_updates=46300, lr=0.000293927, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=45613 epoch 028: 771 / 1689 loss=3.975, nll_loss=2.335, ppl=5.04, wps=461338, ups=1.07, wpb=432702, bsz=16453.2, num_updates=46300, lr=0.000293927, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=45613 epoch 028: 771 / 1689 loss=3.975, nll_loss=2.335, ppl=5.04, wps=461338, ups=1.07, wpb=432702, bsz=16453.2, num_updates=46300, lr=0.000293927, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=45613 epoch 028: 771 / 1689 loss=3.975, nll_loss=2.335, ppl=5.04, wps=461338, ups=1.07, wpb=432702, bsz=16453.2, num_updates=46300, lr=0.000293927, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=45613 epoch 028: 771 / 1689 loss=3.975, nll_loss=2.335, ppl=5.04, wps=461338, ups=1.07, wpb=432702, bsz=16453.2, num_updates=46300, lr=0.000293927, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=45613 epoch 028: 771 / 1689 loss=3.975, nll_loss=2.335, ppl=5.04, wps=461338, ups=1.07, wpb=432702, bsz=16453.2, num_updates=46300, lr=0.000293927, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=45613 epoch 028: 771 / 1689 loss=3.975, nll_loss=2.335, ppl=5.04, wps=461338, ups=1.07, wpb=432702, bsz=16453.2, num_updates=46300, lr=0.000293927, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=45613 epoch 028: 771 / 1689 loss=3.975, nll_loss=2.335, ppl=5.04, wps=461338, ups=1.07, wpb=432702, bsz=16453.2, num_updates=46300, lr=0.000293927, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=45613 epoch 028: 771 / 1689 loss=3.975, nll_loss=2.335, ppl=5.04, wps=461338, ups=1.07, wpb=432702, bsz=16453.2, num_updates=46300, lr=0.000293927, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=45613 epoch 028: 771 / 1689 loss=3.975, nll_loss=2.335, ppl=5.04, wps=461338, ups=1.07, wpb=432702, bsz=16453.2, num_updates=46300, lr=0.000293927, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=45613 epoch 028: 771 / 1689 loss=3.975, nll_loss=2.335, ppl=5.04, wps=461338, ups=1.07, wpb=432702, bsz=16453.2, num_updates=46300, lr=0.000293927, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=45613 epoch 028: 771 / 1689 loss=3.975, nll_loss=2.335, ppl=5.04, wps=461338, ups=1.07, wpb=432702, bsz=16453.2, num_updates=46300, lr=0.000293927, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=45613 epoch 028: 771 / 1689 loss=3.975, nll_loss=2.335, ppl=5.04, wps=461338, ups=1.07, wpb=432702, bsz=16453.2, num_updates=46300, lr=0.000293927, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=45613 epoch 028: 771 / 1689 loss=3.975, nll_loss=2.335, ppl=5.04, wps=461338, ups=1.07, wpb=432702, bsz=16453.2, num_updates=46300, lr=0.000293927, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=45613 epoch 028: 771 / 1689 loss=3.975, nll_loss=2.335, ppl=5.04, wps=461338, ups=1.07, wpb=432702, bsz=16453.2, num_updates=46300, lr=0.000293927, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=45613 epoch 028: 771 / 1689 loss=3.975, nll_loss=2.335, ppl=5.04, wps=461338, ups=1.07, wpb=432702, bsz=16453.2, num_updates=46300, lr=0.000293927, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=45613 epoch 028: 771 / 1689 loss=3.975, nll_loss=2.335, ppl=5.04, wps=461338, ups=1.07, wpb=432702, bsz=16453.2, num_updates=46300, lr=0.000293927, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=45613 epoch 028: 771 / 1689 loss=3.975, nll_loss=2.335, ppl=5.04, wps=461338, ups=1.07, wpb=432702, bsz=16453.2, num_updates=46300, lr=0.000293927, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=45613 epoch 028: 771 / 1689 loss=3.975, nll_loss=2.335, ppl=5.04, wps=461338, ups=1.07, wpb=432702, bsz=16453.2, num_updates=46300, lr=0.000293927, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=45613 epoch 028: 771 / 1689 loss=3.975, nll_loss=2.335, ppl=5.04, wps=461338, ups=1.07, wpb=432702, bsz=16453.2, num_updates=46300, lr=0.000293927, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=45613 epoch 028: 771 / 1689 loss=3.975, nll_loss=2.335, ppl=5.04, wps=461338, ups=1.07, wpb=432702, bsz=16453.2, num_updates=46300, lr=0.000293927, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=45613 epoch 028: 871 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=456706, ups=1.06, wpb=431277, bsz=16418.8, num_updates=46400, lr=0.00029361, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=45708 epoch 028: 871 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=456706, ups=1.06, wpb=431277, bsz=16418.8, num_updates=46400, lr=0.00029361, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=45708 epoch 028: 871 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=456706, ups=1.06, wpb=431277, bsz=16418.8, num_updates=46400, lr=0.00029361, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=45708 epoch 028: 871 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=456706, ups=1.06, wpb=431277, bsz=16418.8, num_updates=46400, lr=0.00029361, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=45708 epoch 028: 871 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=456706, ups=1.06, wpb=431277, bsz=16418.8, num_updates=46400, lr=0.00029361, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=45708 epoch 028: 871 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=456706, ups=1.06, wpb=431277, bsz=16418.8, num_updates=46400, lr=0.00029361, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=45708 epoch 028: 871 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=456706, ups=1.06, wpb=431277, bsz=16418.8, num_updates=46400, lr=0.00029361, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=45708 epoch 028: 871 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=456706, ups=1.06, wpb=431277, bsz=16418.8, num_updates=46400, lr=0.00029361, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=45708 epoch 028: 871 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=456706, ups=1.06, wpb=431277, bsz=16418.8, num_updates=46400, lr=0.00029361, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=45708 epoch 028: 871 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=456706, ups=1.06, wpb=431277, bsz=16418.8, num_updates=46400, lr=0.00029361, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=45708 epoch 028: 871 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=456706, ups=1.06, wpb=431277, bsz=16418.8, num_updates=46400, lr=0.00029361, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=45708 epoch 028: 871 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=456706, ups=1.06, wpb=431277, bsz=16418.8, num_updates=46400, lr=0.00029361, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=45708 epoch 028: 871 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=456706, ups=1.06, wpb=431277, bsz=16418.8, num_updates=46400, lr=0.00029361, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=45708 epoch 028: 871 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=456706, ups=1.06, wpb=431277, bsz=16418.8, num_updates=46400, lr=0.00029361, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=45708 epoch 028: 871 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=456706, ups=1.06, wpb=431277, bsz=16418.8, num_updates=46400, lr=0.00029361, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=45708 epoch 028: 871 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=456706, ups=1.06, wpb=431277, bsz=16418.8, num_updates=46400, lr=0.00029361, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=45708 epoch 028: 871 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=456706, ups=1.06, wpb=431277, bsz=16418.8, num_updates=46400, lr=0.00029361, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=45708 epoch 028: 871 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=456706, ups=1.06, wpb=431277, bsz=16418.8, num_updates=46400, lr=0.00029361, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=45708 epoch 028: 871 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=456706, ups=1.06, wpb=431277, bsz=16418.8, num_updates=46400, lr=0.00029361, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=45708 epoch 028: 871 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=456706, ups=1.06, wpb=431277, bsz=16418.8, num_updates=46400, lr=0.00029361, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=45708 epoch 028: 871 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=456706, ups=1.06, wpb=431277, bsz=16418.8, num_updates=46400, lr=0.00029361, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=45708 epoch 028: 871 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=456706, ups=1.06, wpb=431277, bsz=16418.8, num_updates=46400, lr=0.00029361, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=45708 epoch 028: 871 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=456706, ups=1.06, wpb=431277, bsz=16418.8, num_updates=46400, lr=0.00029361, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=45708 epoch 028: 871 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=456706, ups=1.06, wpb=431277, bsz=16418.8, num_updates=46400, lr=0.00029361, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=45708 epoch 028: 871 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=456706, ups=1.06, wpb=431277, bsz=16418.8, num_updates=46400, lr=0.00029361, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=45708 epoch 028: 871 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=456706, ups=1.06, wpb=431277, bsz=16418.8, num_updates=46400, lr=0.00029361, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=45708 epoch 028: 871 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=456706, ups=1.06, wpb=431277, bsz=16418.8, num_updates=46400, lr=0.00029361, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=45708 epoch 028: 871 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=456706, ups=1.06, wpb=431277, bsz=16418.8, num_updates=46400, lr=0.00029361, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=45708 epoch 028: 971 / 1689 loss=3.982, nll_loss=2.343, ppl=5.07, wps=464399, ups=1.07, wpb=434900, bsz=16471.8, num_updates=46500, lr=0.000293294, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=45801 epoch 028: 971 / 1689 loss=3.982, nll_loss=2.343, ppl=5.07, wps=464399, ups=1.07, wpb=434900, bsz=16471.8, num_updates=46500, lr=0.000293294, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=45801 epoch 028: 971 / 1689 loss=3.982, nll_loss=2.343, ppl=5.07, wps=464399, ups=1.07, wpb=434900, bsz=16471.8, num_updates=46500, lr=0.000293294, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=45801 epoch 028: 971 / 1689 loss=3.982, nll_loss=2.343, ppl=5.07, wps=464399, ups=1.07, wpb=434900, bsz=16471.8, num_updates=46500, lr=0.000293294, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=45801 epoch 028: 971 / 1689 loss=3.982, nll_loss=2.343, ppl=5.07, wps=464399, ups=1.07, wpb=434900, bsz=16471.8, num_updates=46500, lr=0.000293294, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=45801 epoch 028: 971 / 1689 loss=3.982, nll_loss=2.343, ppl=5.07, wps=464399, ups=1.07, wpb=434900, bsz=16471.8, num_updates=46500, lr=0.000293294, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=45801 epoch 028: 971 / 1689 loss=3.982, nll_loss=2.343, ppl=5.07, wps=464399, ups=1.07, wpb=434900, bsz=16471.8, num_updates=46500, lr=0.000293294, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=45801 epoch 028: 971 / 1689 loss=3.982, nll_loss=2.343, ppl=5.07, wps=464399, ups=1.07, wpb=434900, bsz=16471.8, num_updates=46500, lr=0.000293294, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=45801 epoch 028: 971 / 1689 loss=3.982, nll_loss=2.343, ppl=5.07, wps=464399, ups=1.07, wpb=434900, bsz=16471.8, num_updates=46500, lr=0.000293294, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=45801 epoch 028: 971 / 1689 loss=3.982, nll_loss=2.343, ppl=5.07, wps=464399, ups=1.07, wpb=434900, bsz=16471.8, num_updates=46500, lr=0.000293294, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=45801 epoch 028: 971 / 1689 loss=3.982, nll_loss=2.343, ppl=5.07, wps=464399, ups=1.07, wpb=434900, bsz=16471.8, num_updates=46500, lr=0.000293294, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=45801 epoch 028: 971 / 1689 loss=3.982, nll_loss=2.343, ppl=5.07, wps=464399, ups=1.07, wpb=434900, bsz=16471.8, num_updates=46500, lr=0.000293294, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=45801 epoch 028: 971 / 1689 loss=3.982, nll_loss=2.343, ppl=5.07, wps=464399, ups=1.07, wpb=434900, bsz=16471.8, num_updates=46500, lr=0.000293294, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=45801 epoch 028: 971 / 1689 loss=3.982, nll_loss=2.343, ppl=5.07, wps=464399, ups=1.07, wpb=434900, bsz=16471.8, num_updates=46500, lr=0.000293294, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=45801 epoch 028: 971 / 1689 loss=3.982, nll_loss=2.343, ppl=5.07, wps=464399, ups=1.07, wpb=434900, bsz=16471.8, num_updates=46500, lr=0.000293294, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=45801 epoch 028: 971 / 1689 loss=3.982, nll_loss=2.343, ppl=5.07, wps=464399, ups=1.07, wpb=434900, bsz=16471.8, num_updates=46500, lr=0.000293294, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=45801 epoch 028: 971 / 1689 loss=3.982, nll_loss=2.343, ppl=5.07, wps=464399, ups=1.07, wpb=434900, bsz=16471.8, num_updates=46500, lr=0.000293294, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=45801 epoch 028: 971 / 1689 loss=3.982, nll_loss=2.343, ppl=5.07, wps=464399, ups=1.07, wpb=434900, bsz=16471.8, num_updates=46500, lr=0.000293294, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=45801 epoch 028: 971 / 1689 loss=3.982, nll_loss=2.343, ppl=5.07, wps=464399, ups=1.07, wpb=434900, bsz=16471.8, num_updates=46500, lr=0.000293294, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=45801 epoch 028: 971 / 1689 loss=3.982, nll_loss=2.343, ppl=5.07, wps=464399, ups=1.07, wpb=434900, bsz=16471.8, num_updates=46500, lr=0.000293294, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=45801 epoch 028: 971 / 1689 loss=3.982, nll_loss=2.343, ppl=5.07, wps=464399, ups=1.07, wpb=434900, bsz=16471.8, num_updates=46500, lr=0.000293294, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=45801 epoch 028: 971 / 1689 loss=3.982, nll_loss=2.343, ppl=5.07, wps=464399, ups=1.07, wpb=434900, bsz=16471.8, num_updates=46500, lr=0.000293294, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=45801 epoch 028: 971 / 1689 loss=3.982, nll_loss=2.343, ppl=5.07, wps=464399, ups=1.07, wpb=434900, bsz=16471.8, num_updates=46500, lr=0.000293294, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=45801 epoch 028: 971 / 1689 loss=3.982, nll_loss=2.343, ppl=5.07, wps=464399, ups=1.07, wpb=434900, bsz=16471.8, num_updates=46500, lr=0.000293294, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=45801 epoch 028: 971 / 1689 loss=3.982, nll_loss=2.343, ppl=5.07, wps=464399, ups=1.07, wpb=434900, bsz=16471.8, num_updates=46500, lr=0.000293294, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=45801 epoch 028: 971 / 1689 loss=3.982, nll_loss=2.343, ppl=5.07, wps=464399, ups=1.07, wpb=434900, bsz=16471.8, num_updates=46500, lr=0.000293294, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=45801 epoch 028: 971 / 1689 loss=3.982, nll_loss=2.343, ppl=5.07, wps=464399, ups=1.07, wpb=434900, bsz=16471.8, num_updates=46500, lr=0.000293294, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=45801 epoch 028: 971 / 1689 loss=3.982, nll_loss=2.343, ppl=5.07, wps=464399, ups=1.07, wpb=434900, bsz=16471.8, num_updates=46500, lr=0.000293294, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=45801 epoch 028: 1072 / 1689 loss=3.985, nll_loss=2.346, ppl=5.09, wps=455519, ups=1.05, wpb=432575, bsz=16076.6, num_updates=46600, lr=0.000292979, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.6, wall=45896 epoch 028: 1072 / 1689 loss=3.985, nll_loss=2.346, ppl=5.09, wps=455519, ups=1.05, wpb=432575, bsz=16076.6, num_updates=46600, lr=0.000292979, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.6, wall=45896 epoch 028: 1072 / 1689 loss=3.985, nll_loss=2.346, ppl=5.09, wps=455519, ups=1.05, wpb=432575, bsz=16076.6, num_updates=46600, lr=0.000292979, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.6, wall=45896 epoch 028: 1072 / 1689 loss=3.985, nll_loss=2.346, ppl=5.09, wps=455519, ups=1.05, wpb=432575, bsz=16076.6, num_updates=46600, lr=0.000292979, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.6, wall=45896 epoch 028: 1072 / 1689 loss=3.985, nll_loss=2.346, ppl=5.09, wps=455519, ups=1.05, wpb=432575, bsz=16076.6, num_updates=46600, lr=0.000292979, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.6, wall=45896 epoch 028: 1072 / 1689 loss=3.985, nll_loss=2.346, ppl=5.09, wps=455519, ups=1.05, wpb=432575, bsz=16076.6, num_updates=46600, lr=0.000292979, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.6, wall=45896 epoch 028: 1072 / 1689 loss=3.985, nll_loss=2.346, ppl=5.09, wps=455519, ups=1.05, wpb=432575, bsz=16076.6, num_updates=46600, lr=0.000292979, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.6, wall=45896 epoch 028: 1072 / 1689 loss=3.985, nll_loss=2.346, ppl=5.09, wps=455519, ups=1.05, wpb=432575, bsz=16076.6, num_updates=46600, lr=0.000292979, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.6, wall=45896 epoch 028: 1072 / 1689 loss=3.985, nll_loss=2.346, ppl=5.09, wps=455519, ups=1.05, wpb=432575, bsz=16076.6, num_updates=46600, lr=0.000292979, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.6, wall=45896 epoch 028: 1072 / 1689 loss=3.985, nll_loss=2.346, ppl=5.09, wps=455519, ups=1.05, wpb=432575, bsz=16076.6, num_updates=46600, lr=0.000292979, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.6, wall=45896 epoch 028: 1072 / 1689 loss=3.985, nll_loss=2.346, ppl=5.09, wps=455519, ups=1.05, wpb=432575, bsz=16076.6, num_updates=46600, lr=0.000292979, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.6, wall=45896 epoch 028: 1072 / 1689 loss=3.985, nll_loss=2.346, ppl=5.09, wps=455519, ups=1.05, wpb=432575, bsz=16076.6, num_updates=46600, lr=0.000292979, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.6, wall=45896 epoch 028: 1072 / 1689 loss=3.985, nll_loss=2.346, ppl=5.09, wps=455519, ups=1.05, wpb=432575, bsz=16076.6, num_updates=46600, lr=0.000292979, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.6, wall=45896 epoch 028: 1072 / 1689 loss=3.985, nll_loss=2.346, ppl=5.09, wps=455519, ups=1.05, wpb=432575, bsz=16076.6, num_updates=46600, lr=0.000292979, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.6, wall=45896 epoch 028: 1072 / 1689 loss=3.985, nll_loss=2.346, ppl=5.09, wps=455519, ups=1.05, wpb=432575, bsz=16076.6, num_updates=46600, lr=0.000292979, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.6, wall=45896 epoch 028: 1072 / 1689 loss=3.985, nll_loss=2.346, ppl=5.09, wps=455519, ups=1.05, wpb=432575, bsz=16076.6, num_updates=46600, lr=0.000292979, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.6, wall=45896 epoch 028: 1072 / 1689 loss=3.985, nll_loss=2.346, ppl=5.09, wps=455519, ups=1.05, wpb=432575, bsz=16076.6, num_updates=46600, lr=0.000292979, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.6, wall=45896 epoch 028: 1072 / 1689 loss=3.985, nll_loss=2.346, ppl=5.09, wps=455519, ups=1.05, wpb=432575, bsz=16076.6, num_updates=46600, lr=0.000292979, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.6, wall=45896 epoch 028: 1072 / 1689 loss=3.985, nll_loss=2.346, ppl=5.09, wps=455519, ups=1.05, wpb=432575, bsz=16076.6, num_updates=46600, lr=0.000292979, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.6, wall=45896 epoch 028: 1072 / 1689 loss=3.985, nll_loss=2.346, ppl=5.09, wps=455519, ups=1.05, wpb=432575, bsz=16076.6, num_updates=46600, lr=0.000292979, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.6, wall=45896 epoch 028: 1072 / 1689 loss=3.985, nll_loss=2.346, ppl=5.09, wps=455519, ups=1.05, wpb=432575, bsz=16076.6, num_updates=46600, lr=0.000292979, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.6, wall=45896 epoch 028: 1072 / 1689 loss=3.985, nll_loss=2.346, ppl=5.09, wps=455519, ups=1.05, wpb=432575, bsz=16076.6, num_updates=46600, lr=0.000292979, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.6, wall=45896 epoch 028: 1072 / 1689 loss=3.985, nll_loss=2.346, ppl=5.09, wps=455519, ups=1.05, wpb=432575, bsz=16076.6, num_updates=46600, lr=0.000292979, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.6, wall=45896 epoch 028: 1072 / 1689 loss=3.985, nll_loss=2.346, ppl=5.09, wps=455519, ups=1.05, wpb=432575, bsz=16076.6, num_updates=46600, lr=0.000292979, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.6, wall=45896 epoch 028: 1072 / 1689 loss=3.985, nll_loss=2.346, ppl=5.09, wps=455519, ups=1.05, wpb=432575, bsz=16076.6, num_updates=46600, lr=0.000292979, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.6, wall=45896 epoch 028: 1072 / 1689 loss=3.985, nll_loss=2.346, ppl=5.09, wps=455519, ups=1.05, wpb=432575, bsz=16076.6, num_updates=46600, lr=0.000292979, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.6, wall=45896 epoch 028: 1072 / 1689 loss=3.985, nll_loss=2.346, ppl=5.09, wps=455519, ups=1.05, wpb=432575, bsz=16076.6, num_updates=46600, lr=0.000292979, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.6, wall=45896 epoch 028: 1072 / 1689 loss=3.985, nll_loss=2.346, ppl=5.09, wps=455519, ups=1.05, wpb=432575, bsz=16076.6, num_updates=46600, lr=0.000292979, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.6, wall=45896 epoch 028: 1172 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=461448, ups=1.06, wpb=434016, bsz=16318.5, num_updates=46700, lr=0.000292666, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=45990 epoch 028: 1172 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=461448, ups=1.06, wpb=434016, bsz=16318.5, num_updates=46700, lr=0.000292666, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=45990 epoch 028: 1172 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=461448, ups=1.06, wpb=434016, bsz=16318.5, num_updates=46700, lr=0.000292666, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=45990 epoch 028: 1172 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=461448, ups=1.06, wpb=434016, bsz=16318.5, num_updates=46700, lr=0.000292666, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=45990 epoch 028: 1172 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=461448, ups=1.06, wpb=434016, bsz=16318.5, num_updates=46700, lr=0.000292666, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=45990 epoch 028: 1172 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=461448, ups=1.06, wpb=434016, bsz=16318.5, num_updates=46700, lr=0.000292666, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=45990 epoch 028: 1172 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=461448, ups=1.06, wpb=434016, bsz=16318.5, num_updates=46700, lr=0.000292666, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=45990 epoch 028: 1172 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=461448, ups=1.06, wpb=434016, bsz=16318.5, num_updates=46700, lr=0.000292666, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=45990 epoch 028: 1172 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=461448, ups=1.06, wpb=434016, bsz=16318.5, num_updates=46700, lr=0.000292666, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=45990 epoch 028: 1172 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=461448, ups=1.06, wpb=434016, bsz=16318.5, num_updates=46700, lr=0.000292666, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=45990 epoch 028: 1172 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=461448, ups=1.06, wpb=434016, bsz=16318.5, num_updates=46700, lr=0.000292666, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=45990 epoch 028: 1172 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=461448, ups=1.06, wpb=434016, bsz=16318.5, num_updates=46700, lr=0.000292666, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=45990 epoch 028: 1172 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=461448, ups=1.06, wpb=434016, bsz=16318.5, num_updates=46700, lr=0.000292666, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=45990 epoch 028: 1172 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=461448, ups=1.06, wpb=434016, bsz=16318.5, num_updates=46700, lr=0.000292666, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=45990 epoch 028: 1172 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=461448, ups=1.06, wpb=434016, bsz=16318.5, num_updates=46700, lr=0.000292666, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=45990 epoch 028: 1172 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=461448, ups=1.06, wpb=434016, bsz=16318.5, num_updates=46700, lr=0.000292666, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=45990 epoch 028: 1172 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=461448, ups=1.06, wpb=434016, bsz=16318.5, num_updates=46700, lr=0.000292666, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=45990 epoch 028: 1172 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=461448, ups=1.06, wpb=434016, bsz=16318.5, num_updates=46700, lr=0.000292666, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=45990 epoch 028: 1172 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=461448, ups=1.06, wpb=434016, bsz=16318.5, num_updates=46700, lr=0.000292666, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=45990 epoch 028: 1172 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=461448, ups=1.06, wpb=434016, bsz=16318.5, num_updates=46700, lr=0.000292666, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=45990 epoch 028: 1172 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=461448, ups=1.06, wpb=434016, bsz=16318.5, num_updates=46700, lr=0.000292666, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=45990 epoch 028: 1172 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=461448, ups=1.06, wpb=434016, bsz=16318.5, num_updates=46700, lr=0.000292666, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=45990 epoch 028: 1172 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=461448, ups=1.06, wpb=434016, bsz=16318.5, num_updates=46700, lr=0.000292666, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=45990 epoch 028: 1172 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=461448, ups=1.06, wpb=434016, bsz=16318.5, num_updates=46700, lr=0.000292666, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=45990 epoch 028: 1172 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=461448, ups=1.06, wpb=434016, bsz=16318.5, num_updates=46700, lr=0.000292666, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=45990 epoch 028: 1172 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=461448, ups=1.06, wpb=434016, bsz=16318.5, num_updates=46700, lr=0.000292666, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=45990 epoch 028: 1172 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=461448, ups=1.06, wpb=434016, bsz=16318.5, num_updates=46700, lr=0.000292666, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=45990 epoch 028: 1172 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=461448, ups=1.06, wpb=434016, bsz=16318.5, num_updates=46700, lr=0.000292666, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=45990 epoch 028: 1272 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=461172, ups=1.06, wpb=434554, bsz=16395.8, num_updates=46800, lr=0.000292353, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.2, wall=46084 epoch 028: 1272 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=461172, ups=1.06, wpb=434554, bsz=16395.8, num_updates=46800, lr=0.000292353, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.2, wall=46084 epoch 028: 1272 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=461172, ups=1.06, wpb=434554, bsz=16395.8, num_updates=46800, lr=0.000292353, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.2, wall=46084 epoch 028: 1272 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=461172, ups=1.06, wpb=434554, bsz=16395.8, num_updates=46800, lr=0.000292353, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.2, wall=46084 epoch 028: 1272 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=461172, ups=1.06, wpb=434554, bsz=16395.8, num_updates=46800, lr=0.000292353, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.2, wall=46084 epoch 028: 1272 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=461172, ups=1.06, wpb=434554, bsz=16395.8, num_updates=46800, lr=0.000292353, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.2, wall=46084 epoch 028: 1272 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=461172, ups=1.06, wpb=434554, bsz=16395.8, num_updates=46800, lr=0.000292353, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.2, wall=46084 epoch 028: 1272 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=461172, ups=1.06, wpb=434554, bsz=16395.8, num_updates=46800, lr=0.000292353, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.2, wall=46084 epoch 028: 1272 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=461172, ups=1.06, wpb=434554, bsz=16395.8, num_updates=46800, lr=0.000292353, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.2, wall=46084 epoch 028: 1272 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=461172, ups=1.06, wpb=434554, bsz=16395.8, num_updates=46800, lr=0.000292353, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.2, wall=46084 epoch 028: 1272 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=461172, ups=1.06, wpb=434554, bsz=16395.8, num_updates=46800, lr=0.000292353, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.2, wall=46084 epoch 028: 1272 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=461172, ups=1.06, wpb=434554, bsz=16395.8, num_updates=46800, lr=0.000292353, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.2, wall=46084 epoch 028: 1272 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=461172, ups=1.06, wpb=434554, bsz=16395.8, num_updates=46800, lr=0.000292353, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.2, wall=46084 epoch 028: 1272 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=461172, ups=1.06, wpb=434554, bsz=16395.8, num_updates=46800, lr=0.000292353, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.2, wall=46084 epoch 028: 1272 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=461172, ups=1.06, wpb=434554, bsz=16395.8, num_updates=46800, lr=0.000292353, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.2, wall=46084 epoch 028: 1272 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=461172, ups=1.06, wpb=434554, bsz=16395.8, num_updates=46800, lr=0.000292353, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.2, wall=46084 epoch 028: 1272 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=461172, ups=1.06, wpb=434554, bsz=16395.8, num_updates=46800, lr=0.000292353, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.2, wall=46084 epoch 028: 1272 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=461172, ups=1.06, wpb=434554, bsz=16395.8, num_updates=46800, lr=0.000292353, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.2, wall=46084 epoch 028: 1272 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=461172, ups=1.06, wpb=434554, bsz=16395.8, num_updates=46800, lr=0.000292353, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.2, wall=46084 epoch 028: 1272 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=461172, ups=1.06, wpb=434554, bsz=16395.8, num_updates=46800, lr=0.000292353, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.2, wall=46084 epoch 028: 1272 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=461172, ups=1.06, wpb=434554, bsz=16395.8, num_updates=46800, lr=0.000292353, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.2, wall=46084 epoch 028: 1272 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=461172, ups=1.06, wpb=434554, bsz=16395.8, num_updates=46800, lr=0.000292353, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.2, wall=46084 epoch 028: 1272 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=461172, ups=1.06, wpb=434554, bsz=16395.8, num_updates=46800, lr=0.000292353, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.2, wall=46084 epoch 028: 1272 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=461172, ups=1.06, wpb=434554, bsz=16395.8, num_updates=46800, lr=0.000292353, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.2, wall=46084 epoch 028: 1272 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=461172, ups=1.06, wpb=434554, bsz=16395.8, num_updates=46800, lr=0.000292353, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.2, wall=46084 epoch 028: 1272 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=461172, ups=1.06, wpb=434554, bsz=16395.8, num_updates=46800, lr=0.000292353, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.2, wall=46084 epoch 028: 1272 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=461172, ups=1.06, wpb=434554, bsz=16395.8, num_updates=46800, lr=0.000292353, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.2, wall=46084 epoch 028: 1272 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=461172, ups=1.06, wpb=434554, bsz=16395.8, num_updates=46800, lr=0.000292353, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.2, wall=46084 epoch 028: 1372 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=459312, ups=1.06, wpb=433617, bsz=16596.6, num_updates=46900, lr=0.000292041, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=46179 epoch 028: 1372 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=459312, ups=1.06, wpb=433617, bsz=16596.6, num_updates=46900, lr=0.000292041, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=46179 epoch 028: 1372 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=459312, ups=1.06, wpb=433617, bsz=16596.6, num_updates=46900, lr=0.000292041, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=46179 epoch 028: 1372 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=459312, ups=1.06, wpb=433617, bsz=16596.6, num_updates=46900, lr=0.000292041, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=46179 epoch 028: 1372 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=459312, ups=1.06, wpb=433617, bsz=16596.6, num_updates=46900, lr=0.000292041, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=46179 epoch 028: 1372 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=459312, ups=1.06, wpb=433617, bsz=16596.6, num_updates=46900, lr=0.000292041, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=46179 epoch 028: 1372 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=459312, ups=1.06, wpb=433617, bsz=16596.6, num_updates=46900, lr=0.000292041, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=46179 epoch 028: 1372 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=459312, ups=1.06, wpb=433617, bsz=16596.6, num_updates=46900, lr=0.000292041, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=46179 epoch 028: 1372 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=459312, ups=1.06, wpb=433617, bsz=16596.6, num_updates=46900, lr=0.000292041, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=46179 epoch 028: 1372 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=459312, ups=1.06, wpb=433617, bsz=16596.6, num_updates=46900, lr=0.000292041, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=46179 epoch 028: 1372 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=459312, ups=1.06, wpb=433617, bsz=16596.6, num_updates=46900, lr=0.000292041, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=46179 epoch 028: 1372 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=459312, ups=1.06, wpb=433617, bsz=16596.6, num_updates=46900, lr=0.000292041, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=46179 epoch 028: 1372 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=459312, ups=1.06, wpb=433617, bsz=16596.6, num_updates=46900, lr=0.000292041, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=46179 epoch 028: 1372 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=459312, ups=1.06, wpb=433617, bsz=16596.6, num_updates=46900, lr=0.000292041, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=46179 epoch 028: 1372 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=459312, ups=1.06, wpb=433617, bsz=16596.6, num_updates=46900, lr=0.000292041, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=46179 epoch 028: 1372 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=459312, ups=1.06, wpb=433617, bsz=16596.6, num_updates=46900, lr=0.000292041, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=46179 epoch 028: 1372 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=459312, ups=1.06, wpb=433617, bsz=16596.6, num_updates=46900, lr=0.000292041, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=46179 epoch 028: 1372 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=459312, ups=1.06, wpb=433617, bsz=16596.6, num_updates=46900, lr=0.000292041, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=46179 epoch 028: 1372 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=459312, ups=1.06, wpb=433617, bsz=16596.6, num_updates=46900, lr=0.000292041, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=46179 epoch 028: 1372 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=459312, ups=1.06, wpb=433617, bsz=16596.6, num_updates=46900, lr=0.000292041, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=46179 epoch 028: 1372 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=459312, ups=1.06, wpb=433617, bsz=16596.6, num_updates=46900, lr=0.000292041, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=46179 epoch 028: 1372 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=459312, ups=1.06, wpb=433617, bsz=16596.6, num_updates=46900, lr=0.000292041, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=46179 epoch 028: 1372 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=459312, ups=1.06, wpb=433617, bsz=16596.6, num_updates=46900, lr=0.000292041, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=46179 epoch 028: 1372 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=459312, ups=1.06, wpb=433617, bsz=16596.6, num_updates=46900, lr=0.000292041, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=46179 epoch 028: 1372 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=459312, ups=1.06, wpb=433617, bsz=16596.6, num_updates=46900, lr=0.000292041, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=46179 epoch 028: 1372 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=459312, ups=1.06, wpb=433617, bsz=16596.6, num_updates=46900, lr=0.000292041, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=46179 epoch 028: 1372 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=459312, ups=1.06, wpb=433617, bsz=16596.6, num_updates=46900, lr=0.000292041, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=46179 epoch 028: 1372 / 1689 loss=3.994, nll_loss=2.357, ppl=5.12, wps=459312, ups=1.06, wpb=433617, bsz=16596.6, num_updates=46900, lr=0.000292041, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=46179 epoch 028: 1472 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=459010, ups=1.06, wpb=431727, bsz=16658.5, num_updates=47000, lr=0.00029173, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=46273 epoch 028: 1472 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=459010, ups=1.06, wpb=431727, bsz=16658.5, num_updates=47000, lr=0.00029173, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=46273 epoch 028: 1472 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=459010, ups=1.06, wpb=431727, bsz=16658.5, num_updates=47000, lr=0.00029173, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=46273 epoch 028: 1472 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=459010, ups=1.06, wpb=431727, bsz=16658.5, num_updates=47000, lr=0.00029173, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=46273 epoch 028: 1472 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=459010, ups=1.06, wpb=431727, bsz=16658.5, num_updates=47000, lr=0.00029173, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=46273 epoch 028: 1472 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=459010, ups=1.06, wpb=431727, bsz=16658.5, num_updates=47000, lr=0.00029173, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=46273 epoch 028: 1472 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=459010, ups=1.06, wpb=431727, bsz=16658.5, num_updates=47000, lr=0.00029173, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=46273 epoch 028: 1472 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=459010, ups=1.06, wpb=431727, bsz=16658.5, num_updates=47000, lr=0.00029173, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=46273 epoch 028: 1472 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=459010, ups=1.06, wpb=431727, bsz=16658.5, num_updates=47000, lr=0.00029173, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=46273 epoch 028: 1472 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=459010, ups=1.06, wpb=431727, bsz=16658.5, num_updates=47000, lr=0.00029173, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=46273 epoch 028: 1472 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=459010, ups=1.06, wpb=431727, bsz=16658.5, num_updates=47000, lr=0.00029173, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=46273 epoch 028: 1472 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=459010, ups=1.06, wpb=431727, bsz=16658.5, num_updates=47000, lr=0.00029173, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=46273 epoch 028: 1472 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=459010, ups=1.06, wpb=431727, bsz=16658.5, num_updates=47000, lr=0.00029173, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=46273 epoch 028: 1472 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=459010, ups=1.06, wpb=431727, bsz=16658.5, num_updates=47000, lr=0.00029173, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=46273 epoch 028: 1472 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=459010, ups=1.06, wpb=431727, bsz=16658.5, num_updates=47000, lr=0.00029173, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=46273 epoch 028: 1472 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=459010, ups=1.06, wpb=431727, bsz=16658.5, num_updates=47000, lr=0.00029173, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=46273 epoch 028: 1472 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=459010, ups=1.06, wpb=431727, bsz=16658.5, num_updates=47000, lr=0.00029173, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=46273 epoch 028: 1472 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=459010, ups=1.06, wpb=431727, bsz=16658.5, num_updates=47000, lr=0.00029173, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=46273 epoch 028: 1472 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=459010, ups=1.06, wpb=431727, bsz=16658.5, num_updates=47000, lr=0.00029173, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=46273 epoch 028: 1472 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=459010, ups=1.06, wpb=431727, bsz=16658.5, num_updates=47000, lr=0.00029173, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=46273 epoch 028: 1472 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=459010, ups=1.06, wpb=431727, bsz=16658.5, num_updates=47000, lr=0.00029173, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=46273 epoch 028: 1472 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=459010, ups=1.06, wpb=431727, bsz=16658.5, num_updates=47000, lr=0.00029173, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=46273 epoch 028: 1472 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=459010, ups=1.06, wpb=431727, bsz=16658.5, num_updates=47000, lr=0.00029173, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=46273 epoch 028: 1472 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=459010, ups=1.06, wpb=431727, bsz=16658.5, num_updates=47000, lr=0.00029173, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=46273 epoch 028: 1472 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=459010, ups=1.06, wpb=431727, bsz=16658.5, num_updates=47000, lr=0.00029173, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=46273 epoch 028: 1472 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=459010, ups=1.06, wpb=431727, bsz=16658.5, num_updates=47000, lr=0.00029173, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=46273 epoch 028: 1472 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=459010, ups=1.06, wpb=431727, bsz=16658.5, num_updates=47000, lr=0.00029173, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=46273 epoch 028: 1472 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=459010, ups=1.06, wpb=431727, bsz=16658.5, num_updates=47000, lr=0.00029173, gnorm=0.237, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=46273 begin validation on "valid" subset epoch 028 | valid on 'valid' subset | loss 4.238 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.238 epoch 028 | valid on 'valid' subset | loss 4.238 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.238 epoch 028 | valid on 'valid' subset | loss 4.238 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.238 epoch 028 | valid on 'valid' subset | loss 4.238 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.238 epoch 028 | valid on 'valid' subset | loss 4.238 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.238 epoch 028 | valid on 'valid' subset | loss 4.238 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.238 epoch 028 | valid on 'valid' subset | loss 4.238 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.238 epoch 028 | valid on 'valid' subset | loss 4.238 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.238 epoch 028 | valid on 'valid' subset | loss 4.238 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.238 epoch 028 | valid on 'valid' subset | loss 4.238 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.238 epoch 028 | valid on 'valid' subset | loss 4.238 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.238 epoch 028 | valid on 'valid' subset | loss 4.238 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.238 epoch 028 | valid on 'valid' subset | loss 4.238 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.238 epoch 028 | valid on 'valid' subset | loss 4.238 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.238 epoch 028 | valid on 'valid' subset | loss 4.238 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.238 epoch 028 | valid on 'valid' subset | loss 4.238 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.238 epoch 028 | valid on 'valid' subset | loss 4.238 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.238 epoch 028 | valid on 'valid' subset | loss 4.238 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.238 epoch 028 | valid on 'valid' subset | loss 4.238 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.238 epoch 028 | valid on 'valid' subset | loss 4.238 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.238 epoch 028 | valid on 'valid' subset | loss 4.238 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.238 epoch 028 | valid on 'valid' subset | loss 4.238 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.238 epoch 028 | valid on 'valid' subset | loss 4.238 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.238 epoch 028 | valid on 'valid' subset | loss 4.238 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.238 epoch 028 | valid on 'valid' subset | loss 4.238 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.238 epoch 028 | valid on 'valid' subset | loss 4.238 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.238 epoch 028 | valid on 'valid' subset | loss 4.238 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.238 epoch 028 | valid on 'valid' subset | loss 4.238 | nll_loss 2.616 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 47000 | best_loss 4.238 epoch 028: 1572 / 1689 loss=3.986, nll_loss=2.348, ppl=5.09, wps=307258, ups=0.71, wpb=433812, bsz=16467.9, num_updates=47100, lr=0.00029142, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=111, gb_free=19.7, wall=46414 epoch 028: 1572 / 1689 loss=3.986, nll_loss=2.348, ppl=5.09, wps=307258, ups=0.71, wpb=433812, bsz=16467.9, num_updates=47100, lr=0.00029142, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=111, gb_free=19.7, wall=46414 epoch 028: 1572 / 1689 loss=3.986, nll_loss=2.348, ppl=5.09, wps=307258, ups=0.71, wpb=433812, bsz=16467.9, num_updates=47100, lr=0.00029142, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=111, gb_free=19.7, wall=46414 epoch 028: 1572 / 1689 loss=3.986, nll_loss=2.348, ppl=5.09, wps=307258, ups=0.71, wpb=433812, bsz=16467.9, num_updates=47100, lr=0.00029142, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=111, gb_free=19.7, wall=46414 epoch 028: 1572 / 1689 loss=3.986, nll_loss=2.348, ppl=5.09, wps=307258, ups=0.71, wpb=433812, bsz=16467.9, num_updates=47100, lr=0.00029142, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=111, gb_free=19.7, wall=46414 epoch 028: 1572 / 1689 loss=3.986, nll_loss=2.348, ppl=5.09, wps=307258, ups=0.71, wpb=433812, bsz=16467.9, num_updates=47100, lr=0.00029142, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=111, gb_free=19.7, wall=46414 epoch 028: 1572 / 1689 loss=3.986, nll_loss=2.348, ppl=5.09, wps=307258, ups=0.71, wpb=433812, bsz=16467.9, num_updates=47100, lr=0.00029142, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=111, gb_free=19.7, wall=46414 epoch 028: 1572 / 1689 loss=3.986, nll_loss=2.348, ppl=5.09, wps=307258, ups=0.71, wpb=433812, bsz=16467.9, num_updates=47100, lr=0.00029142, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=111, gb_free=19.7, wall=46414 epoch 028: 1572 / 1689 loss=3.986, nll_loss=2.348, ppl=5.09, wps=307258, ups=0.71, wpb=433812, bsz=16467.9, num_updates=47100, lr=0.00029142, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=111, gb_free=19.7, wall=46414 epoch 028: 1572 / 1689 loss=3.986, nll_loss=2.348, ppl=5.09, wps=307258, ups=0.71, wpb=433812, bsz=16467.9, num_updates=47100, lr=0.00029142, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=111, gb_free=19.7, wall=46414 epoch 028: 1572 / 1689 loss=3.986, nll_loss=2.348, ppl=5.09, wps=307258, ups=0.71, wpb=433812, bsz=16467.9, num_updates=47100, lr=0.00029142, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=111, gb_free=19.7, wall=46414 epoch 028: 1572 / 1689 loss=3.986, nll_loss=2.348, ppl=5.09, wps=307258, ups=0.71, wpb=433812, bsz=16467.9, num_updates=47100, lr=0.00029142, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=111, gb_free=19.7, wall=46414 epoch 028: 1572 / 1689 loss=3.986, nll_loss=2.348, ppl=5.09, wps=307258, ups=0.71, wpb=433812, bsz=16467.9, num_updates=47100, lr=0.00029142, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=111, gb_free=19.7, wall=46414 epoch 028: 1572 / 1689 loss=3.986, nll_loss=2.348, ppl=5.09, wps=307258, ups=0.71, wpb=433812, bsz=16467.9, num_updates=47100, lr=0.00029142, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=111, gb_free=19.7, wall=46414 epoch 028: 1572 / 1689 loss=3.986, nll_loss=2.348, ppl=5.09, wps=307258, ups=0.71, wpb=433812, bsz=16467.9, num_updates=47100, lr=0.00029142, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=111, gb_free=19.7, wall=46414 epoch 028: 1572 / 1689 loss=3.986, nll_loss=2.348, ppl=5.09, wps=307258, ups=0.71, wpb=433812, bsz=16467.9, num_updates=47100, lr=0.00029142, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=111, gb_free=19.7, wall=46414 epoch 028: 1572 / 1689 loss=3.986, nll_loss=2.348, ppl=5.09, wps=307258, ups=0.71, wpb=433812, bsz=16467.9, num_updates=47100, lr=0.00029142, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=111, gb_free=19.7, wall=46414 epoch 028: 1572 / 1689 loss=3.986, nll_loss=2.348, ppl=5.09, wps=307258, ups=0.71, wpb=433812, bsz=16467.9, num_updates=47100, lr=0.00029142, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=111, gb_free=19.7, wall=46414 epoch 028: 1572 / 1689 loss=3.986, nll_loss=2.348, ppl=5.09, wps=307258, ups=0.71, wpb=433812, bsz=16467.9, num_updates=47100, lr=0.00029142, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=111, gb_free=19.7, wall=46414 epoch 028: 1572 / 1689 loss=3.986, nll_loss=2.348, ppl=5.09, wps=307258, ups=0.71, wpb=433812, bsz=16467.9, num_updates=47100, lr=0.00029142, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=111, gb_free=19.7, wall=46414 epoch 028: 1572 / 1689 loss=3.986, nll_loss=2.348, ppl=5.09, wps=307258, ups=0.71, wpb=433812, bsz=16467.9, num_updates=47100, lr=0.00029142, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=111, gb_free=19.7, wall=46414 epoch 028: 1572 / 1689 loss=3.986, nll_loss=2.348, ppl=5.09, wps=307258, ups=0.71, wpb=433812, bsz=16467.9, num_updates=47100, lr=0.00029142, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=111, gb_free=19.7, wall=46414 epoch 028: 1572 / 1689 loss=3.986, nll_loss=2.348, ppl=5.09, wps=307258, ups=0.71, wpb=433812, bsz=16467.9, num_updates=47100, lr=0.00029142, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=111, gb_free=19.7, wall=46414 epoch 028: 1572 / 1689 loss=3.986, nll_loss=2.348, ppl=5.09, wps=307258, ups=0.71, wpb=433812, bsz=16467.9, num_updates=47100, lr=0.00029142, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=111, gb_free=19.7, wall=46414 epoch 028: 1572 / 1689 loss=3.986, nll_loss=2.348, ppl=5.09, wps=307258, ups=0.71, wpb=433812, bsz=16467.9, num_updates=47100, lr=0.00029142, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=111, gb_free=19.7, wall=46414 epoch 028: 1572 / 1689 loss=3.986, nll_loss=2.348, ppl=5.09, wps=307258, ups=0.71, wpb=433812, bsz=16467.9, num_updates=47100, lr=0.00029142, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=111, gb_free=19.7, wall=46414 epoch 028: 1572 / 1689 loss=3.986, nll_loss=2.348, ppl=5.09, wps=307258, ups=0.71, wpb=433812, bsz=16467.9, num_updates=47100, lr=0.00029142, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=111, gb_free=19.7, wall=46414 epoch 028: 1572 / 1689 loss=3.986, nll_loss=2.348, ppl=5.09, wps=307258, ups=0.71, wpb=433812, bsz=16467.9, num_updates=47100, lr=0.00029142, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=111, gb_free=19.7, wall=46414 epoch 028: 1672 / 1689 loss=3.985, nll_loss=2.347, ppl=5.09, wps=459953, ups=1.05, wpb=436557, bsz=16721.7, num_updates=47200, lr=0.000291111, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=46509 epoch 028: 1672 / 1689 loss=3.985, nll_loss=2.347, ppl=5.09, wps=459953, ups=1.05, wpb=436557, bsz=16721.7, num_updates=47200, lr=0.000291111, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=46509 epoch 028: 1672 / 1689 loss=3.985, nll_loss=2.347, ppl=5.09, wps=459953, ups=1.05, wpb=436557, bsz=16721.7, num_updates=47200, lr=0.000291111, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=46509 epoch 028: 1672 / 1689 loss=3.985, nll_loss=2.347, ppl=5.09, wps=459953, ups=1.05, wpb=436557, bsz=16721.7, num_updates=47200, lr=0.000291111, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=46509 epoch 028: 1672 / 1689 loss=3.985, nll_loss=2.347, ppl=5.09, wps=459953, ups=1.05, wpb=436557, bsz=16721.7, num_updates=47200, lr=0.000291111, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=46509 epoch 028: 1672 / 1689 loss=3.985, nll_loss=2.347, ppl=5.09, wps=459953, ups=1.05, wpb=436557, bsz=16721.7, num_updates=47200, lr=0.000291111, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=46509 epoch 028: 1672 / 1689 loss=3.985, nll_loss=2.347, ppl=5.09, wps=459953, ups=1.05, wpb=436557, bsz=16721.7, num_updates=47200, lr=0.000291111, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=46509 epoch 028: 1672 / 1689 loss=3.985, nll_loss=2.347, ppl=5.09, wps=459953, ups=1.05, wpb=436557, bsz=16721.7, num_updates=47200, lr=0.000291111, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=46509 epoch 028: 1672 / 1689 loss=3.985, nll_loss=2.347, ppl=5.09, wps=459953, ups=1.05, wpb=436557, bsz=16721.7, num_updates=47200, lr=0.000291111, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=46509 epoch 028: 1672 / 1689 loss=3.985, nll_loss=2.347, ppl=5.09, wps=459953, ups=1.05, wpb=436557, bsz=16721.7, num_updates=47200, lr=0.000291111, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=46509 epoch 028: 1672 / 1689 loss=3.985, nll_loss=2.347, ppl=5.09, wps=459953, ups=1.05, wpb=436557, bsz=16721.7, num_updates=47200, lr=0.000291111, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=46509 epoch 028: 1672 / 1689 loss=3.985, nll_loss=2.347, ppl=5.09, wps=459953, ups=1.05, wpb=436557, bsz=16721.7, num_updates=47200, lr=0.000291111, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=46509 epoch 028: 1672 / 1689 loss=3.985, nll_loss=2.347, ppl=5.09, wps=459953, ups=1.05, wpb=436557, bsz=16721.7, num_updates=47200, lr=0.000291111, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=46509 epoch 028: 1672 / 1689 loss=3.985, nll_loss=2.347, ppl=5.09, wps=459953, ups=1.05, wpb=436557, bsz=16721.7, num_updates=47200, lr=0.000291111, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=46509 epoch 028: 1672 / 1689 loss=3.985, nll_loss=2.347, ppl=5.09, wps=459953, ups=1.05, wpb=436557, bsz=16721.7, num_updates=47200, lr=0.000291111, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=46509 epoch 028: 1672 / 1689 loss=3.985, nll_loss=2.347, ppl=5.09, wps=459953, ups=1.05, wpb=436557, bsz=16721.7, num_updates=47200, lr=0.000291111, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=46509 epoch 028: 1672 / 1689 loss=3.985, nll_loss=2.347, ppl=5.09, wps=459953, ups=1.05, wpb=436557, bsz=16721.7, num_updates=47200, lr=0.000291111, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=46509 epoch 028: 1672 / 1689 loss=3.985, nll_loss=2.347, ppl=5.09, wps=459953, ups=1.05, wpb=436557, bsz=16721.7, num_updates=47200, lr=0.000291111, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=46509 epoch 028: 1672 / 1689 loss=3.985, nll_loss=2.347, ppl=5.09, wps=459953, ups=1.05, wpb=436557, bsz=16721.7, num_updates=47200, lr=0.000291111, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=46509 epoch 028: 1672 / 1689 loss=3.985, nll_loss=2.347, ppl=5.09, wps=459953, ups=1.05, wpb=436557, bsz=16721.7, num_updates=47200, lr=0.000291111, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=46509 epoch 028: 1672 / 1689 loss=3.985, nll_loss=2.347, ppl=5.09, wps=459953, ups=1.05, wpb=436557, bsz=16721.7, num_updates=47200, lr=0.000291111, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=46509 epoch 028: 1672 / 1689 loss=3.985, nll_loss=2.347, ppl=5.09, wps=459953, ups=1.05, wpb=436557, bsz=16721.7, num_updates=47200, lr=0.000291111, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=46509 epoch 028: 1672 / 1689 loss=3.985, nll_loss=2.347, ppl=5.09, wps=459953, ups=1.05, wpb=436557, bsz=16721.7, num_updates=47200, lr=0.000291111, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=46509 epoch 028: 1672 / 1689 loss=3.985, nll_loss=2.347, ppl=5.09, wps=459953, ups=1.05, wpb=436557, bsz=16721.7, num_updates=47200, lr=0.000291111, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=46509 epoch 028: 1672 / 1689 loss=3.985, nll_loss=2.347, ppl=5.09, wps=459953, ups=1.05, wpb=436557, bsz=16721.7, num_updates=47200, lr=0.000291111, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=46509 epoch 028: 1672 / 1689 loss=3.985, nll_loss=2.347, ppl=5.09, wps=459953, ups=1.05, wpb=436557, bsz=16721.7, num_updates=47200, lr=0.000291111, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=46509 epoch 028: 1672 / 1689 loss=3.985, nll_loss=2.347, ppl=5.09, wps=459953, ups=1.05, wpb=436557, bsz=16721.7, num_updates=47200, lr=0.000291111, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=46509 epoch 028: 1672 / 1689 loss=3.985, nll_loss=2.347, ppl=5.09, wps=459953, ups=1.05, wpb=436557, bsz=16721.7, num_updates=47200, lr=0.000291111, gnorm=0.23, clip=0, loss_scale=1, train_wall=93, gb_free=20.7, wall=46509 end of epoch 28 (average epoch stats below) epoch 028 | loss 3.971 | nll_loss 2.331 | ppl 5.03 | wps 443211 | ups 1.02 | wpb 433516 | bsz 16505.4 | num_updates 47217 | lr 0.000291059 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 1586 | gb_free 21.1 | wall 46525 epoch 028 | loss 3.971 | nll_loss 2.331 | ppl 5.03 | wps 443211 | ups 1.02 | wpb 433516 | bsz 16505.4 | num_updates 47217 | lr 0.000291059 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 1586 | gb_free 21.1 | wall 46525 epoch 028 | loss 3.971 | nll_loss 2.331 | ppl 5.03 | wps 443211 | ups 1.02 | wpb 433516 | bsz 16505.4 | num_updates 47217 | lr 0.000291059 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 1586 | gb_free 21.1 | wall 46525 epoch 028 | loss 3.971 | nll_loss 2.331 | ppl 5.03 | wps 443211 | ups 1.02 | wpb 433516 | bsz 16505.4 | num_updates 47217 | lr 0.000291059 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 1586 | gb_free 21.1 | wall 46525 epoch 028 | loss 3.971 | nll_loss 2.331 | ppl 5.03 | wps 443211 | ups 1.02 | wpb 433516 | bsz 16505.4 | num_updates 47217 | lr 0.000291059 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 1586 | gb_free 21.1 | wall 46525 epoch 028 | loss 3.971 | nll_loss 2.331 | ppl 5.03 | wps 443211 | ups 1.02 | wpb 433516 | bsz 16505.4 | num_updates 47217 | lr 0.000291059 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 1586 | gb_free 21.1 | wall 46525 epoch 028 | loss 3.971 | nll_loss 2.331 | ppl 5.03 | wps 443211 | ups 1.02 | wpb 433516 | bsz 16505.4 | num_updates 47217 | lr 0.000291059 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 1586 | gb_free 21.1 | wall 46525 epoch 028 | loss 3.971 | nll_loss 2.331 | ppl 5.03 | wps 443211 | ups 1.02 | wpb 433516 | bsz 16505.4 | num_updates 47217 | lr 0.000291059 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 1586 | gb_free 21.1 | wall 46525 epoch 028 | loss 3.971 | nll_loss 2.331 | ppl 5.03 | wps 443211 | ups 1.02 | wpb 433516 | bsz 16505.4 | num_updates 47217 | lr 0.000291059 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 1586 | gb_free 21.1 | wall 46525 epoch 028 | loss 3.971 | nll_loss 2.331 | ppl 5.03 | wps 443211 | ups 1.02 | wpb 433516 | bsz 16505.4 | num_updates 47217 | lr 0.000291059 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 1586 | gb_free 21.1 | wall 46525 epoch 028 | loss 3.971 | nll_loss 2.331 | ppl 5.03 | wps 443211 | ups 1.02 | wpb 433516 | bsz 16505.4 | num_updates 47217 | lr 0.000291059 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 1586 | gb_free 21.1 | wall 46525 epoch 028 | loss 3.971 | nll_loss 2.331 | ppl 5.03 | wps 443211 | ups 1.02 | wpb 433516 | bsz 16505.4 | num_updates 47217 | lr 0.000291059 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 1586 | gb_free 21.1 | wall 46525 epoch 028 | loss 3.971 | nll_loss 2.331 | ppl 5.03 | wps 443211 | ups 1.02 | wpb 433516 | bsz 16505.4 | num_updates 47217 | lr 0.000291059 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 1586 | gb_free 21.1 | wall 46525 epoch 028 | loss 3.971 | nll_loss 2.331 | ppl 5.03 | wps 443211 | ups 1.02 | wpb 433516 | bsz 16505.4 | num_updates 47217 | lr 0.000291059 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 1586 | gb_free 21.1 | wall 46525 epoch 028 | loss 3.971 | nll_loss 2.331 | ppl 5.03 | wps 443211 | ups 1.02 | wpb 433516 | bsz 16505.4 | num_updates 47217 | lr 0.000291059 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 1586 | gb_free 21.1 | wall 46525 epoch 028 | loss 3.971 | nll_loss 2.331 | ppl 5.03 | wps 443211 | ups 1.02 | wpb 433516 | bsz 16505.4 | num_updates 47217 | lr 0.000291059 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 1586 | gb_free 21.1 | wall 46525 epoch 028 | loss 3.971 | nll_loss 2.331 | ppl 5.03 | wps 443211 | ups 1.02 | wpb 433516 | bsz 16505.4 | num_updates 47217 | lr 0.000291059 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 1586 | gb_free 21.1 | wall 46525 epoch 028 | loss 3.971 | nll_loss 2.331 | ppl 5.03 | wps 443211 | ups 1.02 | wpb 433516 | bsz 16505.4 | num_updates 47217 | lr 0.000291059 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 1586 | gb_free 21.1 | wall 46525 epoch 028 | loss 3.971 | nll_loss 2.331 | ppl 5.03 | wps 443211 | ups 1.02 | wpb 433516 | bsz 16505.4 | num_updates 47217 | lr 0.000291059 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 1586 | gb_free 21.1 | wall 46525 epoch 028 | loss 3.971 | nll_loss 2.331 | ppl 5.03 | wps 443211 | ups 1.02 | wpb 433516 | bsz 16505.4 | num_updates 47217 | lr 0.000291059 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 1586 | gb_free 21.1 | wall 46525 epoch 028 | loss 3.971 | nll_loss 2.331 | ppl 5.03 | wps 443211 | ups 1.02 | wpb 433516 | bsz 16505.4 | num_updates 47217 | lr 0.000291059 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 1586 | gb_free 21.1 | wall 46525 epoch 028 | loss 3.971 | nll_loss 2.331 | ppl 5.03 | wps 443211 | ups 1.02 | wpb 433516 | bsz 16505.4 | num_updates 47217 | lr 0.000291059 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 1586 | gb_free 21.1 | wall 46525 epoch 028 | loss 3.971 | nll_loss 2.331 | ppl 5.03 | wps 443211 | ups 1.02 | wpb 433516 | bsz 16505.4 | num_updates 47217 | lr 0.000291059 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 1586 | gb_free 21.1 | wall 46525 epoch 028 | loss 3.971 | nll_loss 2.331 | ppl 5.03 | wps 443211 | ups 1.02 | wpb 433516 | bsz 16505.4 | num_updates 47217 | lr 0.000291059 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 1586 | gb_free 21.1 | wall 46525 epoch 028 | loss 3.971 | nll_loss 2.331 | ppl 5.03 | wps 443211 | ups 1.02 | wpb 433516 | bsz 16505.4 | num_updates 47217 | lr 0.000291059 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 1586 | gb_free 21.1 | wall 46525 epoch 028 | loss 3.971 | nll_loss 2.331 | ppl 5.03 | wps 443211 | ups 1.02 | wpb 433516 | bsz 16505.4 | num_updates 47217 | lr 0.000291059 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 1586 | gb_free 21.1 | wall 46525 epoch 028 | loss 3.971 | nll_loss 2.331 | ppl 5.03 | wps 443211 | ups 1.02 | wpb 433516 | bsz 16505.4 | num_updates 47217 | lr 0.000291059 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 1586 | gb_free 21.1 | wall 46525 epoch 028 | loss 3.971 | nll_loss 2.331 | ppl 5.03 | wps 443211 | ups 1.02 | wpb 433516 | bsz 16505.4 | num_updates 47217 | lr 0.000291059 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 1586 | gb_free 21.1 | wall 46525 Start iterating over samples epoch 029: 83 / 1689 loss=3.953, nll_loss=2.309, ppl=4.95, wps=454574, ups=1.06, wpb=429913, bsz=16340.7, num_updates=47300, lr=0.000290803, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=46604 epoch 029: 83 / 1689 loss=3.953, nll_loss=2.309, ppl=4.95, wps=454574, ups=1.06, wpb=429913, bsz=16340.7, num_updates=47300, lr=0.000290803, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=46604 epoch 029: 83 / 1689 loss=3.953, nll_loss=2.309, ppl=4.95, wps=454574, ups=1.06, wpb=429913, bsz=16340.7, num_updates=47300, lr=0.000290803, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=46604 epoch 029: 83 / 1689 loss=3.953, nll_loss=2.309, ppl=4.95, wps=454574, ups=1.06, wpb=429913, bsz=16340.7, num_updates=47300, lr=0.000290803, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=46604 epoch 029: 83 / 1689 loss=3.953, nll_loss=2.309, ppl=4.95, wps=454574, ups=1.06, wpb=429913, bsz=16340.7, num_updates=47300, lr=0.000290803, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=46604 epoch 029: 83 / 1689 loss=3.953, nll_loss=2.309, ppl=4.95, wps=454574, ups=1.06, wpb=429913, bsz=16340.7, num_updates=47300, lr=0.000290803, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=46604 epoch 029: 83 / 1689 loss=3.953, nll_loss=2.309, ppl=4.95, wps=454574, ups=1.06, wpb=429913, bsz=16340.7, num_updates=47300, lr=0.000290803, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=46604 epoch 029: 83 / 1689 loss=3.953, nll_loss=2.309, ppl=4.95, wps=454574, ups=1.06, wpb=429913, bsz=16340.7, num_updates=47300, lr=0.000290803, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=46604 epoch 029: 83 / 1689 loss=3.953, nll_loss=2.309, ppl=4.95, wps=454574, ups=1.06, wpb=429913, bsz=16340.7, num_updates=47300, lr=0.000290803, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=46604 epoch 029: 83 / 1689 loss=3.953, nll_loss=2.309, ppl=4.95, wps=454574, ups=1.06, wpb=429913, bsz=16340.7, num_updates=47300, lr=0.000290803, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=46604 epoch 029: 83 / 1689 loss=3.953, nll_loss=2.309, ppl=4.95, wps=454574, ups=1.06, wpb=429913, bsz=16340.7, num_updates=47300, lr=0.000290803, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=46604 epoch 029: 83 / 1689 loss=3.953, nll_loss=2.309, ppl=4.95, wps=454574, ups=1.06, wpb=429913, bsz=16340.7, num_updates=47300, lr=0.000290803, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=46604 epoch 029: 83 / 1689 loss=3.953, nll_loss=2.309, ppl=4.95, wps=454574, ups=1.06, wpb=429913, bsz=16340.7, num_updates=47300, lr=0.000290803, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=46604 epoch 029: 83 / 1689 loss=3.953, nll_loss=2.309, ppl=4.95, wps=454574, ups=1.06, wpb=429913, bsz=16340.7, num_updates=47300, lr=0.000290803, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=46604 epoch 029: 83 / 1689 loss=3.953, nll_loss=2.309, ppl=4.95, wps=454574, ups=1.06, wpb=429913, bsz=16340.7, num_updates=47300, lr=0.000290803, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=46604 epoch 029: 83 / 1689 loss=3.953, nll_loss=2.309, ppl=4.95, wps=454574, ups=1.06, wpb=429913, bsz=16340.7, num_updates=47300, lr=0.000290803, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=46604 epoch 029: 83 / 1689 loss=3.953, nll_loss=2.309, ppl=4.95, wps=454574, ups=1.06, wpb=429913, bsz=16340.7, num_updates=47300, lr=0.000290803, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=46604 epoch 029: 83 / 1689 loss=3.953, nll_loss=2.309, ppl=4.95, wps=454574, ups=1.06, wpb=429913, bsz=16340.7, num_updates=47300, lr=0.000290803, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=46604 epoch 029: 83 / 1689 loss=3.953, nll_loss=2.309, ppl=4.95, wps=454574, ups=1.06, wpb=429913, bsz=16340.7, num_updates=47300, lr=0.000290803, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=46604 epoch 029: 83 / 1689 loss=3.953, nll_loss=2.309, ppl=4.95, wps=454574, ups=1.06, wpb=429913, bsz=16340.7, num_updates=47300, lr=0.000290803, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=46604 epoch 029: 83 / 1689 loss=3.953, nll_loss=2.309, ppl=4.95, wps=454574, ups=1.06, wpb=429913, bsz=16340.7, num_updates=47300, lr=0.000290803, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=46604 epoch 029: 83 / 1689 loss=3.953, nll_loss=2.309, ppl=4.95, wps=454574, ups=1.06, wpb=429913, bsz=16340.7, num_updates=47300, lr=0.000290803, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=46604 epoch 029: 83 / 1689 loss=3.953, nll_loss=2.309, ppl=4.95, wps=454574, ups=1.06, wpb=429913, bsz=16340.7, num_updates=47300, lr=0.000290803, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=46604 epoch 029: 83 / 1689 loss=3.953, nll_loss=2.309, ppl=4.95, wps=454574, ups=1.06, wpb=429913, bsz=16340.7, num_updates=47300, lr=0.000290803, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=46604 epoch 029: 83 / 1689 loss=3.953, nll_loss=2.309, ppl=4.95, wps=454574, ups=1.06, wpb=429913, bsz=16340.7, num_updates=47300, lr=0.000290803, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=46604 epoch 029: 83 / 1689 loss=3.953, nll_loss=2.309, ppl=4.95, wps=454574, ups=1.06, wpb=429913, bsz=16340.7, num_updates=47300, lr=0.000290803, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=46604 epoch 029: 83 / 1689 loss=3.953, nll_loss=2.309, ppl=4.95, wps=454574, ups=1.06, wpb=429913, bsz=16340.7, num_updates=47300, lr=0.000290803, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=46604 epoch 029: 83 / 1689 loss=3.953, nll_loss=2.309, ppl=4.95, wps=454574, ups=1.06, wpb=429913, bsz=16340.7, num_updates=47300, lr=0.000290803, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=46604 epoch 029: 83 / 1689 loss=3.953, nll_loss=2.309, ppl=4.95, wps=454574, ups=1.06, wpb=429913, bsz=16340.7, num_updates=47300, lr=0.000290803, gnorm=0.206, clip=0, loss_scale=1, train_wall=92, gb_free=19.8, wall=46604 epoch 029: 183 / 1689 loss=3.951, nll_loss=2.307, ppl=4.95, wps=455598, ups=1.05, wpb=434454, bsz=16928.6, num_updates=47400, lr=0.000290496, gnorm=0.222, clip=0, loss_scale=1, train_wall=94, gb_free=19.7, wall=46699 epoch 029: 183 / 1689 loss=3.951, nll_loss=2.307, ppl=4.95, wps=455598, ups=1.05, wpb=434454, bsz=16928.6, num_updates=47400, lr=0.000290496, gnorm=0.222, clip=0, loss_scale=1, train_wall=94, gb_free=19.7, wall=46699 epoch 029: 183 / 1689 loss=3.951, nll_loss=2.307, ppl=4.95, wps=455598, ups=1.05, wpb=434454, bsz=16928.6, num_updates=47400, lr=0.000290496, gnorm=0.222, clip=0, loss_scale=1, train_wall=94, gb_free=19.7, wall=46699 epoch 029: 183 / 1689 loss=3.951, nll_loss=2.307, ppl=4.95, wps=455598, ups=1.05, wpb=434454, bsz=16928.6, num_updates=47400, lr=0.000290496, gnorm=0.222, clip=0, loss_scale=1, train_wall=94, gb_free=19.7, wall=46699 epoch 029: 183 / 1689 loss=3.951, nll_loss=2.307, ppl=4.95, wps=455598, ups=1.05, wpb=434454, bsz=16928.6, num_updates=47400, lr=0.000290496, gnorm=0.222, clip=0, loss_scale=1, train_wall=94, gb_free=19.7, wall=46699 epoch 029: 183 / 1689 loss=3.951, nll_loss=2.307, ppl=4.95, wps=455598, ups=1.05, wpb=434454, bsz=16928.6, num_updates=47400, lr=0.000290496, gnorm=0.222, clip=0, loss_scale=1, train_wall=94, gb_free=19.7, wall=46699 epoch 029: 183 / 1689 loss=3.951, nll_loss=2.307, ppl=4.95, wps=455598, ups=1.05, wpb=434454, bsz=16928.6, num_updates=47400, lr=0.000290496, gnorm=0.222, clip=0, loss_scale=1, train_wall=94, gb_free=19.7, wall=46699 epoch 029: 183 / 1689 loss=3.951, nll_loss=2.307, ppl=4.95, wps=455598, ups=1.05, wpb=434454, bsz=16928.6, num_updates=47400, lr=0.000290496, gnorm=0.222, clip=0, loss_scale=1, train_wall=94, gb_free=19.7, wall=46699 epoch 029: 183 / 1689 loss=3.951, nll_loss=2.307, ppl=4.95, wps=455598, ups=1.05, wpb=434454, bsz=16928.6, num_updates=47400, lr=0.000290496, gnorm=0.222, clip=0, loss_scale=1, train_wall=94, gb_free=19.7, wall=46699 epoch 029: 183 / 1689 loss=3.951, nll_loss=2.307, ppl=4.95, wps=455598, ups=1.05, wpb=434454, bsz=16928.6, num_updates=47400, lr=0.000290496, gnorm=0.222, clip=0, loss_scale=1, train_wall=94, gb_free=19.7, wall=46699 epoch 029: 183 / 1689 loss=3.951, nll_loss=2.307, ppl=4.95, wps=455598, ups=1.05, wpb=434454, bsz=16928.6, num_updates=47400, lr=0.000290496, gnorm=0.222, clip=0, loss_scale=1, train_wall=94, gb_free=19.7, wall=46699 epoch 029: 183 / 1689 loss=3.951, nll_loss=2.307, ppl=4.95, wps=455598, ups=1.05, wpb=434454, bsz=16928.6, num_updates=47400, lr=0.000290496, gnorm=0.222, clip=0, loss_scale=1, train_wall=94, gb_free=19.7, wall=46699 epoch 029: 183 / 1689 loss=3.951, nll_loss=2.307, ppl=4.95, wps=455598, ups=1.05, wpb=434454, bsz=16928.6, num_updates=47400, lr=0.000290496, gnorm=0.222, clip=0, loss_scale=1, train_wall=94, gb_free=19.7, wall=46699 epoch 029: 183 / 1689 loss=3.951, nll_loss=2.307, ppl=4.95, wps=455598, ups=1.05, wpb=434454, bsz=16928.6, num_updates=47400, lr=0.000290496, gnorm=0.222, clip=0, loss_scale=1, train_wall=94, gb_free=19.7, wall=46699 epoch 029: 183 / 1689 loss=3.951, nll_loss=2.307, ppl=4.95, wps=455598, ups=1.05, wpb=434454, bsz=16928.6, num_updates=47400, lr=0.000290496, gnorm=0.222, clip=0, loss_scale=1, train_wall=94, gb_free=19.7, wall=46699 epoch 029: 183 / 1689 loss=3.951, nll_loss=2.307, ppl=4.95, wps=455598, ups=1.05, wpb=434454, bsz=16928.6, num_updates=47400, lr=0.000290496, gnorm=0.222, clip=0, loss_scale=1, train_wall=94, gb_free=19.7, wall=46699 epoch 029: 183 / 1689 loss=3.951, nll_loss=2.307, ppl=4.95, wps=455598, ups=1.05, wpb=434454, bsz=16928.6, num_updates=47400, lr=0.000290496, gnorm=0.222, clip=0, loss_scale=1, train_wall=94, gb_free=19.7, wall=46699 epoch 029: 183 / 1689 loss=3.951, nll_loss=2.307, ppl=4.95, wps=455598, ups=1.05, wpb=434454, bsz=16928.6, num_updates=47400, lr=0.000290496, gnorm=0.222, clip=0, loss_scale=1, train_wall=94, gb_free=19.7, wall=46699 epoch 029: 183 / 1689 loss=3.951, nll_loss=2.307, ppl=4.95, wps=455598, ups=1.05, wpb=434454, bsz=16928.6, num_updates=47400, lr=0.000290496, gnorm=0.222, clip=0, loss_scale=1, train_wall=94, gb_free=19.7, wall=46699 epoch 029: 183 / 1689 loss=3.951, nll_loss=2.307, ppl=4.95, wps=455598, ups=1.05, wpb=434454, bsz=16928.6, num_updates=47400, lr=0.000290496, gnorm=0.222, clip=0, loss_scale=1, train_wall=94, gb_free=19.7, wall=46699 epoch 029: 183 / 1689 loss=3.951, nll_loss=2.307, ppl=4.95, wps=455598, ups=1.05, wpb=434454, bsz=16928.6, num_updates=47400, lr=0.000290496, gnorm=0.222, clip=0, loss_scale=1, train_wall=94, gb_free=19.7, wall=46699 epoch 029: 183 / 1689 loss=3.951, nll_loss=2.307, ppl=4.95, wps=455598, ups=1.05, wpb=434454, bsz=16928.6, num_updates=47400, lr=0.000290496, gnorm=0.222, clip=0, loss_scale=1, train_wall=94, gb_free=19.7, wall=46699 epoch 029: 183 / 1689 loss=3.951, nll_loss=2.307, ppl=4.95, wps=455598, ups=1.05, wpb=434454, bsz=16928.6, num_updates=47400, lr=0.000290496, gnorm=0.222, clip=0, loss_scale=1, train_wall=94, gb_free=19.7, wall=46699 epoch 029: 183 / 1689 loss=3.951, nll_loss=2.307, ppl=4.95, wps=455598, ups=1.05, wpb=434454, bsz=16928.6, num_updates=47400, lr=0.000290496, gnorm=0.222, clip=0, loss_scale=1, train_wall=94, gb_free=19.7, wall=46699 epoch 029: 183 / 1689 loss=3.951, nll_loss=2.307, ppl=4.95, wps=455598, ups=1.05, wpb=434454, bsz=16928.6, num_updates=47400, lr=0.000290496, gnorm=0.222, clip=0, loss_scale=1, train_wall=94, gb_free=19.7, wall=46699 epoch 029: 183 / 1689 loss=3.951, nll_loss=2.307, ppl=4.95, wps=455598, ups=1.05, wpb=434454, bsz=16928.6, num_updates=47400, lr=0.000290496, gnorm=0.222, clip=0, loss_scale=1, train_wall=94, gb_free=19.7, wall=46699 epoch 029: 183 / 1689 loss=3.951, nll_loss=2.307, ppl=4.95, wps=455598, ups=1.05, wpb=434454, bsz=16928.6, num_updates=47400, lr=0.000290496, gnorm=0.222, clip=0, loss_scale=1, train_wall=94, gb_free=19.7, wall=46699 epoch 029: 183 / 1689 loss=3.951, nll_loss=2.307, ppl=4.95, wps=455598, ups=1.05, wpb=434454, bsz=16928.6, num_updates=47400, lr=0.000290496, gnorm=0.222, clip=0, loss_scale=1, train_wall=94, gb_free=19.7, wall=46699 epoch 029: 183 / 1689 loss=3.951, nll_loss=2.307, ppl=4.95, wps=455598, ups=1.05, wpb=434454, bsz=16928.6, num_updates=47400, lr=0.000290496, gnorm=0.222, clip=0, loss_scale=1, train_wall=94, gb_free=19.7, wall=46699 epoch 029: 283 / 1689 loss=3.949, nll_loss=2.305, ppl=4.94, wps=463190, ups=1.07, wpb=433404, bsz=16336.5, num_updates=47500, lr=0.000290191, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=46793 epoch 029: 283 / 1689 loss=3.949, nll_loss=2.305, ppl=4.94, wps=463190, ups=1.07, wpb=433404, bsz=16336.5, num_updates=47500, lr=0.000290191, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=46793 epoch 029: 283 / 1689 loss=3.949, nll_loss=2.305, ppl=4.94, wps=463190, ups=1.07, wpb=433404, bsz=16336.5, num_updates=47500, lr=0.000290191, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=46793 epoch 029: 283 / 1689 loss=3.949, nll_loss=2.305, ppl=4.94, wps=463190, ups=1.07, wpb=433404, bsz=16336.5, num_updates=47500, lr=0.000290191, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=46793 epoch 029: 283 / 1689 loss=3.949, nll_loss=2.305, ppl=4.94, wps=463190, ups=1.07, wpb=433404, bsz=16336.5, num_updates=47500, lr=0.000290191, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=46793 epoch 029: 283 / 1689 loss=3.949, nll_loss=2.305, ppl=4.94, wps=463190, ups=1.07, wpb=433404, bsz=16336.5, num_updates=47500, lr=0.000290191, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=46793 epoch 029: 283 / 1689 loss=3.949, nll_loss=2.305, ppl=4.94, wps=463190, ups=1.07, wpb=433404, bsz=16336.5, num_updates=47500, lr=0.000290191, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=46793 epoch 029: 283 / 1689 loss=3.949, nll_loss=2.305, ppl=4.94, wps=463190, ups=1.07, wpb=433404, bsz=16336.5, num_updates=47500, lr=0.000290191, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=46793 epoch 029: 283 / 1689 loss=3.949, nll_loss=2.305, ppl=4.94, wps=463190, ups=1.07, wpb=433404, bsz=16336.5, num_updates=47500, lr=0.000290191, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=46793 epoch 029: 283 / 1689 loss=3.949, nll_loss=2.305, ppl=4.94, wps=463190, ups=1.07, wpb=433404, bsz=16336.5, num_updates=47500, lr=0.000290191, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=46793 epoch 029: 283 / 1689 loss=3.949, nll_loss=2.305, ppl=4.94, wps=463190, ups=1.07, wpb=433404, bsz=16336.5, num_updates=47500, lr=0.000290191, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=46793 epoch 029: 283 / 1689 loss=3.949, nll_loss=2.305, ppl=4.94, wps=463190, ups=1.07, wpb=433404, bsz=16336.5, num_updates=47500, lr=0.000290191, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=46793 epoch 029: 283 / 1689 loss=3.949, nll_loss=2.305, ppl=4.94, wps=463190, ups=1.07, wpb=433404, bsz=16336.5, num_updates=47500, lr=0.000290191, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=46793 epoch 029: 283 / 1689 loss=3.949, nll_loss=2.305, ppl=4.94, wps=463190, ups=1.07, wpb=433404, bsz=16336.5, num_updates=47500, lr=0.000290191, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=46793 epoch 029: 283 / 1689 loss=3.949, nll_loss=2.305, ppl=4.94, wps=463190, ups=1.07, wpb=433404, bsz=16336.5, num_updates=47500, lr=0.000290191, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=46793 epoch 029: 283 / 1689 loss=3.949, nll_loss=2.305, ppl=4.94, wps=463190, ups=1.07, wpb=433404, bsz=16336.5, num_updates=47500, lr=0.000290191, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=46793 epoch 029: 283 / 1689 loss=3.949, nll_loss=2.305, ppl=4.94, wps=463190, ups=1.07, wpb=433404, bsz=16336.5, num_updates=47500, lr=0.000290191, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=46793 epoch 029: 283 / 1689 loss=3.949, nll_loss=2.305, ppl=4.94, wps=463190, ups=1.07, wpb=433404, bsz=16336.5, num_updates=47500, lr=0.000290191, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=46793 epoch 029: 283 / 1689 loss=3.949, nll_loss=2.305, ppl=4.94, wps=463190, ups=1.07, wpb=433404, bsz=16336.5, num_updates=47500, lr=0.000290191, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=46793 epoch 029: 283 / 1689 loss=3.949, nll_loss=2.305, ppl=4.94, wps=463190, ups=1.07, wpb=433404, bsz=16336.5, num_updates=47500, lr=0.000290191, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=46793 epoch 029: 283 / 1689 loss=3.949, nll_loss=2.305, ppl=4.94, wps=463190, ups=1.07, wpb=433404, bsz=16336.5, num_updates=47500, lr=0.000290191, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=46793 epoch 029: 283 / 1689 loss=3.949, nll_loss=2.305, ppl=4.94, wps=463190, ups=1.07, wpb=433404, bsz=16336.5, num_updates=47500, lr=0.000290191, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=46793 epoch 029: 283 / 1689 loss=3.949, nll_loss=2.305, ppl=4.94, wps=463190, ups=1.07, wpb=433404, bsz=16336.5, num_updates=47500, lr=0.000290191, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=46793 epoch 029: 283 / 1689 loss=3.949, nll_loss=2.305, ppl=4.94, wps=463190, ups=1.07, wpb=433404, bsz=16336.5, num_updates=47500, lr=0.000290191, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=46793 epoch 029: 283 / 1689 loss=3.949, nll_loss=2.305, ppl=4.94, wps=463190, ups=1.07, wpb=433404, bsz=16336.5, num_updates=47500, lr=0.000290191, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=46793 epoch 029: 283 / 1689 loss=3.949, nll_loss=2.305, ppl=4.94, wps=463190, ups=1.07, wpb=433404, bsz=16336.5, num_updates=47500, lr=0.000290191, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=46793 epoch 029: 283 / 1689 loss=3.949, nll_loss=2.305, ppl=4.94, wps=463190, ups=1.07, wpb=433404, bsz=16336.5, num_updates=47500, lr=0.000290191, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=46793 epoch 029: 283 / 1689 loss=3.949, nll_loss=2.305, ppl=4.94, wps=463190, ups=1.07, wpb=433404, bsz=16336.5, num_updates=47500, lr=0.000290191, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=46793 epoch 029: 283 / 1689 loss=3.949, nll_loss=2.305, ppl=4.94, wps=463190, ups=1.07, wpb=433404, bsz=16336.5, num_updates=47500, lr=0.000290191, gnorm=0.223, clip=0, loss_scale=1, train_wall=92, gb_free=19.6, wall=46793 epoch 029: 383 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=459181, ups=1.06, wpb=431156, bsz=16105.8, num_updates=47600, lr=0.000289886, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=46886 epoch 029: 383 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=459181, ups=1.06, wpb=431156, bsz=16105.8, num_updates=47600, lr=0.000289886, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=46886 epoch 029: 383 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=459181, ups=1.06, wpb=431156, bsz=16105.8, num_updates=47600, lr=0.000289886, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=46886 epoch 029: 383 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=459181, ups=1.06, wpb=431156, bsz=16105.8, num_updates=47600, lr=0.000289886, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=46886 epoch 029: 383 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=459181, ups=1.06, wpb=431156, bsz=16105.8, num_updates=47600, lr=0.000289886, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=46886 epoch 029: 383 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=459181, ups=1.06, wpb=431156, bsz=16105.8, num_updates=47600, lr=0.000289886, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=46886 epoch 029: 383 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=459181, ups=1.06, wpb=431156, bsz=16105.8, num_updates=47600, lr=0.000289886, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=46886 epoch 029: 383 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=459181, ups=1.06, wpb=431156, bsz=16105.8, num_updates=47600, lr=0.000289886, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=46886 epoch 029: 383 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=459181, ups=1.06, wpb=431156, bsz=16105.8, num_updates=47600, lr=0.000289886, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=46886 epoch 029: 383 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=459181, ups=1.06, wpb=431156, bsz=16105.8, num_updates=47600, lr=0.000289886, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=46886 epoch 029: 383 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=459181, ups=1.06, wpb=431156, bsz=16105.8, num_updates=47600, lr=0.000289886, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=46886 epoch 029: 383 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=459181, ups=1.06, wpb=431156, bsz=16105.8, num_updates=47600, lr=0.000289886, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=46886 epoch 029: 383 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=459181, ups=1.06, wpb=431156, bsz=16105.8, num_updates=47600, lr=0.000289886, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=46886 epoch 029: 383 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=459181, ups=1.06, wpb=431156, bsz=16105.8, num_updates=47600, lr=0.000289886, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=46886 epoch 029: 383 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=459181, ups=1.06, wpb=431156, bsz=16105.8, num_updates=47600, lr=0.000289886, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=46886 epoch 029: 383 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=459181, ups=1.06, wpb=431156, bsz=16105.8, num_updates=47600, lr=0.000289886, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=46886 epoch 029: 383 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=459181, ups=1.06, wpb=431156, bsz=16105.8, num_updates=47600, lr=0.000289886, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=46886 epoch 029: 383 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=459181, ups=1.06, wpb=431156, bsz=16105.8, num_updates=47600, lr=0.000289886, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=46886 epoch 029: 383 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=459181, ups=1.06, wpb=431156, bsz=16105.8, num_updates=47600, lr=0.000289886, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=46886 epoch 029: 383 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=459181, ups=1.06, wpb=431156, bsz=16105.8, num_updates=47600, lr=0.000289886, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=46886 epoch 029: 383 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=459181, ups=1.06, wpb=431156, bsz=16105.8, num_updates=47600, lr=0.000289886, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=46886 epoch 029: 383 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=459181, ups=1.06, wpb=431156, bsz=16105.8, num_updates=47600, lr=0.000289886, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=46886 epoch 029: 383 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=459181, ups=1.06, wpb=431156, bsz=16105.8, num_updates=47600, lr=0.000289886, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=46886 epoch 029: 383 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=459181, ups=1.06, wpb=431156, bsz=16105.8, num_updates=47600, lr=0.000289886, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=46886 epoch 029: 383 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=459181, ups=1.06, wpb=431156, bsz=16105.8, num_updates=47600, lr=0.000289886, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=46886 epoch 029: 383 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=459181, ups=1.06, wpb=431156, bsz=16105.8, num_updates=47600, lr=0.000289886, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=46886 epoch 029: 383 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=459181, ups=1.06, wpb=431156, bsz=16105.8, num_updates=47600, lr=0.000289886, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=46886 epoch 029: 383 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=459181, ups=1.06, wpb=431156, bsz=16105.8, num_updates=47600, lr=0.000289886, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=46886 epoch 029: 383 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=459181, ups=1.06, wpb=431156, bsz=16105.8, num_updates=47600, lr=0.000289886, gnorm=0.218, clip=0, loss_scale=1, train_wall=92, gb_free=18.7, wall=46886 epoch 029: 484 / 1689 loss=3.955, nll_loss=2.312, ppl=4.97, wps=451700, ups=1.04, wpb=433866, bsz=16814, num_updates=47700, lr=0.000289581, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=46983 epoch 029: 484 / 1689 loss=3.955, nll_loss=2.312, ppl=4.97, wps=451700, ups=1.04, wpb=433866, bsz=16814, num_updates=47700, lr=0.000289581, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=46983 epoch 029: 484 / 1689 loss=3.955, nll_loss=2.312, ppl=4.97, wps=451700, ups=1.04, wpb=433866, bsz=16814, num_updates=47700, lr=0.000289581, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=46983 epoch 029: 484 / 1689 loss=3.955, nll_loss=2.312, ppl=4.97, wps=451700, ups=1.04, wpb=433866, bsz=16814, num_updates=47700, lr=0.000289581, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=46983 epoch 029: 484 / 1689 loss=3.955, nll_loss=2.312, ppl=4.97, wps=451700, ups=1.04, wpb=433866, bsz=16814, num_updates=47700, lr=0.000289581, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=46983 epoch 029: 484 / 1689 loss=3.955, nll_loss=2.312, ppl=4.97, wps=451700, ups=1.04, wpb=433866, bsz=16814, num_updates=47700, lr=0.000289581, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=46983 epoch 029: 484 / 1689 loss=3.955, nll_loss=2.312, ppl=4.97, wps=451700, ups=1.04, wpb=433866, bsz=16814, num_updates=47700, lr=0.000289581, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=46983 epoch 029: 484 / 1689 loss=3.955, nll_loss=2.312, ppl=4.97, wps=451700, ups=1.04, wpb=433866, bsz=16814, num_updates=47700, lr=0.000289581, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=46983 epoch 029: 484 / 1689 loss=3.955, nll_loss=2.312, ppl=4.97, wps=451700, ups=1.04, wpb=433866, bsz=16814, num_updates=47700, lr=0.000289581, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=46983 epoch 029: 484 / 1689 loss=3.955, nll_loss=2.312, ppl=4.97, wps=451700, ups=1.04, wpb=433866, bsz=16814, num_updates=47700, lr=0.000289581, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=46983 epoch 029: 484 / 1689 loss=3.955, nll_loss=2.312, ppl=4.97, wps=451700, ups=1.04, wpb=433866, bsz=16814, num_updates=47700, lr=0.000289581, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=46983 epoch 029: 484 / 1689 loss=3.955, nll_loss=2.312, ppl=4.97, wps=451700, ups=1.04, wpb=433866, bsz=16814, num_updates=47700, lr=0.000289581, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=46983 epoch 029: 484 / 1689 loss=3.955, nll_loss=2.312, ppl=4.97, wps=451700, ups=1.04, wpb=433866, bsz=16814, num_updates=47700, lr=0.000289581, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=46983 epoch 029: 484 / 1689 loss=3.955, nll_loss=2.312, ppl=4.97, wps=451700, ups=1.04, wpb=433866, bsz=16814, num_updates=47700, lr=0.000289581, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=46983 epoch 029: 484 / 1689 loss=3.955, nll_loss=2.312, ppl=4.97, wps=451700, ups=1.04, wpb=433866, bsz=16814, num_updates=47700, lr=0.000289581, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=46983 epoch 029: 484 / 1689 loss=3.955, nll_loss=2.312, ppl=4.97, wps=451700, ups=1.04, wpb=433866, bsz=16814, num_updates=47700, lr=0.000289581, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=46983 epoch 029: 484 / 1689 loss=3.955, nll_loss=2.312, ppl=4.97, wps=451700, ups=1.04, wpb=433866, bsz=16814, num_updates=47700, lr=0.000289581, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=46983 epoch 029: 484 / 1689 loss=3.955, nll_loss=2.312, ppl=4.97, wps=451700, ups=1.04, wpb=433866, bsz=16814, num_updates=47700, lr=0.000289581, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=46983 epoch 029: 484 / 1689 loss=3.955, nll_loss=2.312, ppl=4.97, wps=451700, ups=1.04, wpb=433866, bsz=16814, num_updates=47700, lr=0.000289581, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=46983 epoch 029: 484 / 1689 loss=3.955, nll_loss=2.312, ppl=4.97, wps=451700, ups=1.04, wpb=433866, bsz=16814, num_updates=47700, lr=0.000289581, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=46983 epoch 029: 484 / 1689 loss=3.955, nll_loss=2.312, ppl=4.97, wps=451700, ups=1.04, wpb=433866, bsz=16814, num_updates=47700, lr=0.000289581, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=46983 epoch 029: 484 / 1689 loss=3.955, nll_loss=2.312, ppl=4.97, wps=451700, ups=1.04, wpb=433866, bsz=16814, num_updates=47700, lr=0.000289581, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=46983 epoch 029: 484 / 1689 loss=3.955, nll_loss=2.312, ppl=4.97, wps=451700, ups=1.04, wpb=433866, bsz=16814, num_updates=47700, lr=0.000289581, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=46983 epoch 029: 484 / 1689 loss=3.955, nll_loss=2.312, ppl=4.97, wps=451700, ups=1.04, wpb=433866, bsz=16814, num_updates=47700, lr=0.000289581, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=46983 epoch 029: 484 / 1689 loss=3.955, nll_loss=2.312, ppl=4.97, wps=451700, ups=1.04, wpb=433866, bsz=16814, num_updates=47700, lr=0.000289581, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=46983 epoch 029: 484 / 1689 loss=3.955, nll_loss=2.312, ppl=4.97, wps=451700, ups=1.04, wpb=433866, bsz=16814, num_updates=47700, lr=0.000289581, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=46983 epoch 029: 484 / 1689 loss=3.955, nll_loss=2.312, ppl=4.97, wps=451700, ups=1.04, wpb=433866, bsz=16814, num_updates=47700, lr=0.000289581, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=46983 epoch 029: 484 / 1689 loss=3.955, nll_loss=2.312, ppl=4.97, wps=451700, ups=1.04, wpb=433866, bsz=16814, num_updates=47700, lr=0.000289581, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=46983 epoch 029: 484 / 1689 loss=3.955, nll_loss=2.312, ppl=4.97, wps=451700, ups=1.04, wpb=433866, bsz=16814, num_updates=47700, lr=0.000289581, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.1, wall=46983 epoch 029: 584 / 1689 loss=3.959, nll_loss=2.316, ppl=4.98, wps=453520, ups=1.05, wpb=433363, bsz=16769.5, num_updates=47800, lr=0.000289278, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.1, wall=47078 epoch 029: 584 / 1689 loss=3.959, nll_loss=2.316, ppl=4.98, wps=453520, ups=1.05, wpb=433363, bsz=16769.5, num_updates=47800, lr=0.000289278, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.1, wall=47078 epoch 029: 584 / 1689 loss=3.959, nll_loss=2.316, ppl=4.98, wps=453520, ups=1.05, wpb=433363, bsz=16769.5, num_updates=47800, lr=0.000289278, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.1, wall=47078 epoch 029: 584 / 1689 loss=3.959, nll_loss=2.316, ppl=4.98, wps=453520, ups=1.05, wpb=433363, bsz=16769.5, num_updates=47800, lr=0.000289278, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.1, wall=47078 epoch 029: 584 / 1689 loss=3.959, nll_loss=2.316, ppl=4.98, wps=453520, ups=1.05, wpb=433363, bsz=16769.5, num_updates=47800, lr=0.000289278, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.1, wall=47078 epoch 029: 584 / 1689 loss=3.959, nll_loss=2.316, ppl=4.98, wps=453520, ups=1.05, wpb=433363, bsz=16769.5, num_updates=47800, lr=0.000289278, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.1, wall=47078 epoch 029: 584 / 1689 loss=3.959, nll_loss=2.316, ppl=4.98, wps=453520, ups=1.05, wpb=433363, bsz=16769.5, num_updates=47800, lr=0.000289278, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.1, wall=47078 epoch 029: 584 / 1689 loss=3.959, nll_loss=2.316, ppl=4.98, wps=453520, ups=1.05, wpb=433363, bsz=16769.5, num_updates=47800, lr=0.000289278, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.1, wall=47078 epoch 029: 584 / 1689 loss=3.959, nll_loss=2.316, ppl=4.98, wps=453520, ups=1.05, wpb=433363, bsz=16769.5, num_updates=47800, lr=0.000289278, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.1, wall=47078 epoch 029: 584 / 1689 loss=3.959, nll_loss=2.316, ppl=4.98, wps=453520, ups=1.05, wpb=433363, bsz=16769.5, num_updates=47800, lr=0.000289278, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.1, wall=47078 epoch 029: 584 / 1689 loss=3.959, nll_loss=2.316, ppl=4.98, wps=453520, ups=1.05, wpb=433363, bsz=16769.5, num_updates=47800, lr=0.000289278, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.1, wall=47078 epoch 029: 584 / 1689 loss=3.959, nll_loss=2.316, ppl=4.98, wps=453520, ups=1.05, wpb=433363, bsz=16769.5, num_updates=47800, lr=0.000289278, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.1, wall=47078 epoch 029: 584 / 1689 loss=3.959, nll_loss=2.316, ppl=4.98, wps=453520, ups=1.05, wpb=433363, bsz=16769.5, num_updates=47800, lr=0.000289278, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.1, wall=47078 epoch 029: 584 / 1689 loss=3.959, nll_loss=2.316, ppl=4.98, wps=453520, ups=1.05, wpb=433363, bsz=16769.5, num_updates=47800, lr=0.000289278, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.1, wall=47078 epoch 029: 584 / 1689 loss=3.959, nll_loss=2.316, ppl=4.98, wps=453520, ups=1.05, wpb=433363, bsz=16769.5, num_updates=47800, lr=0.000289278, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.1, wall=47078 epoch 029: 584 / 1689 loss=3.959, nll_loss=2.316, ppl=4.98, wps=453520, ups=1.05, wpb=433363, bsz=16769.5, num_updates=47800, lr=0.000289278, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.1, wall=47078 epoch 029: 584 / 1689 loss=3.959, nll_loss=2.316, ppl=4.98, wps=453520, ups=1.05, wpb=433363, bsz=16769.5, num_updates=47800, lr=0.000289278, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.1, wall=47078 epoch 029: 584 / 1689 loss=3.959, nll_loss=2.316, ppl=4.98, wps=453520, ups=1.05, wpb=433363, bsz=16769.5, num_updates=47800, lr=0.000289278, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.1, wall=47078 epoch 029: 584 / 1689 loss=3.959, nll_loss=2.316, ppl=4.98, wps=453520, ups=1.05, wpb=433363, bsz=16769.5, num_updates=47800, lr=0.000289278, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.1, wall=47078 epoch 029: 584 / 1689 loss=3.959, nll_loss=2.316, ppl=4.98, wps=453520, ups=1.05, wpb=433363, bsz=16769.5, num_updates=47800, lr=0.000289278, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.1, wall=47078 epoch 029: 584 / 1689 loss=3.959, nll_loss=2.316, ppl=4.98, wps=453520, ups=1.05, wpb=433363, bsz=16769.5, num_updates=47800, lr=0.000289278, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.1, wall=47078 epoch 029: 584 / 1689 loss=3.959, nll_loss=2.316, ppl=4.98, wps=453520, ups=1.05, wpb=433363, bsz=16769.5, num_updates=47800, lr=0.000289278, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.1, wall=47078 epoch 029: 584 / 1689 loss=3.959, nll_loss=2.316, ppl=4.98, wps=453520, ups=1.05, wpb=433363, bsz=16769.5, num_updates=47800, lr=0.000289278, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.1, wall=47078 epoch 029: 584 / 1689 loss=3.959, nll_loss=2.316, ppl=4.98, wps=453520, ups=1.05, wpb=433363, bsz=16769.5, num_updates=47800, lr=0.000289278, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.1, wall=47078 epoch 029: 584 / 1689 loss=3.959, nll_loss=2.316, ppl=4.98, wps=453520, ups=1.05, wpb=433363, bsz=16769.5, num_updates=47800, lr=0.000289278, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.1, wall=47078 epoch 029: 584 / 1689 loss=3.959, nll_loss=2.316, ppl=4.98, wps=453520, ups=1.05, wpb=433363, bsz=16769.5, num_updates=47800, lr=0.000289278, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.1, wall=47078 epoch 029: 584 / 1689 loss=3.959, nll_loss=2.316, ppl=4.98, wps=453520, ups=1.05, wpb=433363, bsz=16769.5, num_updates=47800, lr=0.000289278, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.1, wall=47078 epoch 029: 584 / 1689 loss=3.959, nll_loss=2.316, ppl=4.98, wps=453520, ups=1.05, wpb=433363, bsz=16769.5, num_updates=47800, lr=0.000289278, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.1, wall=47078 epoch 029: 584 / 1689 loss=3.959, nll_loss=2.316, ppl=4.98, wps=453520, ups=1.05, wpb=433363, bsz=16769.5, num_updates=47800, lr=0.000289278, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.1, wall=47078 epoch 029: 684 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=453575, ups=1.05, wpb=432823, bsz=16601.2, num_updates=47900, lr=0.000288976, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=47174 epoch 029: 684 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=453575, ups=1.05, wpb=432823, bsz=16601.2, num_updates=47900, lr=0.000288976, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=47174 epoch 029: 684 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=453575, ups=1.05, wpb=432823, bsz=16601.2, num_updates=47900, lr=0.000288976, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=47174 epoch 029: 684 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=453575, ups=1.05, wpb=432823, bsz=16601.2, num_updates=47900, lr=0.000288976, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=47174 epoch 029: 684 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=453575, ups=1.05, wpb=432823, bsz=16601.2, num_updates=47900, lr=0.000288976, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=47174 epoch 029: 684 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=453575, ups=1.05, wpb=432823, bsz=16601.2, num_updates=47900, lr=0.000288976, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=47174 epoch 029: 684 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=453575, ups=1.05, wpb=432823, bsz=16601.2, num_updates=47900, lr=0.000288976, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=47174 epoch 029: 684 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=453575, ups=1.05, wpb=432823, bsz=16601.2, num_updates=47900, lr=0.000288976, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=47174 epoch 029: 684 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=453575, ups=1.05, wpb=432823, bsz=16601.2, num_updates=47900, lr=0.000288976, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=47174 epoch 029: 684 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=453575, ups=1.05, wpb=432823, bsz=16601.2, num_updates=47900, lr=0.000288976, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=47174 epoch 029: 684 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=453575, ups=1.05, wpb=432823, bsz=16601.2, num_updates=47900, lr=0.000288976, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=47174 epoch 029: 684 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=453575, ups=1.05, wpb=432823, bsz=16601.2, num_updates=47900, lr=0.000288976, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=47174 epoch 029: 684 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=453575, ups=1.05, wpb=432823, bsz=16601.2, num_updates=47900, lr=0.000288976, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=47174 epoch 029: 684 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=453575, ups=1.05, wpb=432823, bsz=16601.2, num_updates=47900, lr=0.000288976, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=47174 epoch 029: 684 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=453575, ups=1.05, wpb=432823, bsz=16601.2, num_updates=47900, lr=0.000288976, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=47174 epoch 029: 684 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=453575, ups=1.05, wpb=432823, bsz=16601.2, num_updates=47900, lr=0.000288976, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=47174 epoch 029: 684 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=453575, ups=1.05, wpb=432823, bsz=16601.2, num_updates=47900, lr=0.000288976, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=47174 epoch 029: 684 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=453575, ups=1.05, wpb=432823, bsz=16601.2, num_updates=47900, lr=0.000288976, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=47174 epoch 029: 684 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=453575, ups=1.05, wpb=432823, bsz=16601.2, num_updates=47900, lr=0.000288976, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=47174 epoch 029: 684 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=453575, ups=1.05, wpb=432823, bsz=16601.2, num_updates=47900, lr=0.000288976, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=47174 epoch 029: 684 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=453575, ups=1.05, wpb=432823, bsz=16601.2, num_updates=47900, lr=0.000288976, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=47174 epoch 029: 684 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=453575, ups=1.05, wpb=432823, bsz=16601.2, num_updates=47900, lr=0.000288976, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=47174 epoch 029: 684 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=453575, ups=1.05, wpb=432823, bsz=16601.2, num_updates=47900, lr=0.000288976, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=47174 epoch 029: 684 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=453575, ups=1.05, wpb=432823, bsz=16601.2, num_updates=47900, lr=0.000288976, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=47174 epoch 029: 684 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=453575, ups=1.05, wpb=432823, bsz=16601.2, num_updates=47900, lr=0.000288976, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=47174 epoch 029: 684 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=453575, ups=1.05, wpb=432823, bsz=16601.2, num_updates=47900, lr=0.000288976, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=47174 epoch 029: 684 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=453575, ups=1.05, wpb=432823, bsz=16601.2, num_updates=47900, lr=0.000288976, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=47174 epoch 029: 684 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=453575, ups=1.05, wpb=432823, bsz=16601.2, num_updates=47900, lr=0.000288976, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=47174 epoch 029: 684 / 1689 loss=3.976, nll_loss=2.336, ppl=5.05, wps=453575, ups=1.05, wpb=432823, bsz=16601.2, num_updates=47900, lr=0.000288976, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=47174 epoch 029: 784 / 1689 loss=3.965, nll_loss=2.323, ppl=5, wps=458273, ups=1.06, wpb=433818, bsz=16356.4, num_updates=48000, lr=0.000288675, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.8, wall=47268 epoch 029: 784 / 1689 loss=3.965, nll_loss=2.323, ppl=5, wps=458273, ups=1.06, wpb=433818, bsz=16356.4, num_updates=48000, lr=0.000288675, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.8, wall=47268 epoch 029: 784 / 1689 loss=3.965, nll_loss=2.323, ppl=5, wps=458273, ups=1.06, wpb=433818, bsz=16356.4, num_updates=48000, lr=0.000288675, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.8, wall=47268 epoch 029: 784 / 1689 loss=3.965, nll_loss=2.323, ppl=5, wps=458273, ups=1.06, wpb=433818, bsz=16356.4, num_updates=48000, lr=0.000288675, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.8, wall=47268 epoch 029: 784 / 1689 loss=3.965, nll_loss=2.323, ppl=5, wps=458273, ups=1.06, wpb=433818, bsz=16356.4, num_updates=48000, lr=0.000288675, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.8, wall=47268 epoch 029: 784 / 1689 loss=3.965, nll_loss=2.323, ppl=5, wps=458273, ups=1.06, wpb=433818, bsz=16356.4, num_updates=48000, lr=0.000288675, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.8, wall=47268 epoch 029: 784 / 1689 loss=3.965, nll_loss=2.323, ppl=5, wps=458273, ups=1.06, wpb=433818, bsz=16356.4, num_updates=48000, lr=0.000288675, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.8, wall=47268 epoch 029: 784 / 1689 loss=3.965, nll_loss=2.323, ppl=5, wps=458273, ups=1.06, wpb=433818, bsz=16356.4, num_updates=48000, lr=0.000288675, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.8, wall=47268 epoch 029: 784 / 1689 loss=3.965, nll_loss=2.323, ppl=5, wps=458273, ups=1.06, wpb=433818, bsz=16356.4, num_updates=48000, lr=0.000288675, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.8, wall=47268 epoch 029: 784 / 1689 loss=3.965, nll_loss=2.323, ppl=5, wps=458273, ups=1.06, wpb=433818, bsz=16356.4, num_updates=48000, lr=0.000288675, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.8, wall=47268 epoch 029: 784 / 1689 loss=3.965, nll_loss=2.323, ppl=5, wps=458273, ups=1.06, wpb=433818, bsz=16356.4, num_updates=48000, lr=0.000288675, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.8, wall=47268 epoch 029: 784 / 1689 loss=3.965, nll_loss=2.323, ppl=5, wps=458273, ups=1.06, wpb=433818, bsz=16356.4, num_updates=48000, lr=0.000288675, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.8, wall=47268 epoch 029: 784 / 1689 loss=3.965, nll_loss=2.323, ppl=5, wps=458273, ups=1.06, wpb=433818, bsz=16356.4, num_updates=48000, lr=0.000288675, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.8, wall=47268 epoch 029: 784 / 1689 loss=3.965, nll_loss=2.323, ppl=5, wps=458273, ups=1.06, wpb=433818, bsz=16356.4, num_updates=48000, lr=0.000288675, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.8, wall=47268 epoch 029: 784 / 1689 loss=3.965, nll_loss=2.323, ppl=5, wps=458273, ups=1.06, wpb=433818, bsz=16356.4, num_updates=48000, lr=0.000288675, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.8, wall=47268 epoch 029: 784 / 1689 loss=3.965, nll_loss=2.323, ppl=5, wps=458273, ups=1.06, wpb=433818, bsz=16356.4, num_updates=48000, lr=0.000288675, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.8, wall=47268 epoch 029: 784 / 1689 loss=3.965, nll_loss=2.323, ppl=5, wps=458273, ups=1.06, wpb=433818, bsz=16356.4, num_updates=48000, lr=0.000288675, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.8, wall=47268 epoch 029: 784 / 1689 loss=3.965, nll_loss=2.323, ppl=5, wps=458273, ups=1.06, wpb=433818, bsz=16356.4, num_updates=48000, lr=0.000288675, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.8, wall=47268 epoch 029: 784 / 1689 loss=3.965, nll_loss=2.323, ppl=5, wps=458273, ups=1.06, wpb=433818, bsz=16356.4, num_updates=48000, lr=0.000288675, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.8, wall=47268 epoch 029: 784 / 1689 loss=3.965, nll_loss=2.323, ppl=5, wps=458273, ups=1.06, wpb=433818, bsz=16356.4, num_updates=48000, lr=0.000288675, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.8, wall=47268 epoch 029: 784 / 1689 loss=3.965, nll_loss=2.323, ppl=5, wps=458273, ups=1.06, wpb=433818, bsz=16356.4, num_updates=48000, lr=0.000288675, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.8, wall=47268 epoch 029: 784 / 1689 loss=3.965, nll_loss=2.323, ppl=5, wps=458273, ups=1.06, wpb=433818, bsz=16356.4, num_updates=48000, lr=0.000288675, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.8, wall=47268 epoch 029: 784 / 1689 loss=3.965, nll_loss=2.323, ppl=5, wps=458273, ups=1.06, wpb=433818, bsz=16356.4, num_updates=48000, lr=0.000288675, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.8, wall=47268 epoch 029: 784 / 1689 loss=3.965, nll_loss=2.323, ppl=5, wps=458273, ups=1.06, wpb=433818, bsz=16356.4, num_updates=48000, lr=0.000288675, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.8, wall=47268 epoch 029: 784 / 1689 loss=3.965, nll_loss=2.323, ppl=5, wps=458273, ups=1.06, wpb=433818, bsz=16356.4, num_updates=48000, lr=0.000288675, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.8, wall=47268 epoch 029: 784 / 1689 loss=3.965, nll_loss=2.323, ppl=5, wps=458273, ups=1.06, wpb=433818, bsz=16356.4, num_updates=48000, lr=0.000288675, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.8, wall=47268 epoch 029: 784 / 1689 loss=3.965, nll_loss=2.323, ppl=5, wps=458273, ups=1.06, wpb=433818, bsz=16356.4, num_updates=48000, lr=0.000288675, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.8, wall=47268 epoch 029: 784 / 1689 loss=3.965, nll_loss=2.323, ppl=5, wps=458273, ups=1.06, wpb=433818, bsz=16356.4, num_updates=48000, lr=0.000288675, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.8, wall=47268 epoch 029: 784 / 1689 loss=3.965, nll_loss=2.323, ppl=5, wps=458273, ups=1.06, wpb=433818, bsz=16356.4, num_updates=48000, lr=0.000288675, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.8, wall=47268 begin validation on "valid" subset epoch 029 | valid on 'valid' subset | loss 4.246 | nll_loss 2.626 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.238 epoch 029 | valid on 'valid' subset | loss 4.246 | nll_loss 2.626 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.238 epoch 029 | valid on 'valid' subset | loss 4.246 | nll_loss 2.626 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.238 epoch 029 | valid on 'valid' subset | loss 4.246 | nll_loss 2.626 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.238 epoch 029 | valid on 'valid' subset | loss 4.246 | nll_loss 2.626 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.238 epoch 029 | valid on 'valid' subset | loss 4.246 | nll_loss 2.626 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.238 epoch 029 | valid on 'valid' subset | loss 4.246 | nll_loss 2.626 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.238 epoch 029 | valid on 'valid' subset | loss 4.246 | nll_loss 2.626 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.238 epoch 029 | valid on 'valid' subset | loss 4.246 | nll_loss 2.626 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.238 epoch 029 | valid on 'valid' subset | loss 4.246 | nll_loss 2.626 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.238 epoch 029 | valid on 'valid' subset | loss 4.246 | nll_loss 2.626 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.238 epoch 029 | valid on 'valid' subset | loss 4.246 | nll_loss 2.626 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.238 epoch 029 | valid on 'valid' subset | loss 4.246 | nll_loss 2.626 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.238 epoch 029 | valid on 'valid' subset | loss 4.246 | nll_loss 2.626 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.238 epoch 029 | valid on 'valid' subset | loss 4.246 | nll_loss 2.626 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.238 epoch 029 | valid on 'valid' subset | loss 4.246 | nll_loss 2.626 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.238 epoch 029 | valid on 'valid' subset | loss 4.246 | nll_loss 2.626 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.238 epoch 029 | valid on 'valid' subset | loss 4.246 | nll_loss 2.626 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.238 epoch 029 | valid on 'valid' subset | loss 4.246 | nll_loss 2.626 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.238 epoch 029 | valid on 'valid' subset | loss 4.246 | nll_loss 2.626 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.238 epoch 029 | valid on 'valid' subset | loss 4.246 | nll_loss 2.626 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.238 epoch 029 | valid on 'valid' subset | loss 4.246 | nll_loss 2.626 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.238 epoch 029 | valid on 'valid' subset | loss 4.246 | nll_loss 2.626 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.238 epoch 029 | valid on 'valid' subset | loss 4.246 | nll_loss 2.626 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.238 epoch 029 | valid on 'valid' subset | loss 4.246 | nll_loss 2.626 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.238 epoch 029 | valid on 'valid' subset | loss 4.246 | nll_loss 2.626 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.238 epoch 029 | valid on 'valid' subset | loss 4.246 | nll_loss 2.626 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.238 epoch 029 | valid on 'valid' subset | loss 4.246 | nll_loss 2.626 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.238 epoch 029 | valid on 'valid' subset | loss 4.246 | nll_loss 2.626 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 48000 | best_loss 4.238 epoch 029: 885 / 1689 loss=3.967, nll_loss=2.327, ppl=5.02, wps=353496, ups=0.82, wpb=433173, bsz=16511.6, num_updates=48100, lr=0.000288375, gnorm=0.209, clip=0, loss_scale=0.25, train_wall=98, gb_free=18.8, wall=47391 epoch 029: 885 / 1689 loss=3.967, nll_loss=2.327, ppl=5.02, wps=353496, ups=0.82, wpb=433173, bsz=16511.6, num_updates=48100, lr=0.000288375, gnorm=0.209, clip=0, loss_scale=0.25, train_wall=98, gb_free=18.8, wall=47391 epoch 029: 885 / 1689 loss=3.967, nll_loss=2.327, ppl=5.02, wps=353496, ups=0.82, wpb=433173, bsz=16511.6, num_updates=48100, lr=0.000288375, gnorm=0.209, clip=0, loss_scale=0.25, train_wall=98, gb_free=18.8, wall=47391 epoch 029: 885 / 1689 loss=3.967, nll_loss=2.327, ppl=5.02, wps=353496, ups=0.82, wpb=433173, bsz=16511.6, num_updates=48100, lr=0.000288375, gnorm=0.209, clip=0, loss_scale=0.25, train_wall=98, gb_free=18.8, wall=47391 epoch 029: 885 / 1689 loss=3.967, nll_loss=2.327, ppl=5.02, wps=353496, ups=0.82, wpb=433173, bsz=16511.6, num_updates=48100, lr=0.000288375, gnorm=0.209, clip=0, loss_scale=0.25, train_wall=98, gb_free=18.8, wall=47391 epoch 029: 885 / 1689 loss=3.967, nll_loss=2.327, ppl=5.02, wps=353496, ups=0.82, wpb=433173, bsz=16511.6, num_updates=48100, lr=0.000288375, gnorm=0.209, clip=0, loss_scale=0.25, train_wall=98, gb_free=18.8, wall=47391 epoch 029: 885 / 1689 loss=3.967, nll_loss=2.327, ppl=5.02, wps=353496, ups=0.82, wpb=433173, bsz=16511.6, num_updates=48100, lr=0.000288375, gnorm=0.209, clip=0, loss_scale=0.25, train_wall=98, gb_free=18.8, wall=47391 epoch 029: 885 / 1689 loss=3.967, nll_loss=2.327, ppl=5.02, wps=353496, ups=0.82, wpb=433173, bsz=16511.6, num_updates=48100, lr=0.000288375, gnorm=0.209, clip=0, loss_scale=0.25, train_wall=98, gb_free=18.8, wall=47391 epoch 029: 885 / 1689 loss=3.967, nll_loss=2.327, ppl=5.02, wps=353496, ups=0.82, wpb=433173, bsz=16511.6, num_updates=48100, lr=0.000288375, gnorm=0.209, clip=0, loss_scale=0.25, train_wall=98, gb_free=18.8, wall=47391 epoch 029: 885 / 1689 loss=3.967, nll_loss=2.327, ppl=5.02, wps=353496, ups=0.82, wpb=433173, bsz=16511.6, num_updates=48100, lr=0.000288375, gnorm=0.209, clip=0, loss_scale=0.25, train_wall=98, gb_free=18.8, wall=47391 epoch 029: 885 / 1689 loss=3.967, nll_loss=2.327, ppl=5.02, wps=353496, ups=0.82, wpb=433173, bsz=16511.6, num_updates=48100, lr=0.000288375, gnorm=0.209, clip=0, loss_scale=0.25, train_wall=98, gb_free=18.8, wall=47391 epoch 029: 885 / 1689 loss=3.967, nll_loss=2.327, ppl=5.02, wps=353496, ups=0.82, wpb=433173, bsz=16511.6, num_updates=48100, lr=0.000288375, gnorm=0.209, clip=0, loss_scale=0.25, train_wall=98, gb_free=18.8, wall=47391 epoch 029: 885 / 1689 loss=3.967, nll_loss=2.327, ppl=5.02, wps=353496, ups=0.82, wpb=433173, bsz=16511.6, num_updates=48100, lr=0.000288375, gnorm=0.209, clip=0, loss_scale=0.25, train_wall=98, gb_free=18.8, wall=47391 epoch 029: 885 / 1689 loss=3.967, nll_loss=2.327, ppl=5.02, wps=353496, ups=0.82, wpb=433173, bsz=16511.6, num_updates=48100, lr=0.000288375, gnorm=0.209, clip=0, loss_scale=0.25, train_wall=98, gb_free=18.8, wall=47391 epoch 029: 885 / 1689 loss=3.967, nll_loss=2.327, ppl=5.02, wps=353496, ups=0.82, wpb=433173, bsz=16511.6, num_updates=48100, lr=0.000288375, gnorm=0.209, clip=0, loss_scale=0.25, train_wall=98, gb_free=18.8, wall=47391 epoch 029: 885 / 1689 loss=3.967, nll_loss=2.327, ppl=5.02, wps=353496, ups=0.82, wpb=433173, bsz=16511.6, num_updates=48100, lr=0.000288375, gnorm=0.209, clip=0, loss_scale=0.25, train_wall=98, gb_free=18.8, wall=47391 epoch 029: 885 / 1689 loss=3.967, nll_loss=2.327, ppl=5.02, wps=353496, ups=0.82, wpb=433173, bsz=16511.6, num_updates=48100, lr=0.000288375, gnorm=0.209, clip=0, loss_scale=0.25, train_wall=98, gb_free=18.8, wall=47391 epoch 029: 885 / 1689 loss=3.967, nll_loss=2.327, ppl=5.02, wps=353496, ups=0.82, wpb=433173, bsz=16511.6, num_updates=48100, lr=0.000288375, gnorm=0.209, clip=0, loss_scale=0.25, train_wall=98, gb_free=18.8, wall=47391 epoch 029: 885 / 1689 loss=3.967, nll_loss=2.327, ppl=5.02, wps=353496, ups=0.82, wpb=433173, bsz=16511.6, num_updates=48100, lr=0.000288375, gnorm=0.209, clip=0, loss_scale=0.25, train_wall=98, gb_free=18.8, wall=47391 epoch 029: 885 / 1689 loss=3.967, nll_loss=2.327, ppl=5.02, wps=353496, ups=0.82, wpb=433173, bsz=16511.6, num_updates=48100, lr=0.000288375, gnorm=0.209, clip=0, loss_scale=0.25, train_wall=98, gb_free=18.8, wall=47391 epoch 029: 885 / 1689 loss=3.967, nll_loss=2.327, ppl=5.02, wps=353496, ups=0.82, wpb=433173, bsz=16511.6, num_updates=48100, lr=0.000288375, gnorm=0.209, clip=0, loss_scale=0.25, train_wall=98, gb_free=18.8, wall=47391 epoch 029: 885 / 1689 loss=3.967, nll_loss=2.327, ppl=5.02, wps=353496, ups=0.82, wpb=433173, bsz=16511.6, num_updates=48100, lr=0.000288375, gnorm=0.209, clip=0, loss_scale=0.25, train_wall=98, gb_free=18.8, wall=47391 epoch 029: 885 / 1689 loss=3.967, nll_loss=2.327, ppl=5.02, wps=353496, ups=0.82, wpb=433173, bsz=16511.6, num_updates=48100, lr=0.000288375, gnorm=0.209, clip=0, loss_scale=0.25, train_wall=98, gb_free=18.8, wall=47391 epoch 029: 885 / 1689 loss=3.967, nll_loss=2.327, ppl=5.02, wps=353496, ups=0.82, wpb=433173, bsz=16511.6, num_updates=48100, lr=0.000288375, gnorm=0.209, clip=0, loss_scale=0.25, train_wall=98, gb_free=18.8, wall=47391 epoch 029: 885 / 1689 loss=3.967, nll_loss=2.327, ppl=5.02, wps=353496, ups=0.82, wpb=433173, bsz=16511.6, num_updates=48100, lr=0.000288375, gnorm=0.209, clip=0, loss_scale=0.25, train_wall=98, gb_free=18.8, wall=47391 epoch 029: 885 / 1689 loss=3.967, nll_loss=2.327, ppl=5.02, wps=353496, ups=0.82, wpb=433173, bsz=16511.6, num_updates=48100, lr=0.000288375, gnorm=0.209, clip=0, loss_scale=0.25, train_wall=98, gb_free=18.8, wall=47391 epoch 029: 885 / 1689 loss=3.967, nll_loss=2.327, ppl=5.02, wps=353496, ups=0.82, wpb=433173, bsz=16511.6, num_updates=48100, lr=0.000288375, gnorm=0.209, clip=0, loss_scale=0.25, train_wall=98, gb_free=18.8, wall=47391 epoch 029: 885 / 1689 loss=3.967, nll_loss=2.327, ppl=5.02, wps=353496, ups=0.82, wpb=433173, bsz=16511.6, num_updates=48100, lr=0.000288375, gnorm=0.209, clip=0, loss_scale=0.25, train_wall=98, gb_free=18.8, wall=47391 epoch 029: 885 / 1689 loss=3.967, nll_loss=2.327, ppl=5.02, wps=353496, ups=0.82, wpb=433173, bsz=16511.6, num_updates=48100, lr=0.000288375, gnorm=0.209, clip=0, loss_scale=0.25, train_wall=98, gb_free=18.8, wall=47391 epoch 029: 985 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=462360, ups=1.07, wpb=431444, bsz=16602.8, num_updates=48200, lr=0.000288076, gnorm=0.21, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.2, wall=47484 epoch 029: 985 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=462360, ups=1.07, wpb=431444, bsz=16602.8, num_updates=48200, lr=0.000288076, gnorm=0.21, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.2, wall=47484 epoch 029: 985 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=462360, ups=1.07, wpb=431444, bsz=16602.8, num_updates=48200, lr=0.000288076, gnorm=0.21, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.2, wall=47484 epoch 029: 985 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=462360, ups=1.07, wpb=431444, bsz=16602.8, num_updates=48200, lr=0.000288076, gnorm=0.21, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.2, wall=47484 epoch 029: 985 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=462360, ups=1.07, wpb=431444, bsz=16602.8, num_updates=48200, lr=0.000288076, gnorm=0.21, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.2, wall=47484 epoch 029: 985 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=462360, ups=1.07, wpb=431444, bsz=16602.8, num_updates=48200, lr=0.000288076, gnorm=0.21, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.2, wall=47484 epoch 029: 985 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=462360, ups=1.07, wpb=431444, bsz=16602.8, num_updates=48200, lr=0.000288076, gnorm=0.21, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.2, wall=47484 epoch 029: 985 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=462360, ups=1.07, wpb=431444, bsz=16602.8, num_updates=48200, lr=0.000288076, gnorm=0.21, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.2, wall=47484 epoch 029: 985 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=462360, ups=1.07, wpb=431444, bsz=16602.8, num_updates=48200, lr=0.000288076, gnorm=0.21, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.2, wall=47484 epoch 029: 985 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=462360, ups=1.07, wpb=431444, bsz=16602.8, num_updates=48200, lr=0.000288076, gnorm=0.21, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.2, wall=47484 epoch 029: 985 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=462360, ups=1.07, wpb=431444, bsz=16602.8, num_updates=48200, lr=0.000288076, gnorm=0.21, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.2, wall=47484 epoch 029: 985 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=462360, ups=1.07, wpb=431444, bsz=16602.8, num_updates=48200, lr=0.000288076, gnorm=0.21, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.2, wall=47484 epoch 029: 985 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=462360, ups=1.07, wpb=431444, bsz=16602.8, num_updates=48200, lr=0.000288076, gnorm=0.21, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.2, wall=47484 epoch 029: 985 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=462360, ups=1.07, wpb=431444, bsz=16602.8, num_updates=48200, lr=0.000288076, gnorm=0.21, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.2, wall=47484 epoch 029: 985 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=462360, ups=1.07, wpb=431444, bsz=16602.8, num_updates=48200, lr=0.000288076, gnorm=0.21, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.2, wall=47484 epoch 029: 985 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=462360, ups=1.07, wpb=431444, bsz=16602.8, num_updates=48200, lr=0.000288076, gnorm=0.21, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.2, wall=47484 epoch 029: 985 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=462360, ups=1.07, wpb=431444, bsz=16602.8, num_updates=48200, lr=0.000288076, gnorm=0.21, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.2, wall=47484 epoch 029: 985 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=462360, ups=1.07, wpb=431444, bsz=16602.8, num_updates=48200, lr=0.000288076, gnorm=0.21, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.2, wall=47484 epoch 029: 985 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=462360, ups=1.07, wpb=431444, bsz=16602.8, num_updates=48200, lr=0.000288076, gnorm=0.21, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.2, wall=47484 epoch 029: 985 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=462360, ups=1.07, wpb=431444, bsz=16602.8, num_updates=48200, lr=0.000288076, gnorm=0.21, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.2, wall=47484 epoch 029: 985 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=462360, ups=1.07, wpb=431444, bsz=16602.8, num_updates=48200, lr=0.000288076, gnorm=0.21, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.2, wall=47484 epoch 029: 985 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=462360, ups=1.07, wpb=431444, bsz=16602.8, num_updates=48200, lr=0.000288076, gnorm=0.21, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.2, wall=47484 epoch 029: 985 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=462360, ups=1.07, wpb=431444, bsz=16602.8, num_updates=48200, lr=0.000288076, gnorm=0.21, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.2, wall=47484 epoch 029: 985 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=462360, ups=1.07, wpb=431444, bsz=16602.8, num_updates=48200, lr=0.000288076, gnorm=0.21, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.2, wall=47484 epoch 029: 985 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=462360, ups=1.07, wpb=431444, bsz=16602.8, num_updates=48200, lr=0.000288076, gnorm=0.21, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.2, wall=47484 epoch 029: 985 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=462360, ups=1.07, wpb=431444, bsz=16602.8, num_updates=48200, lr=0.000288076, gnorm=0.21, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.2, wall=47484 epoch 029: 985 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=462360, ups=1.07, wpb=431444, bsz=16602.8, num_updates=48200, lr=0.000288076, gnorm=0.21, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.2, wall=47484 epoch 029: 985 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=462360, ups=1.07, wpb=431444, bsz=16602.8, num_updates=48200, lr=0.000288076, gnorm=0.21, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.2, wall=47484 epoch 029: 985 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=462360, ups=1.07, wpb=431444, bsz=16602.8, num_updates=48200, lr=0.000288076, gnorm=0.21, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.2, wall=47484 epoch 029: 1085 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=465146, ups=1.07, wpb=434368, bsz=16397.3, num_updates=48300, lr=0.000287777, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=92, gb_free=18.7, wall=47577 epoch 029: 1085 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=465146, ups=1.07, wpb=434368, bsz=16397.3, num_updates=48300, lr=0.000287777, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=92, gb_free=18.7, wall=47577 epoch 029: 1085 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=465146, ups=1.07, wpb=434368, bsz=16397.3, num_updates=48300, lr=0.000287777, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=92, gb_free=18.7, wall=47577 epoch 029: 1085 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=465146, ups=1.07, wpb=434368, bsz=16397.3, num_updates=48300, lr=0.000287777, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=92, gb_free=18.7, wall=47577 epoch 029: 1085 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=465146, ups=1.07, wpb=434368, bsz=16397.3, num_updates=48300, lr=0.000287777, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=92, gb_free=18.7, wall=47577 epoch 029: 1085 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=465146, ups=1.07, wpb=434368, bsz=16397.3, num_updates=48300, lr=0.000287777, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=92, gb_free=18.7, wall=47577 epoch 029: 1085 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=465146, ups=1.07, wpb=434368, bsz=16397.3, num_updates=48300, lr=0.000287777, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=92, gb_free=18.7, wall=47577 epoch 029: 1085 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=465146, ups=1.07, wpb=434368, bsz=16397.3, num_updates=48300, lr=0.000287777, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=92, gb_free=18.7, wall=47577 epoch 029: 1085 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=465146, ups=1.07, wpb=434368, bsz=16397.3, num_updates=48300, lr=0.000287777, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=92, gb_free=18.7, wall=47577 epoch 029: 1085 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=465146, ups=1.07, wpb=434368, bsz=16397.3, num_updates=48300, lr=0.000287777, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=92, gb_free=18.7, wall=47577 epoch 029: 1085 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=465146, ups=1.07, wpb=434368, bsz=16397.3, num_updates=48300, lr=0.000287777, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=92, gb_free=18.7, wall=47577 epoch 029: 1085 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=465146, ups=1.07, wpb=434368, bsz=16397.3, num_updates=48300, lr=0.000287777, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=92, gb_free=18.7, wall=47577 epoch 029: 1085 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=465146, ups=1.07, wpb=434368, bsz=16397.3, num_updates=48300, lr=0.000287777, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=92, gb_free=18.7, wall=47577 epoch 029: 1085 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=465146, ups=1.07, wpb=434368, bsz=16397.3, num_updates=48300, lr=0.000287777, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=92, gb_free=18.7, wall=47577 epoch 029: 1085 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=465146, ups=1.07, wpb=434368, bsz=16397.3, num_updates=48300, lr=0.000287777, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=92, gb_free=18.7, wall=47577 epoch 029: 1085 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=465146, ups=1.07, wpb=434368, bsz=16397.3, num_updates=48300, lr=0.000287777, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=92, gb_free=18.7, wall=47577 epoch 029: 1085 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=465146, ups=1.07, wpb=434368, bsz=16397.3, num_updates=48300, lr=0.000287777, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=92, gb_free=18.7, wall=47577 epoch 029: 1085 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=465146, ups=1.07, wpb=434368, bsz=16397.3, num_updates=48300, lr=0.000287777, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=92, gb_free=18.7, wall=47577 epoch 029: 1085 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=465146, ups=1.07, wpb=434368, bsz=16397.3, num_updates=48300, lr=0.000287777, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=92, gb_free=18.7, wall=47577 epoch 029: 1085 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=465146, ups=1.07, wpb=434368, bsz=16397.3, num_updates=48300, lr=0.000287777, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=92, gb_free=18.7, wall=47577 epoch 029: 1085 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=465146, ups=1.07, wpb=434368, bsz=16397.3, num_updates=48300, lr=0.000287777, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=92, gb_free=18.7, wall=47577 epoch 029: 1085 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=465146, ups=1.07, wpb=434368, bsz=16397.3, num_updates=48300, lr=0.000287777, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=92, gb_free=18.7, wall=47577 epoch 029: 1085 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=465146, ups=1.07, wpb=434368, bsz=16397.3, num_updates=48300, lr=0.000287777, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=92, gb_free=18.7, wall=47577 epoch 029: 1085 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=465146, ups=1.07, wpb=434368, bsz=16397.3, num_updates=48300, lr=0.000287777, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=92, gb_free=18.7, wall=47577 epoch 029: 1085 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=465146, ups=1.07, wpb=434368, bsz=16397.3, num_updates=48300, lr=0.000287777, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=92, gb_free=18.7, wall=47577 epoch 029: 1085 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=465146, ups=1.07, wpb=434368, bsz=16397.3, num_updates=48300, lr=0.000287777, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=92, gb_free=18.7, wall=47577 epoch 029: 1085 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=465146, ups=1.07, wpb=434368, bsz=16397.3, num_updates=48300, lr=0.000287777, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=92, gb_free=18.7, wall=47577 epoch 029: 1085 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=465146, ups=1.07, wpb=434368, bsz=16397.3, num_updates=48300, lr=0.000287777, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=92, gb_free=18.7, wall=47577 epoch 029: 1085 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=465146, ups=1.07, wpb=434368, bsz=16397.3, num_updates=48300, lr=0.000287777, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=92, gb_free=18.7, wall=47577 epoch 029: 1185 / 1689 loss=3.98, nll_loss=2.341, ppl=5.07, wps=464235, ups=1.07, wpb=434711, bsz=16751.8, num_updates=48400, lr=0.00028748, gnorm=0.204, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.3, wall=47671 epoch 029: 1185 / 1689 loss=3.98, nll_loss=2.341, ppl=5.07, wps=464235, ups=1.07, wpb=434711, bsz=16751.8, num_updates=48400, lr=0.00028748, gnorm=0.204, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.3, wall=47671 epoch 029: 1185 / 1689 loss=3.98, nll_loss=2.341, ppl=5.07, wps=464235, ups=1.07, wpb=434711, bsz=16751.8, num_updates=48400, lr=0.00028748, gnorm=0.204, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.3, wall=47671 epoch 029: 1185 / 1689 loss=3.98, nll_loss=2.341, ppl=5.07, wps=464235, ups=1.07, wpb=434711, bsz=16751.8, num_updates=48400, lr=0.00028748, gnorm=0.204, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.3, wall=47671 epoch 029: 1185 / 1689 loss=3.98, nll_loss=2.341, ppl=5.07, wps=464235, ups=1.07, wpb=434711, bsz=16751.8, num_updates=48400, lr=0.00028748, gnorm=0.204, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.3, wall=47671 epoch 029: 1185 / 1689 loss=3.98, nll_loss=2.341, ppl=5.07, wps=464235, ups=1.07, wpb=434711, bsz=16751.8, num_updates=48400, lr=0.00028748, gnorm=0.204, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.3, wall=47671 epoch 029: 1185 / 1689 loss=3.98, nll_loss=2.341, ppl=5.07, wps=464235, ups=1.07, wpb=434711, bsz=16751.8, num_updates=48400, lr=0.00028748, gnorm=0.204, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.3, wall=47671 epoch 029: 1185 / 1689 loss=3.98, nll_loss=2.341, ppl=5.07, wps=464235, ups=1.07, wpb=434711, bsz=16751.8, num_updates=48400, lr=0.00028748, gnorm=0.204, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.3, wall=47671 epoch 029: 1185 / 1689 loss=3.98, nll_loss=2.341, ppl=5.07, wps=464235, ups=1.07, wpb=434711, bsz=16751.8, num_updates=48400, lr=0.00028748, gnorm=0.204, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.3, wall=47671 epoch 029: 1185 / 1689 loss=3.98, nll_loss=2.341, ppl=5.07, wps=464235, ups=1.07, wpb=434711, bsz=16751.8, num_updates=48400, lr=0.00028748, gnorm=0.204, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.3, wall=47671 epoch 029: 1185 / 1689 loss=3.98, nll_loss=2.341, ppl=5.07, wps=464235, ups=1.07, wpb=434711, bsz=16751.8, num_updates=48400, lr=0.00028748, gnorm=0.204, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.3, wall=47671 epoch 029: 1185 / 1689 loss=3.98, nll_loss=2.341, ppl=5.07, wps=464235, ups=1.07, wpb=434711, bsz=16751.8, num_updates=48400, lr=0.00028748, gnorm=0.204, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.3, wall=47671 epoch 029: 1185 / 1689 loss=3.98, nll_loss=2.341, ppl=5.07, wps=464235, ups=1.07, wpb=434711, bsz=16751.8, num_updates=48400, lr=0.00028748, gnorm=0.204, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.3, wall=47671 epoch 029: 1185 / 1689 loss=3.98, nll_loss=2.341, ppl=5.07, wps=464235, ups=1.07, wpb=434711, bsz=16751.8, num_updates=48400, lr=0.00028748, gnorm=0.204, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.3, wall=47671 epoch 029: 1185 / 1689 loss=3.98, nll_loss=2.341, ppl=5.07, wps=464235, ups=1.07, wpb=434711, bsz=16751.8, num_updates=48400, lr=0.00028748, gnorm=0.204, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.3, wall=47671 epoch 029: 1185 / 1689 loss=3.98, nll_loss=2.341, ppl=5.07, wps=464235, ups=1.07, wpb=434711, bsz=16751.8, num_updates=48400, lr=0.00028748, gnorm=0.204, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.3, wall=47671 epoch 029: 1185 / 1689 loss=3.98, nll_loss=2.341, ppl=5.07, wps=464235, ups=1.07, wpb=434711, bsz=16751.8, num_updates=48400, lr=0.00028748, gnorm=0.204, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.3, wall=47671 epoch 029: 1185 / 1689 loss=3.98, nll_loss=2.341, ppl=5.07, wps=464235, ups=1.07, wpb=434711, bsz=16751.8, num_updates=48400, lr=0.00028748, gnorm=0.204, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.3, wall=47671 epoch 029: 1185 / 1689 loss=3.98, nll_loss=2.341, ppl=5.07, wps=464235, ups=1.07, wpb=434711, bsz=16751.8, num_updates=48400, lr=0.00028748, gnorm=0.204, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.3, wall=47671 epoch 029: 1185 / 1689 loss=3.98, nll_loss=2.341, ppl=5.07, wps=464235, ups=1.07, wpb=434711, bsz=16751.8, num_updates=48400, lr=0.00028748, gnorm=0.204, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.3, wall=47671 epoch 029: 1185 / 1689 loss=3.98, nll_loss=2.341, ppl=5.07, wps=464235, ups=1.07, wpb=434711, bsz=16751.8, num_updates=48400, lr=0.00028748, gnorm=0.204, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.3, wall=47671 epoch 029: 1185 / 1689 loss=3.98, nll_loss=2.341, ppl=5.07, wps=464235, ups=1.07, wpb=434711, bsz=16751.8, num_updates=48400, lr=0.00028748, gnorm=0.204, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.3, wall=47671 epoch 029: 1185 / 1689 loss=3.98, nll_loss=2.341, ppl=5.07, wps=464235, ups=1.07, wpb=434711, bsz=16751.8, num_updates=48400, lr=0.00028748, gnorm=0.204, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.3, wall=47671 epoch 029: 1185 / 1689 loss=3.98, nll_loss=2.341, ppl=5.07, wps=464235, ups=1.07, wpb=434711, bsz=16751.8, num_updates=48400, lr=0.00028748, gnorm=0.204, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.3, wall=47671 epoch 029: 1185 / 1689 loss=3.98, nll_loss=2.341, ppl=5.07, wps=464235, ups=1.07, wpb=434711, bsz=16751.8, num_updates=48400, lr=0.00028748, gnorm=0.204, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.3, wall=47671 epoch 029: 1185 / 1689 loss=3.98, nll_loss=2.341, ppl=5.07, wps=464235, ups=1.07, wpb=434711, bsz=16751.8, num_updates=48400, lr=0.00028748, gnorm=0.204, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.3, wall=47671 epoch 029: 1185 / 1689 loss=3.98, nll_loss=2.341, ppl=5.07, wps=464235, ups=1.07, wpb=434711, bsz=16751.8, num_updates=48400, lr=0.00028748, gnorm=0.204, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.3, wall=47671 epoch 029: 1185 / 1689 loss=3.98, nll_loss=2.341, ppl=5.07, wps=464235, ups=1.07, wpb=434711, bsz=16751.8, num_updates=48400, lr=0.00028748, gnorm=0.204, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.3, wall=47671 epoch 029: 1185 / 1689 loss=3.98, nll_loss=2.341, ppl=5.07, wps=464235, ups=1.07, wpb=434711, bsz=16751.8, num_updates=48400, lr=0.00028748, gnorm=0.204, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.3, wall=47671 epoch 029: 1285 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=465162, ups=1.07, wpb=434944, bsz=16656.7, num_updates=48500, lr=0.000287183, gnorm=0.22, clip=0, loss_scale=0.25, train_wall=92, gb_free=18.9, wall=47765 epoch 029: 1285 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=465162, ups=1.07, wpb=434944, bsz=16656.7, num_updates=48500, lr=0.000287183, gnorm=0.22, clip=0, loss_scale=0.25, train_wall=92, gb_free=18.9, wall=47765 epoch 029: 1285 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=465162, ups=1.07, wpb=434944, bsz=16656.7, num_updates=48500, lr=0.000287183, gnorm=0.22, clip=0, loss_scale=0.25, train_wall=92, gb_free=18.9, wall=47765 epoch 029: 1285 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=465162, ups=1.07, wpb=434944, bsz=16656.7, num_updates=48500, lr=0.000287183, gnorm=0.22, clip=0, loss_scale=0.25, train_wall=92, gb_free=18.9, wall=47765 epoch 029: 1285 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=465162, ups=1.07, wpb=434944, bsz=16656.7, num_updates=48500, lr=0.000287183, gnorm=0.22, clip=0, loss_scale=0.25, train_wall=92, gb_free=18.9, wall=47765 epoch 029: 1285 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=465162, ups=1.07, wpb=434944, bsz=16656.7, num_updates=48500, lr=0.000287183, gnorm=0.22, clip=0, loss_scale=0.25, train_wall=92, gb_free=18.9, wall=47765 epoch 029: 1285 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=465162, ups=1.07, wpb=434944, bsz=16656.7, num_updates=48500, lr=0.000287183, gnorm=0.22, clip=0, loss_scale=0.25, train_wall=92, gb_free=18.9, wall=47765 epoch 029: 1285 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=465162, ups=1.07, wpb=434944, bsz=16656.7, num_updates=48500, lr=0.000287183, gnorm=0.22, clip=0, loss_scale=0.25, train_wall=92, gb_free=18.9, wall=47765 epoch 029: 1285 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=465162, ups=1.07, wpb=434944, bsz=16656.7, num_updates=48500, lr=0.000287183, gnorm=0.22, clip=0, loss_scale=0.25, train_wall=92, gb_free=18.9, wall=47765 epoch 029: 1285 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=465162, ups=1.07, wpb=434944, bsz=16656.7, num_updates=48500, lr=0.000287183, gnorm=0.22, clip=0, loss_scale=0.25, train_wall=92, gb_free=18.9, wall=47765 epoch 029: 1285 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=465162, ups=1.07, wpb=434944, bsz=16656.7, num_updates=48500, lr=0.000287183, gnorm=0.22, clip=0, loss_scale=0.25, train_wall=92, gb_free=18.9, wall=47765 epoch 029: 1285 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=465162, ups=1.07, wpb=434944, bsz=16656.7, num_updates=48500, lr=0.000287183, gnorm=0.22, clip=0, loss_scale=0.25, train_wall=92, gb_free=18.9, wall=47765 epoch 029: 1285 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=465162, ups=1.07, wpb=434944, bsz=16656.7, num_updates=48500, lr=0.000287183, gnorm=0.22, clip=0, loss_scale=0.25, train_wall=92, gb_free=18.9, wall=47765 epoch 029: 1285 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=465162, ups=1.07, wpb=434944, bsz=16656.7, num_updates=48500, lr=0.000287183, gnorm=0.22, clip=0, loss_scale=0.25, train_wall=92, gb_free=18.9, wall=47765 epoch 029: 1285 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=465162, ups=1.07, wpb=434944, bsz=16656.7, num_updates=48500, lr=0.000287183, gnorm=0.22, clip=0, loss_scale=0.25, train_wall=92, gb_free=18.9, wall=47765 epoch 029: 1285 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=465162, ups=1.07, wpb=434944, bsz=16656.7, num_updates=48500, lr=0.000287183, gnorm=0.22, clip=0, loss_scale=0.25, train_wall=92, gb_free=18.9, wall=47765 epoch 029: 1285 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=465162, ups=1.07, wpb=434944, bsz=16656.7, num_updates=48500, lr=0.000287183, gnorm=0.22, clip=0, loss_scale=0.25, train_wall=92, gb_free=18.9, wall=47765 epoch 029: 1285 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=465162, ups=1.07, wpb=434944, bsz=16656.7, num_updates=48500, lr=0.000287183, gnorm=0.22, clip=0, loss_scale=0.25, train_wall=92, gb_free=18.9, wall=47765 epoch 029: 1285 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=465162, ups=1.07, wpb=434944, bsz=16656.7, num_updates=48500, lr=0.000287183, gnorm=0.22, clip=0, loss_scale=0.25, train_wall=92, gb_free=18.9, wall=47765 epoch 029: 1285 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=465162, ups=1.07, wpb=434944, bsz=16656.7, num_updates=48500, lr=0.000287183, gnorm=0.22, clip=0, loss_scale=0.25, train_wall=92, gb_free=18.9, wall=47765 epoch 029: 1285 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=465162, ups=1.07, wpb=434944, bsz=16656.7, num_updates=48500, lr=0.000287183, gnorm=0.22, clip=0, loss_scale=0.25, train_wall=92, gb_free=18.9, wall=47765 epoch 029: 1285 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=465162, ups=1.07, wpb=434944, bsz=16656.7, num_updates=48500, lr=0.000287183, gnorm=0.22, clip=0, loss_scale=0.25, train_wall=92, gb_free=18.9, wall=47765 epoch 029: 1285 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=465162, ups=1.07, wpb=434944, bsz=16656.7, num_updates=48500, lr=0.000287183, gnorm=0.22, clip=0, loss_scale=0.25, train_wall=92, gb_free=18.9, wall=47765 epoch 029: 1285 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=465162, ups=1.07, wpb=434944, bsz=16656.7, num_updates=48500, lr=0.000287183, gnorm=0.22, clip=0, loss_scale=0.25, train_wall=92, gb_free=18.9, wall=47765 epoch 029: 1285 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=465162, ups=1.07, wpb=434944, bsz=16656.7, num_updates=48500, lr=0.000287183, gnorm=0.22, clip=0, loss_scale=0.25, train_wall=92, gb_free=18.9, wall=47765 epoch 029: 1285 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=465162, ups=1.07, wpb=434944, bsz=16656.7, num_updates=48500, lr=0.000287183, gnorm=0.22, clip=0, loss_scale=0.25, train_wall=92, gb_free=18.9, wall=47765 epoch 029: 1285 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=465162, ups=1.07, wpb=434944, bsz=16656.7, num_updates=48500, lr=0.000287183, gnorm=0.22, clip=0, loss_scale=0.25, train_wall=92, gb_free=18.9, wall=47765 epoch 029: 1285 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=465162, ups=1.07, wpb=434944, bsz=16656.7, num_updates=48500, lr=0.000287183, gnorm=0.22, clip=0, loss_scale=0.25, train_wall=92, gb_free=18.9, wall=47765 epoch 029: 1285 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=465162, ups=1.07, wpb=434944, bsz=16656.7, num_updates=48500, lr=0.000287183, gnorm=0.22, clip=0, loss_scale=0.25, train_wall=92, gb_free=18.9, wall=47765 epoch 029: 1385 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464642, ups=1.07, wpb=433685, bsz=16521.8, num_updates=48600, lr=0.000286888, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=47858 epoch 029: 1385 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464642, ups=1.07, wpb=433685, bsz=16521.8, num_updates=48600, lr=0.000286888, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=47858 epoch 029: 1385 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464642, ups=1.07, wpb=433685, bsz=16521.8, num_updates=48600, lr=0.000286888, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=47858 epoch 029: 1385 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464642, ups=1.07, wpb=433685, bsz=16521.8, num_updates=48600, lr=0.000286888, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=47858 epoch 029: 1385 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464642, ups=1.07, wpb=433685, bsz=16521.8, num_updates=48600, lr=0.000286888, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=47858 epoch 029: 1385 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464642, ups=1.07, wpb=433685, bsz=16521.8, num_updates=48600, lr=0.000286888, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=47858 epoch 029: 1385 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464642, ups=1.07, wpb=433685, bsz=16521.8, num_updates=48600, lr=0.000286888, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=47858 epoch 029: 1385 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464642, ups=1.07, wpb=433685, bsz=16521.8, num_updates=48600, lr=0.000286888, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=47858 epoch 029: 1385 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464642, ups=1.07, wpb=433685, bsz=16521.8, num_updates=48600, lr=0.000286888, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=47858 epoch 029: 1385 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464642, ups=1.07, wpb=433685, bsz=16521.8, num_updates=48600, lr=0.000286888, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=47858 epoch 029: 1385 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464642, ups=1.07, wpb=433685, bsz=16521.8, num_updates=48600, lr=0.000286888, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=47858 epoch 029: 1385 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464642, ups=1.07, wpb=433685, bsz=16521.8, num_updates=48600, lr=0.000286888, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=47858 epoch 029: 1385 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464642, ups=1.07, wpb=433685, bsz=16521.8, num_updates=48600, lr=0.000286888, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=47858 epoch 029: 1385 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464642, ups=1.07, wpb=433685, bsz=16521.8, num_updates=48600, lr=0.000286888, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=47858 epoch 029: 1385 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464642, ups=1.07, wpb=433685, bsz=16521.8, num_updates=48600, lr=0.000286888, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=47858 epoch 029: 1385 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464642, ups=1.07, wpb=433685, bsz=16521.8, num_updates=48600, lr=0.000286888, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=47858 epoch 029: 1385 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464642, ups=1.07, wpb=433685, bsz=16521.8, num_updates=48600, lr=0.000286888, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=47858 epoch 029: 1385 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464642, ups=1.07, wpb=433685, bsz=16521.8, num_updates=48600, lr=0.000286888, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=47858 epoch 029: 1385 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464642, ups=1.07, wpb=433685, bsz=16521.8, num_updates=48600, lr=0.000286888, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=47858 epoch 029: 1385 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464642, ups=1.07, wpb=433685, bsz=16521.8, num_updates=48600, lr=0.000286888, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=47858 epoch 029: 1385 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464642, ups=1.07, wpb=433685, bsz=16521.8, num_updates=48600, lr=0.000286888, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=47858 epoch 029: 1385 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464642, ups=1.07, wpb=433685, bsz=16521.8, num_updates=48600, lr=0.000286888, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=47858 epoch 029: 1385 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464642, ups=1.07, wpb=433685, bsz=16521.8, num_updates=48600, lr=0.000286888, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=47858 epoch 029: 1385 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464642, ups=1.07, wpb=433685, bsz=16521.8, num_updates=48600, lr=0.000286888, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=47858 epoch 029: 1385 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464642, ups=1.07, wpb=433685, bsz=16521.8, num_updates=48600, lr=0.000286888, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=47858 epoch 029: 1385 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464642, ups=1.07, wpb=433685, bsz=16521.8, num_updates=48600, lr=0.000286888, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=47858 epoch 029: 1385 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464642, ups=1.07, wpb=433685, bsz=16521.8, num_updates=48600, lr=0.000286888, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=47858 epoch 029: 1385 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464642, ups=1.07, wpb=433685, bsz=16521.8, num_updates=48600, lr=0.000286888, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=47858 epoch 029: 1385 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464642, ups=1.07, wpb=433685, bsz=16521.8, num_updates=48600, lr=0.000286888, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=47858 epoch 029: 1485 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464713, ups=1.07, wpb=435551, bsz=16333.4, num_updates=48700, lr=0.000286593, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=47952 epoch 029: 1485 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464713, ups=1.07, wpb=435551, bsz=16333.4, num_updates=48700, lr=0.000286593, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=47952 epoch 029: 1485 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464713, ups=1.07, wpb=435551, bsz=16333.4, num_updates=48700, lr=0.000286593, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=47952 epoch 029: 1485 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464713, ups=1.07, wpb=435551, bsz=16333.4, num_updates=48700, lr=0.000286593, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=47952 epoch 029: 1485 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464713, ups=1.07, wpb=435551, bsz=16333.4, num_updates=48700, lr=0.000286593, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=47952 epoch 029: 1485 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464713, ups=1.07, wpb=435551, bsz=16333.4, num_updates=48700, lr=0.000286593, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=47952 epoch 029: 1485 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464713, ups=1.07, wpb=435551, bsz=16333.4, num_updates=48700, lr=0.000286593, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=47952 epoch 029: 1485 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464713, ups=1.07, wpb=435551, bsz=16333.4, num_updates=48700, lr=0.000286593, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=47952 epoch 029: 1485 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464713, ups=1.07, wpb=435551, bsz=16333.4, num_updates=48700, lr=0.000286593, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=47952 epoch 029: 1485 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464713, ups=1.07, wpb=435551, bsz=16333.4, num_updates=48700, lr=0.000286593, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=47952 epoch 029: 1485 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464713, ups=1.07, wpb=435551, bsz=16333.4, num_updates=48700, lr=0.000286593, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=47952 epoch 029: 1485 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464713, ups=1.07, wpb=435551, bsz=16333.4, num_updates=48700, lr=0.000286593, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=47952 epoch 029: 1485 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464713, ups=1.07, wpb=435551, bsz=16333.4, num_updates=48700, lr=0.000286593, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=47952 epoch 029: 1485 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464713, ups=1.07, wpb=435551, bsz=16333.4, num_updates=48700, lr=0.000286593, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=47952 epoch 029: 1485 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464713, ups=1.07, wpb=435551, bsz=16333.4, num_updates=48700, lr=0.000286593, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=47952 epoch 029: 1485 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464713, ups=1.07, wpb=435551, bsz=16333.4, num_updates=48700, lr=0.000286593, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=47952 epoch 029: 1485 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464713, ups=1.07, wpb=435551, bsz=16333.4, num_updates=48700, lr=0.000286593, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=47952 epoch 029: 1485 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464713, ups=1.07, wpb=435551, bsz=16333.4, num_updates=48700, lr=0.000286593, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=47952 epoch 029: 1485 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464713, ups=1.07, wpb=435551, bsz=16333.4, num_updates=48700, lr=0.000286593, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=47952 epoch 029: 1485 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464713, ups=1.07, wpb=435551, bsz=16333.4, num_updates=48700, lr=0.000286593, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=47952 epoch 029: 1485 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464713, ups=1.07, wpb=435551, bsz=16333.4, num_updates=48700, lr=0.000286593, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=47952 epoch 029: 1485 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464713, ups=1.07, wpb=435551, bsz=16333.4, num_updates=48700, lr=0.000286593, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=47952 epoch 029: 1485 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464713, ups=1.07, wpb=435551, bsz=16333.4, num_updates=48700, lr=0.000286593, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=47952 epoch 029: 1485 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464713, ups=1.07, wpb=435551, bsz=16333.4, num_updates=48700, lr=0.000286593, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=47952 epoch 029: 1485 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464713, ups=1.07, wpb=435551, bsz=16333.4, num_updates=48700, lr=0.000286593, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=47952 epoch 029: 1485 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464713, ups=1.07, wpb=435551, bsz=16333.4, num_updates=48700, lr=0.000286593, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=47952 epoch 029: 1485 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464713, ups=1.07, wpb=435551, bsz=16333.4, num_updates=48700, lr=0.000286593, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=47952 epoch 029: 1485 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464713, ups=1.07, wpb=435551, bsz=16333.4, num_updates=48700, lr=0.000286593, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=47952 epoch 029: 1485 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464713, ups=1.07, wpb=435551, bsz=16333.4, num_updates=48700, lr=0.000286593, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.7, wall=47952 epoch 029: 1585 / 1689 loss=3.971, nll_loss=2.331, ppl=5.03, wps=458792, ups=1.06, wpb=433643, bsz=16240.3, num_updates=48800, lr=0.000286299, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=48046 epoch 029: 1585 / 1689 loss=3.971, nll_loss=2.331, ppl=5.03, wps=458792, ups=1.06, wpb=433643, bsz=16240.3, num_updates=48800, lr=0.000286299, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=48046 epoch 029: 1585 / 1689 loss=3.971, nll_loss=2.331, ppl=5.03, wps=458792, ups=1.06, wpb=433643, bsz=16240.3, num_updates=48800, lr=0.000286299, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=48046 epoch 029: 1585 / 1689 loss=3.971, nll_loss=2.331, ppl=5.03, wps=458792, ups=1.06, wpb=433643, bsz=16240.3, num_updates=48800, lr=0.000286299, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=48046 epoch 029: 1585 / 1689 loss=3.971, nll_loss=2.331, ppl=5.03, wps=458792, ups=1.06, wpb=433643, bsz=16240.3, num_updates=48800, lr=0.000286299, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=48046 epoch 029: 1585 / 1689 loss=3.971, nll_loss=2.331, ppl=5.03, wps=458792, ups=1.06, wpb=433643, bsz=16240.3, num_updates=48800, lr=0.000286299, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=48046 epoch 029: 1585 / 1689 loss=3.971, nll_loss=2.331, ppl=5.03, wps=458792, ups=1.06, wpb=433643, bsz=16240.3, num_updates=48800, lr=0.000286299, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=48046 epoch 029: 1585 / 1689 loss=3.971, nll_loss=2.331, ppl=5.03, wps=458792, ups=1.06, wpb=433643, bsz=16240.3, num_updates=48800, lr=0.000286299, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=48046 epoch 029: 1585 / 1689 loss=3.971, nll_loss=2.331, ppl=5.03, wps=458792, ups=1.06, wpb=433643, bsz=16240.3, num_updates=48800, lr=0.000286299, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=48046 epoch 029: 1585 / 1689 loss=3.971, nll_loss=2.331, ppl=5.03, wps=458792, ups=1.06, wpb=433643, bsz=16240.3, num_updates=48800, lr=0.000286299, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=48046 epoch 029: 1585 / 1689 loss=3.971, nll_loss=2.331, ppl=5.03, wps=458792, ups=1.06, wpb=433643, bsz=16240.3, num_updates=48800, lr=0.000286299, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=48046 epoch 029: 1585 / 1689 loss=3.971, nll_loss=2.331, ppl=5.03, wps=458792, ups=1.06, wpb=433643, bsz=16240.3, num_updates=48800, lr=0.000286299, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=48046 epoch 029: 1585 / 1689 loss=3.971, nll_loss=2.331, ppl=5.03, wps=458792, ups=1.06, wpb=433643, bsz=16240.3, num_updates=48800, lr=0.000286299, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=48046 epoch 029: 1585 / 1689 loss=3.971, nll_loss=2.331, ppl=5.03, wps=458792, ups=1.06, wpb=433643, bsz=16240.3, num_updates=48800, lr=0.000286299, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=48046 epoch 029: 1585 / 1689 loss=3.971, nll_loss=2.331, ppl=5.03, wps=458792, ups=1.06, wpb=433643, bsz=16240.3, num_updates=48800, lr=0.000286299, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=48046 epoch 029: 1585 / 1689 loss=3.971, nll_loss=2.331, ppl=5.03, wps=458792, ups=1.06, wpb=433643, bsz=16240.3, num_updates=48800, lr=0.000286299, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=48046 epoch 029: 1585 / 1689 loss=3.971, nll_loss=2.331, ppl=5.03, wps=458792, ups=1.06, wpb=433643, bsz=16240.3, num_updates=48800, lr=0.000286299, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=48046 epoch 029: 1585 / 1689 loss=3.971, nll_loss=2.331, ppl=5.03, wps=458792, ups=1.06, wpb=433643, bsz=16240.3, num_updates=48800, lr=0.000286299, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=48046 epoch 029: 1585 / 1689 loss=3.971, nll_loss=2.331, ppl=5.03, wps=458792, ups=1.06, wpb=433643, bsz=16240.3, num_updates=48800, lr=0.000286299, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=48046 epoch 029: 1585 / 1689 loss=3.971, nll_loss=2.331, ppl=5.03, wps=458792, ups=1.06, wpb=433643, bsz=16240.3, num_updates=48800, lr=0.000286299, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=48046 epoch 029: 1585 / 1689 loss=3.971, nll_loss=2.331, ppl=5.03, wps=458792, ups=1.06, wpb=433643, bsz=16240.3, num_updates=48800, lr=0.000286299, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=48046 epoch 029: 1585 / 1689 loss=3.971, nll_loss=2.331, ppl=5.03, wps=458792, ups=1.06, wpb=433643, bsz=16240.3, num_updates=48800, lr=0.000286299, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=48046 epoch 029: 1585 / 1689 loss=3.971, nll_loss=2.331, ppl=5.03, wps=458792, ups=1.06, wpb=433643, bsz=16240.3, num_updates=48800, lr=0.000286299, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=48046 epoch 029: 1585 / 1689 loss=3.971, nll_loss=2.331, ppl=5.03, wps=458792, ups=1.06, wpb=433643, bsz=16240.3, num_updates=48800, lr=0.000286299, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=48046 epoch 029: 1585 / 1689 loss=3.971, nll_loss=2.331, ppl=5.03, wps=458792, ups=1.06, wpb=433643, bsz=16240.3, num_updates=48800, lr=0.000286299, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=48046 epoch 029: 1585 / 1689 loss=3.971, nll_loss=2.331, ppl=5.03, wps=458792, ups=1.06, wpb=433643, bsz=16240.3, num_updates=48800, lr=0.000286299, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=48046 epoch 029: 1585 / 1689 loss=3.971, nll_loss=2.331, ppl=5.03, wps=458792, ups=1.06, wpb=433643, bsz=16240.3, num_updates=48800, lr=0.000286299, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=48046 epoch 029: 1585 / 1689 loss=3.971, nll_loss=2.331, ppl=5.03, wps=458792, ups=1.06, wpb=433643, bsz=16240.3, num_updates=48800, lr=0.000286299, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=48046 epoch 029: 1585 / 1689 loss=3.971, nll_loss=2.331, ppl=5.03, wps=458792, ups=1.06, wpb=433643, bsz=16240.3, num_updates=48800, lr=0.000286299, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=48046 epoch 029: 1685 / 1689 loss=3.976, nll_loss=2.337, ppl=5.05, wps=461951, ups=1.06, wpb=436760, bsz=16349, num_updates=48900, lr=0.000286006, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=48141 epoch 029: 1685 / 1689 loss=3.976, nll_loss=2.337, ppl=5.05, wps=461951, ups=1.06, wpb=436760, bsz=16349, num_updates=48900, lr=0.000286006, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=48141 epoch 029: 1685 / 1689 loss=3.976, nll_loss=2.337, ppl=5.05, wps=461951, ups=1.06, wpb=436760, bsz=16349, num_updates=48900, lr=0.000286006, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=48141 epoch 029: 1685 / 1689 loss=3.976, nll_loss=2.337, ppl=5.05, wps=461951, ups=1.06, wpb=436760, bsz=16349, num_updates=48900, lr=0.000286006, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=48141 epoch 029: 1685 / 1689 loss=3.976, nll_loss=2.337, ppl=5.05, wps=461951, ups=1.06, wpb=436760, bsz=16349, num_updates=48900, lr=0.000286006, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=48141 epoch 029: 1685 / 1689 loss=3.976, nll_loss=2.337, ppl=5.05, wps=461951, ups=1.06, wpb=436760, bsz=16349, num_updates=48900, lr=0.000286006, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=48141 epoch 029: 1685 / 1689 loss=3.976, nll_loss=2.337, ppl=5.05, wps=461951, ups=1.06, wpb=436760, bsz=16349, num_updates=48900, lr=0.000286006, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=48141 epoch 029: 1685 / 1689 loss=3.976, nll_loss=2.337, ppl=5.05, wps=461951, ups=1.06, wpb=436760, bsz=16349, num_updates=48900, lr=0.000286006, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=48141 epoch 029: 1685 / 1689 loss=3.976, nll_loss=2.337, ppl=5.05, wps=461951, ups=1.06, wpb=436760, bsz=16349, num_updates=48900, lr=0.000286006, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=48141 epoch 029: 1685 / 1689 loss=3.976, nll_loss=2.337, ppl=5.05, wps=461951, ups=1.06, wpb=436760, bsz=16349, num_updates=48900, lr=0.000286006, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=48141 epoch 029: 1685 / 1689 loss=3.976, nll_loss=2.337, ppl=5.05, wps=461951, ups=1.06, wpb=436760, bsz=16349, num_updates=48900, lr=0.000286006, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=48141 epoch 029: 1685 / 1689 loss=3.976, nll_loss=2.337, ppl=5.05, wps=461951, ups=1.06, wpb=436760, bsz=16349, num_updates=48900, lr=0.000286006, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=48141 epoch 029: 1685 / 1689 loss=3.976, nll_loss=2.337, ppl=5.05, wps=461951, ups=1.06, wpb=436760, bsz=16349, num_updates=48900, lr=0.000286006, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=48141 epoch 029: 1685 / 1689 loss=3.976, nll_loss=2.337, ppl=5.05, wps=461951, ups=1.06, wpb=436760, bsz=16349, num_updates=48900, lr=0.000286006, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=48141 epoch 029: 1685 / 1689 loss=3.976, nll_loss=2.337, ppl=5.05, wps=461951, ups=1.06, wpb=436760, bsz=16349, num_updates=48900, lr=0.000286006, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=48141 epoch 029: 1685 / 1689 loss=3.976, nll_loss=2.337, ppl=5.05, wps=461951, ups=1.06, wpb=436760, bsz=16349, num_updates=48900, lr=0.000286006, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=48141 epoch 029: 1685 / 1689 loss=3.976, nll_loss=2.337, ppl=5.05, wps=461951, ups=1.06, wpb=436760, bsz=16349, num_updates=48900, lr=0.000286006, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=48141 epoch 029: 1685 / 1689 loss=3.976, nll_loss=2.337, ppl=5.05, wps=461951, ups=1.06, wpb=436760, bsz=16349, num_updates=48900, lr=0.000286006, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=48141 epoch 029: 1685 / 1689 loss=3.976, nll_loss=2.337, ppl=5.05, wps=461951, ups=1.06, wpb=436760, bsz=16349, num_updates=48900, lr=0.000286006, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=48141 epoch 029: 1685 / 1689 loss=3.976, nll_loss=2.337, ppl=5.05, wps=461951, ups=1.06, wpb=436760, bsz=16349, num_updates=48900, lr=0.000286006, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=48141 epoch 029: 1685 / 1689 loss=3.976, nll_loss=2.337, ppl=5.05, wps=461951, ups=1.06, wpb=436760, bsz=16349, num_updates=48900, lr=0.000286006, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=48141 epoch 029: 1685 / 1689 loss=3.976, nll_loss=2.337, ppl=5.05, wps=461951, ups=1.06, wpb=436760, bsz=16349, num_updates=48900, lr=0.000286006, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=48141 epoch 029: 1685 / 1689 loss=3.976, nll_loss=2.337, ppl=5.05, wps=461951, ups=1.06, wpb=436760, bsz=16349, num_updates=48900, lr=0.000286006, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=48141 epoch 029: 1685 / 1689 loss=3.976, nll_loss=2.337, ppl=5.05, wps=461951, ups=1.06, wpb=436760, bsz=16349, num_updates=48900, lr=0.000286006, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=48141 epoch 029: 1685 / 1689 loss=3.976, nll_loss=2.337, ppl=5.05, wps=461951, ups=1.06, wpb=436760, bsz=16349, num_updates=48900, lr=0.000286006, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=48141 epoch 029: 1685 / 1689 loss=3.976, nll_loss=2.337, ppl=5.05, wps=461951, ups=1.06, wpb=436760, bsz=16349, num_updates=48900, lr=0.000286006, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=48141 epoch 029: 1685 / 1689 loss=3.976, nll_loss=2.337, ppl=5.05, wps=461951, ups=1.06, wpb=436760, bsz=16349, num_updates=48900, lr=0.000286006, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=48141 epoch 029: 1685 / 1689 loss=3.976, nll_loss=2.337, ppl=5.05, wps=461951, ups=1.06, wpb=436760, bsz=16349, num_updates=48900, lr=0.000286006, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=48141 epoch 029: 1685 / 1689 loss=3.976, nll_loss=2.337, ppl=5.05, wps=461951, ups=1.06, wpb=436760, bsz=16349, num_updates=48900, lr=0.000286006, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.6, wall=48141 end of epoch 29 (average epoch stats below) epoch 029 | loss 3.966 | nll_loss 2.325 | ppl 5.01 | wps 451740 | ups 1.04 | wpb 433518 | bsz 16506.2 | num_updates 48904 | lr 0.000285995 | gnorm 0.214 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 22.6 | wall 48144 epoch 029 | loss 3.966 | nll_loss 2.325 | ppl 5.01 | wps 451740 | ups 1.04 | wpb 433518 | bsz 16506.2 | num_updates 48904 | lr 0.000285995 | gnorm 0.214 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 22.6 | wall 48144 epoch 029 | loss 3.966 | nll_loss 2.325 | ppl 5.01 | wps 451740 | ups 1.04 | wpb 433518 | bsz 16506.2 | num_updates 48904 | lr 0.000285995 | gnorm 0.214 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 22.6 | wall 48144 epoch 029 | loss 3.966 | nll_loss 2.325 | ppl 5.01 | wps 451740 | ups 1.04 | wpb 433518 | bsz 16506.2 | num_updates 48904 | lr 0.000285995 | gnorm 0.214 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 22.6 | wall 48144 epoch 029 | loss 3.966 | nll_loss 2.325 | ppl 5.01 | wps 451740 | ups 1.04 | wpb 433518 | bsz 16506.2 | num_updates 48904 | lr 0.000285995 | gnorm 0.214 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 22.6 | wall 48144 epoch 029 | loss 3.966 | nll_loss 2.325 | ppl 5.01 | wps 451740 | ups 1.04 | wpb 433518 | bsz 16506.2 | num_updates 48904 | lr 0.000285995 | gnorm 0.214 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 22.6 | wall 48144 epoch 029 | loss 3.966 | nll_loss 2.325 | ppl 5.01 | wps 451740 | ups 1.04 | wpb 433518 | bsz 16506.2 | num_updates 48904 | lr 0.000285995 | gnorm 0.214 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 22.6 | wall 48144 epoch 029 | loss 3.966 | nll_loss 2.325 | ppl 5.01 | wps 451740 | ups 1.04 | wpb 433518 | bsz 16506.2 | num_updates 48904 | lr 0.000285995 | gnorm 0.214 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 22.6 | wall 48144 epoch 029 | loss 3.966 | nll_loss 2.325 | ppl 5.01 | wps 451740 | ups 1.04 | wpb 433518 | bsz 16506.2 | num_updates 48904 | lr 0.000285995 | gnorm 0.214 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 22.6 | wall 48144 epoch 029 | loss 3.966 | nll_loss 2.325 | ppl 5.01 | wps 451740 | ups 1.04 | wpb 433518 | bsz 16506.2 | num_updates 48904 | lr 0.000285995 | gnorm 0.214 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 22.6 | wall 48144 epoch 029 | loss 3.966 | nll_loss 2.325 | ppl 5.01 | wps 451740 | ups 1.04 | wpb 433518 | bsz 16506.2 | num_updates 48904 | lr 0.000285995 | gnorm 0.214 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 22.6 | wall 48144 epoch 029 | loss 3.966 | nll_loss 2.325 | ppl 5.01 | wps 451740 | ups 1.04 | wpb 433518 | bsz 16506.2 | num_updates 48904 | lr 0.000285995 | gnorm 0.214 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 22.6 | wall 48144 epoch 029 | loss 3.966 | nll_loss 2.325 | ppl 5.01 | wps 451740 | ups 1.04 | wpb 433518 | bsz 16506.2 | num_updates 48904 | lr 0.000285995 | gnorm 0.214 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 22.6 | wall 48144 epoch 029 | loss 3.966 | nll_loss 2.325 | ppl 5.01 | wps 451740 | ups 1.04 | wpb 433518 | bsz 16506.2 | num_updates 48904 | lr 0.000285995 | gnorm 0.214 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 22.6 | wall 48144 epoch 029 | loss 3.966 | nll_loss 2.325 | ppl 5.01 | wps 451740 | ups 1.04 | wpb 433518 | bsz 16506.2 | num_updates 48904 | lr 0.000285995 | gnorm 0.214 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 22.6 | wall 48144 epoch 029 | loss 3.966 | nll_loss 2.325 | ppl 5.01 | wps 451740 | ups 1.04 | wpb 433518 | bsz 16506.2 | num_updates 48904 | lr 0.000285995 | gnorm 0.214 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 22.6 | wall 48144 epoch 029 | loss 3.966 | nll_loss 2.325 | ppl 5.01 | wps 451740 | ups 1.04 | wpb 433518 | bsz 16506.2 | num_updates 48904 | lr 0.000285995 | gnorm 0.214 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 22.6 | wall 48144 epoch 029 | loss 3.966 | nll_loss 2.325 | ppl 5.01 | wps 451740 | ups 1.04 | wpb 433518 | bsz 16506.2 | num_updates 48904 | lr 0.000285995 | gnorm 0.214 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 22.6 | wall 48144 epoch 029 | loss 3.966 | nll_loss 2.325 | ppl 5.01 | wps 451740 | ups 1.04 | wpb 433518 | bsz 16506.2 | num_updates 48904 | lr 0.000285995 | gnorm 0.214 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 22.6 | wall 48144 epoch 029 | loss 3.966 | nll_loss 2.325 | ppl 5.01 | wps 451740 | ups 1.04 | wpb 433518 | bsz 16506.2 | num_updates 48904 | lr 0.000285995 | gnorm 0.214 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 22.6 | wall 48144 epoch 029 | loss 3.966 | nll_loss 2.325 | ppl 5.01 | wps 451740 | ups 1.04 | wpb 433518 | bsz 16506.2 | num_updates 48904 | lr 0.000285995 | gnorm 0.214 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 22.6 | wall 48144 epoch 029 | loss 3.966 | nll_loss 2.325 | ppl 5.01 | wps 451740 | ups 1.04 | wpb 433518 | bsz 16506.2 | num_updates 48904 | lr 0.000285995 | gnorm 0.214 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 22.6 | wall 48144 epoch 029 | loss 3.966 | nll_loss 2.325 | ppl 5.01 | wps 451740 | ups 1.04 | wpb 433518 | bsz 16506.2 | num_updates 48904 | lr 0.000285995 | gnorm 0.214 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 22.6 | wall 48144 epoch 029 | loss 3.966 | nll_loss 2.325 | ppl 5.01 | wps 451740 | ups 1.04 | wpb 433518 | bsz 16506.2 | num_updates 48904 | lr 0.000285995 | gnorm 0.214 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 22.6 | wall 48144 epoch 029 | loss 3.966 | nll_loss 2.325 | ppl 5.01 | wps 451740 | ups 1.04 | wpb 433518 | bsz 16506.2 | num_updates 48904 | lr 0.000285995 | gnorm 0.214 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 22.6 | wall 48144 epoch 029 | loss 3.966 | nll_loss 2.325 | ppl 5.01 | wps 451740 | ups 1.04 | wpb 433518 | bsz 16506.2 | num_updates 48904 | lr 0.000285995 | gnorm 0.214 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 22.6 | wall 48144 epoch 029 | loss 3.966 | nll_loss 2.325 | ppl 5.01 | wps 451740 | ups 1.04 | wpb 433518 | bsz 16506.2 | num_updates 48904 | lr 0.000285995 | gnorm 0.214 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 22.6 | wall 48144 epoch 029 | loss 3.966 | nll_loss 2.325 | ppl 5.01 | wps 451740 | ups 1.04 | wpb 433518 | bsz 16506.2 | num_updates 48904 | lr 0.000285995 | gnorm 0.214 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 22.6 | wall 48144 epoch 029 | loss 3.966 | nll_loss 2.325 | ppl 5.01 | wps 451740 | ups 1.04 | wpb 433518 | bsz 16506.2 | num_updates 48904 | lr 0.000285995 | gnorm 0.214 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 22.6 | wall 48144 Start iterating over samples epoch 030: 96 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=454021, ups=1.06, wpb=430089, bsz=16533.6, num_updates=49000, lr=0.000285714, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=48235 epoch 030: 96 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=454021, ups=1.06, wpb=430089, bsz=16533.6, num_updates=49000, lr=0.000285714, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=48235 epoch 030: 96 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=454021, ups=1.06, wpb=430089, bsz=16533.6, num_updates=49000, lr=0.000285714, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=48235 epoch 030: 96 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=454021, ups=1.06, wpb=430089, bsz=16533.6, num_updates=49000, lr=0.000285714, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=48235 epoch 030: 96 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=454021, ups=1.06, wpb=430089, bsz=16533.6, num_updates=49000, lr=0.000285714, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=48235 epoch 030: 96 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=454021, ups=1.06, wpb=430089, bsz=16533.6, num_updates=49000, lr=0.000285714, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=48235 epoch 030: 96 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=454021, ups=1.06, wpb=430089, bsz=16533.6, num_updates=49000, lr=0.000285714, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=48235 epoch 030: 96 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=454021, ups=1.06, wpb=430089, bsz=16533.6, num_updates=49000, lr=0.000285714, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=48235 epoch 030: 96 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=454021, ups=1.06, wpb=430089, bsz=16533.6, num_updates=49000, lr=0.000285714, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=48235 epoch 030: 96 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=454021, ups=1.06, wpb=430089, bsz=16533.6, num_updates=49000, lr=0.000285714, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=48235 epoch 030: 96 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=454021, ups=1.06, wpb=430089, bsz=16533.6, num_updates=49000, lr=0.000285714, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=48235 epoch 030: 96 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=454021, ups=1.06, wpb=430089, bsz=16533.6, num_updates=49000, lr=0.000285714, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=48235 epoch 030: 96 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=454021, ups=1.06, wpb=430089, bsz=16533.6, num_updates=49000, lr=0.000285714, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=48235 epoch 030: 96 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=454021, ups=1.06, wpb=430089, bsz=16533.6, num_updates=49000, lr=0.000285714, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=48235 epoch 030: 96 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=454021, ups=1.06, wpb=430089, bsz=16533.6, num_updates=49000, lr=0.000285714, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=48235 epoch 030: 96 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=454021, ups=1.06, wpb=430089, bsz=16533.6, num_updates=49000, lr=0.000285714, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=48235 epoch 030: 96 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=454021, ups=1.06, wpb=430089, bsz=16533.6, num_updates=49000, lr=0.000285714, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=48235 epoch 030: 96 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=454021, ups=1.06, wpb=430089, bsz=16533.6, num_updates=49000, lr=0.000285714, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=48235 epoch 030: 96 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=454021, ups=1.06, wpb=430089, bsz=16533.6, num_updates=49000, lr=0.000285714, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=48235 epoch 030: 96 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=454021, ups=1.06, wpb=430089, bsz=16533.6, num_updates=49000, lr=0.000285714, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=48235 epoch 030: 96 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=454021, ups=1.06, wpb=430089, bsz=16533.6, num_updates=49000, lr=0.000285714, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=48235 epoch 030: 96 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=454021, ups=1.06, wpb=430089, bsz=16533.6, num_updates=49000, lr=0.000285714, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=48235 epoch 030: 96 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=454021, ups=1.06, wpb=430089, bsz=16533.6, num_updates=49000, lr=0.000285714, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=48235 epoch 030: 96 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=454021, ups=1.06, wpb=430089, bsz=16533.6, num_updates=49000, lr=0.000285714, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=48235 epoch 030: 96 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=454021, ups=1.06, wpb=430089, bsz=16533.6, num_updates=49000, lr=0.000285714, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=48235 epoch 030: 96 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=454021, ups=1.06, wpb=430089, bsz=16533.6, num_updates=49000, lr=0.000285714, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=48235 epoch 030: 96 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=454021, ups=1.06, wpb=430089, bsz=16533.6, num_updates=49000, lr=0.000285714, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=48235 epoch 030: 96 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=454021, ups=1.06, wpb=430089, bsz=16533.6, num_updates=49000, lr=0.000285714, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=48235 epoch 030: 96 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=454021, ups=1.06, wpb=430089, bsz=16533.6, num_updates=49000, lr=0.000285714, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=48235 epoch 030: 96 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=454021, ups=1.06, wpb=430089, bsz=16533.6, num_updates=49000, lr=0.000285714, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.6, wall=48235 begin validation on "valid" subset epoch 030 | valid on 'valid' subset | loss 4.248 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.238 epoch 030 | valid on 'valid' subset | loss 4.248 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.238 epoch 030 | valid on 'valid' subset | loss 4.248 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.238 epoch 030 | valid on 'valid' subset | loss 4.248 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.238 epoch 030 | valid on 'valid' subset | loss 4.248 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.238 epoch 030 | valid on 'valid' subset | loss 4.248 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.238 epoch 030 | valid on 'valid' subset | loss 4.248 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.238 epoch 030 | valid on 'valid' subset | loss 4.248 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.238 epoch 030 | valid on 'valid' subset | loss 4.248 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.238 epoch 030 | valid on 'valid' subset | loss 4.248 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.238 epoch 030 | valid on 'valid' subset | loss 4.248 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.238 epoch 030 | valid on 'valid' subset | loss 4.248 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.238 epoch 030 | valid on 'valid' subset | loss 4.248 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.238 epoch 030 | valid on 'valid' subset | loss 4.248 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.238 epoch 030 | valid on 'valid' subset | loss 4.248 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.238 epoch 030 | valid on 'valid' subset | loss 4.248 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.238 epoch 030 | valid on 'valid' subset | loss 4.248 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.238 epoch 030 | valid on 'valid' subset | loss 4.248 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.238 epoch 030 | valid on 'valid' subset | loss 4.248 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.238 epoch 030 | valid on 'valid' subset | loss 4.248 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.238 epoch 030 | valid on 'valid' subset | loss 4.248 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.238 epoch 030 | valid on 'valid' subset | loss 4.248 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.238 epoch 030 | valid on 'valid' subset | loss 4.248 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.238 epoch 030 | valid on 'valid' subset | loss 4.248 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.238 epoch 030 | valid on 'valid' subset | loss 4.248 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.238 epoch 030 | valid on 'valid' subset | loss 4.248 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.238 epoch 030 | valid on 'valid' subset | loss 4.248 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.238 epoch 030 | valid on 'valid' subset | loss 4.248 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.238 epoch 030 | valid on 'valid' subset | loss 4.248 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.238 epoch 030 | valid on 'valid' subset | loss 4.248 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 49000 | best_loss 4.238 epoch 030: 196 / 1689 loss=3.939, nll_loss=2.293, ppl=4.9, wps=404121, ups=0.93, wpb=433620, bsz=16209.4, num_updates=49100, lr=0.000285423, gnorm=0.217, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=48343 epoch 030: 196 / 1689 loss=3.939, nll_loss=2.293, ppl=4.9, wps=404121, ups=0.93, wpb=433620, bsz=16209.4, num_updates=49100, lr=0.000285423, gnorm=0.217, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=48343 epoch 030: 196 / 1689 loss=3.939, nll_loss=2.293, ppl=4.9, wps=404121, ups=0.93, wpb=433620, bsz=16209.4, num_updates=49100, lr=0.000285423, gnorm=0.217, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=48343 epoch 030: 196 / 1689 loss=3.939, nll_loss=2.293, ppl=4.9, wps=404121, ups=0.93, wpb=433620, bsz=16209.4, num_updates=49100, lr=0.000285423, gnorm=0.217, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=48343 epoch 030: 196 / 1689 loss=3.939, nll_loss=2.293, ppl=4.9, wps=404121, ups=0.93, wpb=433620, bsz=16209.4, num_updates=49100, lr=0.000285423, gnorm=0.217, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=48343 epoch 030: 196 / 1689 loss=3.939, nll_loss=2.293, ppl=4.9, wps=404121, ups=0.93, wpb=433620, bsz=16209.4, num_updates=49100, lr=0.000285423, gnorm=0.217, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=48343 epoch 030: 196 / 1689 loss=3.939, nll_loss=2.293, ppl=4.9, wps=404121, ups=0.93, wpb=433620, bsz=16209.4, num_updates=49100, lr=0.000285423, gnorm=0.217, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=48343 epoch 030: 196 / 1689 loss=3.939, nll_loss=2.293, ppl=4.9, wps=404121, ups=0.93, wpb=433620, bsz=16209.4, num_updates=49100, lr=0.000285423, gnorm=0.217, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=48343 epoch 030: 196 / 1689 loss=3.939, nll_loss=2.293, ppl=4.9, wps=404121, ups=0.93, wpb=433620, bsz=16209.4, num_updates=49100, lr=0.000285423, gnorm=0.217, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=48343 epoch 030: 196 / 1689 loss=3.939, nll_loss=2.293, ppl=4.9, wps=404121, ups=0.93, wpb=433620, bsz=16209.4, num_updates=49100, lr=0.000285423, gnorm=0.217, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=48343 epoch 030: 196 / 1689 loss=3.939, nll_loss=2.293, ppl=4.9, wps=404121, ups=0.93, wpb=433620, bsz=16209.4, num_updates=49100, lr=0.000285423, gnorm=0.217, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=48343 epoch 030: 196 / 1689 loss=3.939, nll_loss=2.293, ppl=4.9, wps=404121, ups=0.93, wpb=433620, bsz=16209.4, num_updates=49100, lr=0.000285423, gnorm=0.217, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=48343 epoch 030: 196 / 1689 loss=3.939, nll_loss=2.293, ppl=4.9, wps=404121, ups=0.93, wpb=433620, bsz=16209.4, num_updates=49100, lr=0.000285423, gnorm=0.217, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=48343 epoch 030: 196 / 1689 loss=3.939, nll_loss=2.293, ppl=4.9, wps=404121, ups=0.93, wpb=433620, bsz=16209.4, num_updates=49100, lr=0.000285423, gnorm=0.217, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=48343 epoch 030: 196 / 1689 loss=3.939, nll_loss=2.293, ppl=4.9, wps=404121, ups=0.93, wpb=433620, bsz=16209.4, num_updates=49100, lr=0.000285423, gnorm=0.217, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=48343 epoch 030: 196 / 1689 loss=3.939, nll_loss=2.293, ppl=4.9, wps=404121, ups=0.93, wpb=433620, bsz=16209.4, num_updates=49100, lr=0.000285423, gnorm=0.217, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=48343 epoch 030: 196 / 1689 loss=3.939, nll_loss=2.293, ppl=4.9, wps=404121, ups=0.93, wpb=433620, bsz=16209.4, num_updates=49100, lr=0.000285423, gnorm=0.217, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=48343 epoch 030: 196 / 1689 loss=3.939, nll_loss=2.293, ppl=4.9, wps=404121, ups=0.93, wpb=433620, bsz=16209.4, num_updates=49100, lr=0.000285423, gnorm=0.217, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=48343 epoch 030: 196 / 1689 loss=3.939, nll_loss=2.293, ppl=4.9, wps=404121, ups=0.93, wpb=433620, bsz=16209.4, num_updates=49100, lr=0.000285423, gnorm=0.217, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=48343 epoch 030: 196 / 1689 loss=3.939, nll_loss=2.293, ppl=4.9, wps=404121, ups=0.93, wpb=433620, bsz=16209.4, num_updates=49100, lr=0.000285423, gnorm=0.217, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=48343 epoch 030: 196 / 1689 loss=3.939, nll_loss=2.293, ppl=4.9, wps=404121, ups=0.93, wpb=433620, bsz=16209.4, num_updates=49100, lr=0.000285423, gnorm=0.217, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=48343 epoch 030: 196 / 1689 loss=3.939, nll_loss=2.293, ppl=4.9, wps=404121, ups=0.93, wpb=433620, bsz=16209.4, num_updates=49100, lr=0.000285423, gnorm=0.217, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=48343 epoch 030: 196 / 1689 loss=3.939, nll_loss=2.293, ppl=4.9, wps=404121, ups=0.93, wpb=433620, bsz=16209.4, num_updates=49100, lr=0.000285423, gnorm=0.217, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=48343 epoch 030: 196 / 1689 loss=3.939, nll_loss=2.293, ppl=4.9, wps=404121, ups=0.93, wpb=433620, bsz=16209.4, num_updates=49100, lr=0.000285423, gnorm=0.217, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=48343 epoch 030: 196 / 1689 loss=3.939, nll_loss=2.293, ppl=4.9, wps=404121, ups=0.93, wpb=433620, bsz=16209.4, num_updates=49100, lr=0.000285423, gnorm=0.217, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=48343 epoch 030: 196 / 1689 loss=3.939, nll_loss=2.293, ppl=4.9, wps=404121, ups=0.93, wpb=433620, bsz=16209.4, num_updates=49100, lr=0.000285423, gnorm=0.217, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=48343 epoch 030: 196 / 1689 loss=3.939, nll_loss=2.293, ppl=4.9, wps=404121, ups=0.93, wpb=433620, bsz=16209.4, num_updates=49100, lr=0.000285423, gnorm=0.217, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=48343 epoch 030: 196 / 1689 loss=3.939, nll_loss=2.293, ppl=4.9, wps=404121, ups=0.93, wpb=433620, bsz=16209.4, num_updates=49100, lr=0.000285423, gnorm=0.217, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=48343 epoch 030: 196 / 1689 loss=3.939, nll_loss=2.293, ppl=4.9, wps=404121, ups=0.93, wpb=433620, bsz=16209.4, num_updates=49100, lr=0.000285423, gnorm=0.217, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=48343 epoch 030: 196 / 1689 loss=3.939, nll_loss=2.293, ppl=4.9, wps=404121, ups=0.93, wpb=433620, bsz=16209.4, num_updates=49100, lr=0.000285423, gnorm=0.217, clip=0, loss_scale=1, train_wall=92, gb_free=18.9, wall=48343 epoch 030: 296 / 1689 loss=3.955, nll_loss=2.311, ppl=4.96, wps=460529, ups=1.06, wpb=433866, bsz=16231.3, num_updates=49200, lr=0.000285133, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=48437 epoch 030: 296 / 1689 loss=3.955, nll_loss=2.311, ppl=4.96, wps=460529, ups=1.06, wpb=433866, bsz=16231.3, num_updates=49200, lr=0.000285133, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=48437 epoch 030: 296 / 1689 loss=3.955, nll_loss=2.311, ppl=4.96, wps=460529, ups=1.06, wpb=433866, bsz=16231.3, num_updates=49200, lr=0.000285133, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=48437 epoch 030: 296 / 1689 loss=3.955, nll_loss=2.311, ppl=4.96, wps=460529, ups=1.06, wpb=433866, bsz=16231.3, num_updates=49200, lr=0.000285133, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=48437 epoch 030: 296 / 1689 loss=3.955, nll_loss=2.311, ppl=4.96, wps=460529, ups=1.06, wpb=433866, bsz=16231.3, num_updates=49200, lr=0.000285133, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=48437 epoch 030: 296 / 1689 loss=3.955, nll_loss=2.311, ppl=4.96, wps=460529, ups=1.06, wpb=433866, bsz=16231.3, num_updates=49200, lr=0.000285133, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=48437 epoch 030: 296 / 1689 loss=3.955, nll_loss=2.311, ppl=4.96, wps=460529, ups=1.06, wpb=433866, bsz=16231.3, num_updates=49200, lr=0.000285133, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=48437 epoch 030: 296 / 1689 loss=3.955, nll_loss=2.311, ppl=4.96, wps=460529, ups=1.06, wpb=433866, bsz=16231.3, num_updates=49200, lr=0.000285133, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=48437 epoch 030: 296 / 1689 loss=3.955, nll_loss=2.311, ppl=4.96, wps=460529, ups=1.06, wpb=433866, bsz=16231.3, num_updates=49200, lr=0.000285133, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=48437 epoch 030: 296 / 1689 loss=3.955, nll_loss=2.311, ppl=4.96, wps=460529, ups=1.06, wpb=433866, bsz=16231.3, num_updates=49200, lr=0.000285133, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=48437 epoch 030: 296 / 1689 loss=3.955, nll_loss=2.311, ppl=4.96, wps=460529, ups=1.06, wpb=433866, bsz=16231.3, num_updates=49200, lr=0.000285133, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=48437 epoch 030: 296 / 1689 loss=3.955, nll_loss=2.311, ppl=4.96, wps=460529, ups=1.06, wpb=433866, bsz=16231.3, num_updates=49200, lr=0.000285133, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=48437 epoch 030: 296 / 1689 loss=3.955, nll_loss=2.311, ppl=4.96, wps=460529, ups=1.06, wpb=433866, bsz=16231.3, num_updates=49200, lr=0.000285133, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=48437 epoch 030: 296 / 1689 loss=3.955, nll_loss=2.311, ppl=4.96, wps=460529, ups=1.06, wpb=433866, bsz=16231.3, num_updates=49200, lr=0.000285133, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=48437 epoch 030: 296 / 1689 loss=3.955, nll_loss=2.311, ppl=4.96, wps=460529, ups=1.06, wpb=433866, bsz=16231.3, num_updates=49200, lr=0.000285133, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=48437 epoch 030: 296 / 1689 loss=3.955, nll_loss=2.311, ppl=4.96, wps=460529, ups=1.06, wpb=433866, bsz=16231.3, num_updates=49200, lr=0.000285133, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=48437 epoch 030: 296 / 1689 loss=3.955, nll_loss=2.311, ppl=4.96, wps=460529, ups=1.06, wpb=433866, bsz=16231.3, num_updates=49200, lr=0.000285133, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=48437 epoch 030: 296 / 1689 loss=3.955, nll_loss=2.311, ppl=4.96, wps=460529, ups=1.06, wpb=433866, bsz=16231.3, num_updates=49200, lr=0.000285133, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=48437 epoch 030: 296 / 1689 loss=3.955, nll_loss=2.311, ppl=4.96, wps=460529, ups=1.06, wpb=433866, bsz=16231.3, num_updates=49200, lr=0.000285133, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=48437 epoch 030: 296 / 1689 loss=3.955, nll_loss=2.311, ppl=4.96, wps=460529, ups=1.06, wpb=433866, bsz=16231.3, num_updates=49200, lr=0.000285133, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=48437 epoch 030: 296 / 1689 loss=3.955, nll_loss=2.311, ppl=4.96, wps=460529, ups=1.06, wpb=433866, bsz=16231.3, num_updates=49200, lr=0.000285133, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=48437 epoch 030: 296 / 1689 loss=3.955, nll_loss=2.311, ppl=4.96, wps=460529, ups=1.06, wpb=433866, bsz=16231.3, num_updates=49200, lr=0.000285133, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=48437 epoch 030: 296 / 1689 loss=3.955, nll_loss=2.311, ppl=4.96, wps=460529, ups=1.06, wpb=433866, bsz=16231.3, num_updates=49200, lr=0.000285133, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=48437 epoch 030: 296 / 1689 loss=3.955, nll_loss=2.311, ppl=4.96, wps=460529, ups=1.06, wpb=433866, bsz=16231.3, num_updates=49200, lr=0.000285133, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=48437 epoch 030: 296 / 1689 loss=3.955, nll_loss=2.311, ppl=4.96, wps=460529, ups=1.06, wpb=433866, bsz=16231.3, num_updates=49200, lr=0.000285133, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=48437 epoch 030: 296 / 1689 loss=3.955, nll_loss=2.311, ppl=4.96, wps=460529, ups=1.06, wpb=433866, bsz=16231.3, num_updates=49200, lr=0.000285133, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=48437 epoch 030: 296 / 1689 loss=3.955, nll_loss=2.311, ppl=4.96, wps=460529, ups=1.06, wpb=433866, bsz=16231.3, num_updates=49200, lr=0.000285133, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=48437 epoch 030: 296 / 1689 loss=3.955, nll_loss=2.311, ppl=4.96, wps=460529, ups=1.06, wpb=433866, bsz=16231.3, num_updates=49200, lr=0.000285133, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=48437 epoch 030: 296 / 1689 loss=3.955, nll_loss=2.311, ppl=4.96, wps=460529, ups=1.06, wpb=433866, bsz=16231.3, num_updates=49200, lr=0.000285133, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=48437 epoch 030: 296 / 1689 loss=3.955, nll_loss=2.311, ppl=4.96, wps=460529, ups=1.06, wpb=433866, bsz=16231.3, num_updates=49200, lr=0.000285133, gnorm=0.203, clip=0, loss_scale=1, train_wall=93, gb_free=18.6, wall=48437 epoch 030: 397 / 1689 loss=3.954, nll_loss=2.311, ppl=4.96, wps=456490, ups=1.05, wpb=433901, bsz=16330, num_updates=49300, lr=0.000284844, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=48532 epoch 030: 397 / 1689 loss=3.954, nll_loss=2.311, ppl=4.96, wps=456490, ups=1.05, wpb=433901, bsz=16330, num_updates=49300, lr=0.000284844, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=48532 epoch 030: 397 / 1689 loss=3.954, nll_loss=2.311, ppl=4.96, wps=456490, ups=1.05, wpb=433901, bsz=16330, num_updates=49300, lr=0.000284844, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=48532 epoch 030: 397 / 1689 loss=3.954, nll_loss=2.311, ppl=4.96, wps=456490, ups=1.05, wpb=433901, bsz=16330, num_updates=49300, lr=0.000284844, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=48532 epoch 030: 397 / 1689 loss=3.954, nll_loss=2.311, ppl=4.96, wps=456490, ups=1.05, wpb=433901, bsz=16330, num_updates=49300, lr=0.000284844, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=48532 epoch 030: 397 / 1689 loss=3.954, nll_loss=2.311, ppl=4.96, wps=456490, ups=1.05, wpb=433901, bsz=16330, num_updates=49300, lr=0.000284844, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=48532 epoch 030: 397 / 1689 loss=3.954, nll_loss=2.311, ppl=4.96, wps=456490, ups=1.05, wpb=433901, bsz=16330, num_updates=49300, lr=0.000284844, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=48532 epoch 030: 397 / 1689 loss=3.954, nll_loss=2.311, ppl=4.96, wps=456490, ups=1.05, wpb=433901, bsz=16330, num_updates=49300, lr=0.000284844, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=48532 epoch 030: 397 / 1689 loss=3.954, nll_loss=2.311, ppl=4.96, wps=456490, ups=1.05, wpb=433901, bsz=16330, num_updates=49300, lr=0.000284844, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=48532 epoch 030: 397 / 1689 loss=3.954, nll_loss=2.311, ppl=4.96, wps=456490, ups=1.05, wpb=433901, bsz=16330, num_updates=49300, lr=0.000284844, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=48532 epoch 030: 397 / 1689 loss=3.954, nll_loss=2.311, ppl=4.96, wps=456490, ups=1.05, wpb=433901, bsz=16330, num_updates=49300, lr=0.000284844, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=48532 epoch 030: 397 / 1689 loss=3.954, nll_loss=2.311, ppl=4.96, wps=456490, ups=1.05, wpb=433901, bsz=16330, num_updates=49300, lr=0.000284844, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=48532 epoch 030: 397 / 1689 loss=3.954, nll_loss=2.311, ppl=4.96, wps=456490, ups=1.05, wpb=433901, bsz=16330, num_updates=49300, lr=0.000284844, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=48532 epoch 030: 397 / 1689 loss=3.954, nll_loss=2.311, ppl=4.96, wps=456490, ups=1.05, wpb=433901, bsz=16330, num_updates=49300, lr=0.000284844, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=48532 epoch 030: 397 / 1689 loss=3.954, nll_loss=2.311, ppl=4.96, wps=456490, ups=1.05, wpb=433901, bsz=16330, num_updates=49300, lr=0.000284844, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=48532 epoch 030: 397 / 1689 loss=3.954, nll_loss=2.311, ppl=4.96, wps=456490, ups=1.05, wpb=433901, bsz=16330, num_updates=49300, lr=0.000284844, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=48532 epoch 030: 397 / 1689 loss=3.954, nll_loss=2.311, ppl=4.96, wps=456490, ups=1.05, wpb=433901, bsz=16330, num_updates=49300, lr=0.000284844, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=48532 epoch 030: 397 / 1689 loss=3.954, nll_loss=2.311, ppl=4.96, wps=456490, ups=1.05, wpb=433901, bsz=16330, num_updates=49300, lr=0.000284844, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=48532 epoch 030: 397 / 1689 loss=3.954, nll_loss=2.311, ppl=4.96, wps=456490, ups=1.05, wpb=433901, bsz=16330, num_updates=49300, lr=0.000284844, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=48532 epoch 030: 397 / 1689 loss=3.954, nll_loss=2.311, ppl=4.96, wps=456490, ups=1.05, wpb=433901, bsz=16330, num_updates=49300, lr=0.000284844, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=48532 epoch 030: 397 / 1689 loss=3.954, nll_loss=2.311, ppl=4.96, wps=456490, ups=1.05, wpb=433901, bsz=16330, num_updates=49300, lr=0.000284844, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=48532 epoch 030: 397 / 1689 loss=3.954, nll_loss=2.311, ppl=4.96, wps=456490, ups=1.05, wpb=433901, bsz=16330, num_updates=49300, lr=0.000284844, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=48532 epoch 030: 397 / 1689 loss=3.954, nll_loss=2.311, ppl=4.96, wps=456490, ups=1.05, wpb=433901, bsz=16330, num_updates=49300, lr=0.000284844, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=48532 epoch 030: 397 / 1689 loss=3.954, nll_loss=2.311, ppl=4.96, wps=456490, ups=1.05, wpb=433901, bsz=16330, num_updates=49300, lr=0.000284844, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=48532 epoch 030: 397 / 1689 loss=3.954, nll_loss=2.311, ppl=4.96, wps=456490, ups=1.05, wpb=433901, bsz=16330, num_updates=49300, lr=0.000284844, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=48532 epoch 030: 397 / 1689 loss=3.954, nll_loss=2.311, ppl=4.96, wps=456490, ups=1.05, wpb=433901, bsz=16330, num_updates=49300, lr=0.000284844, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=48532 epoch 030: 397 / 1689 loss=3.954, nll_loss=2.311, ppl=4.96, wps=456490, ups=1.05, wpb=433901, bsz=16330, num_updates=49300, lr=0.000284844, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=48532 epoch 030: 397 / 1689 loss=3.954, nll_loss=2.311, ppl=4.96, wps=456490, ups=1.05, wpb=433901, bsz=16330, num_updates=49300, lr=0.000284844, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=48532 epoch 030: 397 / 1689 loss=3.954, nll_loss=2.311, ppl=4.96, wps=456490, ups=1.05, wpb=433901, bsz=16330, num_updates=49300, lr=0.000284844, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=48532 epoch 030: 397 / 1689 loss=3.954, nll_loss=2.311, ppl=4.96, wps=456490, ups=1.05, wpb=433901, bsz=16330, num_updates=49300, lr=0.000284844, gnorm=0.214, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=48532 epoch 030: 497 / 1689 loss=3.951, nll_loss=2.308, ppl=4.95, wps=461406, ups=1.07, wpb=432169, bsz=16891.4, num_updates=49400, lr=0.000284555, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=48626 epoch 030: 497 / 1689 loss=3.951, nll_loss=2.308, ppl=4.95, wps=461406, ups=1.07, wpb=432169, bsz=16891.4, num_updates=49400, lr=0.000284555, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=48626 epoch 030: 497 / 1689 loss=3.951, nll_loss=2.308, ppl=4.95, wps=461406, ups=1.07, wpb=432169, bsz=16891.4, num_updates=49400, lr=0.000284555, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=48626 epoch 030: 497 / 1689 loss=3.951, nll_loss=2.308, ppl=4.95, wps=461406, ups=1.07, wpb=432169, bsz=16891.4, num_updates=49400, lr=0.000284555, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=48626 epoch 030: 497 / 1689 loss=3.951, nll_loss=2.308, ppl=4.95, wps=461406, ups=1.07, wpb=432169, bsz=16891.4, num_updates=49400, lr=0.000284555, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=48626 epoch 030: 497 / 1689 loss=3.951, nll_loss=2.308, ppl=4.95, wps=461406, ups=1.07, wpb=432169, bsz=16891.4, num_updates=49400, lr=0.000284555, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=48626 epoch 030: 497 / 1689 loss=3.951, nll_loss=2.308, ppl=4.95, wps=461406, ups=1.07, wpb=432169, bsz=16891.4, num_updates=49400, lr=0.000284555, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=48626 epoch 030: 497 / 1689 loss=3.951, nll_loss=2.308, ppl=4.95, wps=461406, ups=1.07, wpb=432169, bsz=16891.4, num_updates=49400, lr=0.000284555, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=48626 epoch 030: 497 / 1689 loss=3.951, nll_loss=2.308, ppl=4.95, wps=461406, ups=1.07, wpb=432169, bsz=16891.4, num_updates=49400, lr=0.000284555, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=48626 epoch 030: 497 / 1689 loss=3.951, nll_loss=2.308, ppl=4.95, wps=461406, ups=1.07, wpb=432169, bsz=16891.4, num_updates=49400, lr=0.000284555, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=48626 epoch 030: 497 / 1689 loss=3.951, nll_loss=2.308, ppl=4.95, wps=461406, ups=1.07, wpb=432169, bsz=16891.4, num_updates=49400, lr=0.000284555, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=48626 epoch 030: 497 / 1689 loss=3.951, nll_loss=2.308, ppl=4.95, wps=461406, ups=1.07, wpb=432169, bsz=16891.4, num_updates=49400, lr=0.000284555, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=48626 epoch 030: 497 / 1689 loss=3.951, nll_loss=2.308, ppl=4.95, wps=461406, ups=1.07, wpb=432169, bsz=16891.4, num_updates=49400, lr=0.000284555, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=48626 epoch 030: 497 / 1689 loss=3.951, nll_loss=2.308, ppl=4.95, wps=461406, ups=1.07, wpb=432169, bsz=16891.4, num_updates=49400, lr=0.000284555, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=48626 epoch 030: 497 / 1689 loss=3.951, nll_loss=2.308, ppl=4.95, wps=461406, ups=1.07, wpb=432169, bsz=16891.4, num_updates=49400, lr=0.000284555, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=48626 epoch 030: 497 / 1689 loss=3.951, nll_loss=2.308, ppl=4.95, wps=461406, ups=1.07, wpb=432169, bsz=16891.4, num_updates=49400, lr=0.000284555, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=48626 epoch 030: 497 / 1689 loss=3.951, nll_loss=2.308, ppl=4.95, wps=461406, ups=1.07, wpb=432169, bsz=16891.4, num_updates=49400, lr=0.000284555, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=48626 epoch 030: 497 / 1689 loss=3.951, nll_loss=2.308, ppl=4.95, wps=461406, ups=1.07, wpb=432169, bsz=16891.4, num_updates=49400, lr=0.000284555, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=48626 epoch 030: 497 / 1689 loss=3.951, nll_loss=2.308, ppl=4.95, wps=461406, ups=1.07, wpb=432169, bsz=16891.4, num_updates=49400, lr=0.000284555, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=48626 epoch 030: 497 / 1689 loss=3.951, nll_loss=2.308, ppl=4.95, wps=461406, ups=1.07, wpb=432169, bsz=16891.4, num_updates=49400, lr=0.000284555, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=48626 epoch 030: 497 / 1689 loss=3.951, nll_loss=2.308, ppl=4.95, wps=461406, ups=1.07, wpb=432169, bsz=16891.4, num_updates=49400, lr=0.000284555, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=48626 epoch 030: 497 / 1689 loss=3.951, nll_loss=2.308, ppl=4.95, wps=461406, ups=1.07, wpb=432169, bsz=16891.4, num_updates=49400, lr=0.000284555, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=48626 epoch 030: 497 / 1689 loss=3.951, nll_loss=2.308, ppl=4.95, wps=461406, ups=1.07, wpb=432169, bsz=16891.4, num_updates=49400, lr=0.000284555, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=48626 epoch 030: 497 / 1689 loss=3.951, nll_loss=2.308, ppl=4.95, wps=461406, ups=1.07, wpb=432169, bsz=16891.4, num_updates=49400, lr=0.000284555, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=48626 epoch 030: 497 / 1689 loss=3.951, nll_loss=2.308, ppl=4.95, wps=461406, ups=1.07, wpb=432169, bsz=16891.4, num_updates=49400, lr=0.000284555, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=48626 epoch 030: 497 / 1689 loss=3.951, nll_loss=2.308, ppl=4.95, wps=461406, ups=1.07, wpb=432169, bsz=16891.4, num_updates=49400, lr=0.000284555, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=48626 epoch 030: 497 / 1689 loss=3.951, nll_loss=2.308, ppl=4.95, wps=461406, ups=1.07, wpb=432169, bsz=16891.4, num_updates=49400, lr=0.000284555, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=48626 epoch 030: 497 / 1689 loss=3.951, nll_loss=2.308, ppl=4.95, wps=461406, ups=1.07, wpb=432169, bsz=16891.4, num_updates=49400, lr=0.000284555, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=48626 epoch 030: 497 / 1689 loss=3.951, nll_loss=2.308, ppl=4.95, wps=461406, ups=1.07, wpb=432169, bsz=16891.4, num_updates=49400, lr=0.000284555, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=48626 epoch 030: 497 / 1689 loss=3.951, nll_loss=2.308, ppl=4.95, wps=461406, ups=1.07, wpb=432169, bsz=16891.4, num_updates=49400, lr=0.000284555, gnorm=0.23, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=48626 epoch 030: 597 / 1689 loss=3.959, nll_loss=2.316, ppl=4.98, wps=457334, ups=1.06, wpb=432612, bsz=16532.5, num_updates=49500, lr=0.000284268, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=48720 epoch 030: 597 / 1689 loss=3.959, nll_loss=2.316, ppl=4.98, wps=457334, ups=1.06, wpb=432612, bsz=16532.5, num_updates=49500, lr=0.000284268, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=48720 epoch 030: 597 / 1689 loss=3.959, nll_loss=2.316, ppl=4.98, wps=457334, ups=1.06, wpb=432612, bsz=16532.5, num_updates=49500, lr=0.000284268, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=48720 epoch 030: 597 / 1689 loss=3.959, nll_loss=2.316, ppl=4.98, wps=457334, ups=1.06, wpb=432612, bsz=16532.5, num_updates=49500, lr=0.000284268, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=48720 epoch 030: 597 / 1689 loss=3.959, nll_loss=2.316, ppl=4.98, wps=457334, ups=1.06, wpb=432612, bsz=16532.5, num_updates=49500, lr=0.000284268, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=48720 epoch 030: 597 / 1689 loss=3.959, nll_loss=2.316, ppl=4.98, wps=457334, ups=1.06, wpb=432612, bsz=16532.5, num_updates=49500, lr=0.000284268, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=48720 epoch 030: 597 / 1689 loss=3.959, nll_loss=2.316, ppl=4.98, wps=457334, ups=1.06, wpb=432612, bsz=16532.5, num_updates=49500, lr=0.000284268, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=48720 epoch 030: 597 / 1689 loss=3.959, nll_loss=2.316, ppl=4.98, wps=457334, ups=1.06, wpb=432612, bsz=16532.5, num_updates=49500, lr=0.000284268, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=48720 epoch 030: 597 / 1689 loss=3.959, nll_loss=2.316, ppl=4.98, wps=457334, ups=1.06, wpb=432612, bsz=16532.5, num_updates=49500, lr=0.000284268, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=48720 epoch 030: 597 / 1689 loss=3.959, nll_loss=2.316, ppl=4.98, wps=457334, ups=1.06, wpb=432612, bsz=16532.5, num_updates=49500, lr=0.000284268, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=48720 epoch 030: 597 / 1689 loss=3.959, nll_loss=2.316, ppl=4.98, wps=457334, ups=1.06, wpb=432612, bsz=16532.5, num_updates=49500, lr=0.000284268, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=48720 epoch 030: 597 / 1689 loss=3.959, nll_loss=2.316, ppl=4.98, wps=457334, ups=1.06, wpb=432612, bsz=16532.5, num_updates=49500, lr=0.000284268, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=48720 epoch 030: 597 / 1689 loss=3.959, nll_loss=2.316, ppl=4.98, wps=457334, ups=1.06, wpb=432612, bsz=16532.5, num_updates=49500, lr=0.000284268, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=48720 epoch 030: 597 / 1689 loss=3.959, nll_loss=2.316, ppl=4.98, wps=457334, ups=1.06, wpb=432612, bsz=16532.5, num_updates=49500, lr=0.000284268, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=48720 epoch 030: 597 / 1689 loss=3.959, nll_loss=2.316, ppl=4.98, wps=457334, ups=1.06, wpb=432612, bsz=16532.5, num_updates=49500, lr=0.000284268, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=48720 epoch 030: 597 / 1689 loss=3.959, nll_loss=2.316, ppl=4.98, wps=457334, ups=1.06, wpb=432612, bsz=16532.5, num_updates=49500, lr=0.000284268, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=48720 epoch 030: 597 / 1689 loss=3.959, nll_loss=2.316, ppl=4.98, wps=457334, ups=1.06, wpb=432612, bsz=16532.5, num_updates=49500, lr=0.000284268, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=48720 epoch 030: 597 / 1689 loss=3.959, nll_loss=2.316, ppl=4.98, wps=457334, ups=1.06, wpb=432612, bsz=16532.5, num_updates=49500, lr=0.000284268, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=48720 epoch 030: 597 / 1689 loss=3.959, nll_loss=2.316, ppl=4.98, wps=457334, ups=1.06, wpb=432612, bsz=16532.5, num_updates=49500, lr=0.000284268, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=48720 epoch 030: 597 / 1689 loss=3.959, nll_loss=2.316, ppl=4.98, wps=457334, ups=1.06, wpb=432612, bsz=16532.5, num_updates=49500, lr=0.000284268, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=48720 epoch 030: 597 / 1689 loss=3.959, nll_loss=2.316, ppl=4.98, wps=457334, ups=1.06, wpb=432612, bsz=16532.5, num_updates=49500, lr=0.000284268, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=48720 epoch 030: 597 / 1689 loss=3.959, nll_loss=2.316, ppl=4.98, wps=457334, ups=1.06, wpb=432612, bsz=16532.5, num_updates=49500, lr=0.000284268, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=48720 epoch 030: 597 / 1689 loss=3.959, nll_loss=2.316, ppl=4.98, wps=457334, ups=1.06, wpb=432612, bsz=16532.5, num_updates=49500, lr=0.000284268, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=48720 epoch 030: 597 / 1689 loss=3.959, nll_loss=2.316, ppl=4.98, wps=457334, ups=1.06, wpb=432612, bsz=16532.5, num_updates=49500, lr=0.000284268, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=48720 epoch 030: 597 / 1689 loss=3.959, nll_loss=2.316, ppl=4.98, wps=457334, ups=1.06, wpb=432612, bsz=16532.5, num_updates=49500, lr=0.000284268, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=48720 epoch 030: 597 / 1689 loss=3.959, nll_loss=2.316, ppl=4.98, wps=457334, ups=1.06, wpb=432612, bsz=16532.5, num_updates=49500, lr=0.000284268, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=48720 epoch 030: 597 / 1689 loss=3.959, nll_loss=2.316, ppl=4.98, wps=457334, ups=1.06, wpb=432612, bsz=16532.5, num_updates=49500, lr=0.000284268, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=48720 epoch 030: 597 / 1689 loss=3.959, nll_loss=2.316, ppl=4.98, wps=457334, ups=1.06, wpb=432612, bsz=16532.5, num_updates=49500, lr=0.000284268, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=48720 epoch 030: 597 / 1689 loss=3.959, nll_loss=2.316, ppl=4.98, wps=457334, ups=1.06, wpb=432612, bsz=16532.5, num_updates=49500, lr=0.000284268, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=48720 epoch 030: 597 / 1689 loss=3.959, nll_loss=2.316, ppl=4.98, wps=457334, ups=1.06, wpb=432612, bsz=16532.5, num_updates=49500, lr=0.000284268, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=48720 epoch 030: 697 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=460104, ups=1.06, wpb=433226, bsz=16327.3, num_updates=49600, lr=0.000283981, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=48814 epoch 030: 697 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=460104, ups=1.06, wpb=433226, bsz=16327.3, num_updates=49600, lr=0.000283981, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=48814 epoch 030: 697 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=460104, ups=1.06, wpb=433226, bsz=16327.3, num_updates=49600, lr=0.000283981, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=48814 epoch 030: 697 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=460104, ups=1.06, wpb=433226, bsz=16327.3, num_updates=49600, lr=0.000283981, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=48814 epoch 030: 697 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=460104, ups=1.06, wpb=433226, bsz=16327.3, num_updates=49600, lr=0.000283981, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=48814 epoch 030: 697 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=460104, ups=1.06, wpb=433226, bsz=16327.3, num_updates=49600, lr=0.000283981, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=48814 epoch 030: 697 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=460104, ups=1.06, wpb=433226, bsz=16327.3, num_updates=49600, lr=0.000283981, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=48814 epoch 030: 697 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=460104, ups=1.06, wpb=433226, bsz=16327.3, num_updates=49600, lr=0.000283981, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=48814 epoch 030: 697 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=460104, ups=1.06, wpb=433226, bsz=16327.3, num_updates=49600, lr=0.000283981, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=48814 epoch 030: 697 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=460104, ups=1.06, wpb=433226, bsz=16327.3, num_updates=49600, lr=0.000283981, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=48814 epoch 030: 697 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=460104, ups=1.06, wpb=433226, bsz=16327.3, num_updates=49600, lr=0.000283981, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=48814 epoch 030: 697 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=460104, ups=1.06, wpb=433226, bsz=16327.3, num_updates=49600, lr=0.000283981, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=48814 epoch 030: 697 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=460104, ups=1.06, wpb=433226, bsz=16327.3, num_updates=49600, lr=0.000283981, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=48814 epoch 030: 697 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=460104, ups=1.06, wpb=433226, bsz=16327.3, num_updates=49600, lr=0.000283981, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=48814 epoch 030: 697 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=460104, ups=1.06, wpb=433226, bsz=16327.3, num_updates=49600, lr=0.000283981, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=48814 epoch 030: 697 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=460104, ups=1.06, wpb=433226, bsz=16327.3, num_updates=49600, lr=0.000283981, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=48814 epoch 030: 697 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=460104, ups=1.06, wpb=433226, bsz=16327.3, num_updates=49600, lr=0.000283981, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=48814 epoch 030: 697 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=460104, ups=1.06, wpb=433226, bsz=16327.3, num_updates=49600, lr=0.000283981, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=48814 epoch 030: 697 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=460104, ups=1.06, wpb=433226, bsz=16327.3, num_updates=49600, lr=0.000283981, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=48814 epoch 030: 697 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=460104, ups=1.06, wpb=433226, bsz=16327.3, num_updates=49600, lr=0.000283981, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=48814 epoch 030: 697 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=460104, ups=1.06, wpb=433226, bsz=16327.3, num_updates=49600, lr=0.000283981, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=48814 epoch 030: 697 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=460104, ups=1.06, wpb=433226, bsz=16327.3, num_updates=49600, lr=0.000283981, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=48814 epoch 030: 697 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=460104, ups=1.06, wpb=433226, bsz=16327.3, num_updates=49600, lr=0.000283981, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=48814 epoch 030: 697 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=460104, ups=1.06, wpb=433226, bsz=16327.3, num_updates=49600, lr=0.000283981, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=48814 epoch 030: 697 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=460104, ups=1.06, wpb=433226, bsz=16327.3, num_updates=49600, lr=0.000283981, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=48814 epoch 030: 697 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=460104, ups=1.06, wpb=433226, bsz=16327.3, num_updates=49600, lr=0.000283981, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=48814 epoch 030: 697 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=460104, ups=1.06, wpb=433226, bsz=16327.3, num_updates=49600, lr=0.000283981, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=48814 epoch 030: 697 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=460104, ups=1.06, wpb=433226, bsz=16327.3, num_updates=49600, lr=0.000283981, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=48814 epoch 030: 697 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=460104, ups=1.06, wpb=433226, bsz=16327.3, num_updates=49600, lr=0.000283981, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=48814 epoch 030: 697 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=460104, ups=1.06, wpb=433226, bsz=16327.3, num_updates=49600, lr=0.000283981, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=48814 epoch 030: 797 / 1689 loss=3.96, nll_loss=2.318, ppl=4.98, wps=464993, ups=1.07, wpb=435217, bsz=16362.6, num_updates=49700, lr=0.000283695, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=48908 epoch 030: 797 / 1689 loss=3.96, nll_loss=2.318, ppl=4.98, wps=464993, ups=1.07, wpb=435217, bsz=16362.6, num_updates=49700, lr=0.000283695, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=48908 epoch 030: 797 / 1689 loss=3.96, nll_loss=2.318, ppl=4.98, wps=464993, ups=1.07, wpb=435217, bsz=16362.6, num_updates=49700, lr=0.000283695, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=48908 epoch 030: 797 / 1689 loss=3.96, nll_loss=2.318, ppl=4.98, wps=464993, ups=1.07, wpb=435217, bsz=16362.6, num_updates=49700, lr=0.000283695, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=48908 epoch 030: 797 / 1689 loss=3.96, nll_loss=2.318, ppl=4.98, wps=464993, ups=1.07, wpb=435217, bsz=16362.6, num_updates=49700, lr=0.000283695, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=48908 epoch 030: 797 / 1689 loss=3.96, nll_loss=2.318, ppl=4.98, wps=464993, ups=1.07, wpb=435217, bsz=16362.6, num_updates=49700, lr=0.000283695, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=48908 epoch 030: 797 / 1689 loss=3.96, nll_loss=2.318, ppl=4.98, wps=464993, ups=1.07, wpb=435217, bsz=16362.6, num_updates=49700, lr=0.000283695, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=48908 epoch 030: 797 / 1689 loss=3.96, nll_loss=2.318, ppl=4.98, wps=464993, ups=1.07, wpb=435217, bsz=16362.6, num_updates=49700, lr=0.000283695, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=48908 epoch 030: 797 / 1689 loss=3.96, nll_loss=2.318, ppl=4.98, wps=464993, ups=1.07, wpb=435217, bsz=16362.6, num_updates=49700, lr=0.000283695, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=48908 epoch 030: 797 / 1689 loss=3.96, nll_loss=2.318, ppl=4.98, wps=464993, ups=1.07, wpb=435217, bsz=16362.6, num_updates=49700, lr=0.000283695, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=48908 epoch 030: 797 / 1689 loss=3.96, nll_loss=2.318, ppl=4.98, wps=464993, ups=1.07, wpb=435217, bsz=16362.6, num_updates=49700, lr=0.000283695, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=48908 epoch 030: 797 / 1689 loss=3.96, nll_loss=2.318, ppl=4.98, wps=464993, ups=1.07, wpb=435217, bsz=16362.6, num_updates=49700, lr=0.000283695, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=48908 epoch 030: 797 / 1689 loss=3.96, nll_loss=2.318, ppl=4.98, wps=464993, ups=1.07, wpb=435217, bsz=16362.6, num_updates=49700, lr=0.000283695, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=48908 epoch 030: 797 / 1689 loss=3.96, nll_loss=2.318, ppl=4.98, wps=464993, ups=1.07, wpb=435217, bsz=16362.6, num_updates=49700, lr=0.000283695, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=48908 epoch 030: 797 / 1689 loss=3.96, nll_loss=2.318, ppl=4.98, wps=464993, ups=1.07, wpb=435217, bsz=16362.6, num_updates=49700, lr=0.000283695, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=48908 epoch 030: 797 / 1689 loss=3.96, nll_loss=2.318, ppl=4.98, wps=464993, ups=1.07, wpb=435217, bsz=16362.6, num_updates=49700, lr=0.000283695, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=48908 epoch 030: 797 / 1689 loss=3.96, nll_loss=2.318, ppl=4.98, wps=464993, ups=1.07, wpb=435217, bsz=16362.6, num_updates=49700, lr=0.000283695, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=48908 epoch 030: 797 / 1689 loss=3.96, nll_loss=2.318, ppl=4.98, wps=464993, ups=1.07, wpb=435217, bsz=16362.6, num_updates=49700, lr=0.000283695, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=48908 epoch 030: 797 / 1689 loss=3.96, nll_loss=2.318, ppl=4.98, wps=464993, ups=1.07, wpb=435217, bsz=16362.6, num_updates=49700, lr=0.000283695, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=48908 epoch 030: 797 / 1689 loss=3.96, nll_loss=2.318, ppl=4.98, wps=464993, ups=1.07, wpb=435217, bsz=16362.6, num_updates=49700, lr=0.000283695, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=48908 epoch 030: 797 / 1689 loss=3.96, nll_loss=2.318, ppl=4.98, wps=464993, ups=1.07, wpb=435217, bsz=16362.6, num_updates=49700, lr=0.000283695, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=48908 epoch 030: 797 / 1689 loss=3.96, nll_loss=2.318, ppl=4.98, wps=464993, ups=1.07, wpb=435217, bsz=16362.6, num_updates=49700, lr=0.000283695, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=48908 epoch 030: 797 / 1689 loss=3.96, nll_loss=2.318, ppl=4.98, wps=464993, ups=1.07, wpb=435217, bsz=16362.6, num_updates=49700, lr=0.000283695, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=48908 epoch 030: 797 / 1689 loss=3.96, nll_loss=2.318, ppl=4.98, wps=464993, ups=1.07, wpb=435217, bsz=16362.6, num_updates=49700, lr=0.000283695, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=48908 epoch 030: 797 / 1689 loss=3.96, nll_loss=2.318, ppl=4.98, wps=464993, ups=1.07, wpb=435217, bsz=16362.6, num_updates=49700, lr=0.000283695, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=48908 epoch 030: 797 / 1689 loss=3.96, nll_loss=2.318, ppl=4.98, wps=464993, ups=1.07, wpb=435217, bsz=16362.6, num_updates=49700, lr=0.000283695, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=48908 epoch 030: 797 / 1689 loss=3.96, nll_loss=2.318, ppl=4.98, wps=464993, ups=1.07, wpb=435217, bsz=16362.6, num_updates=49700, lr=0.000283695, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=48908 epoch 030: 797 / 1689 loss=3.96, nll_loss=2.318, ppl=4.98, wps=464993, ups=1.07, wpb=435217, bsz=16362.6, num_updates=49700, lr=0.000283695, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=48908 epoch 030: 797 / 1689 loss=3.96, nll_loss=2.318, ppl=4.98, wps=464993, ups=1.07, wpb=435217, bsz=16362.6, num_updates=49700, lr=0.000283695, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=48908 epoch 030: 797 / 1689 loss=3.96, nll_loss=2.318, ppl=4.98, wps=464993, ups=1.07, wpb=435217, bsz=16362.6, num_updates=49700, lr=0.000283695, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=48908 epoch 030: 898 / 1689 loss=3.974, nll_loss=2.334, ppl=5.04, wps=458550, ups=1.05, wpb=435418, bsz=16369.6, num_updates=49800, lr=0.00028341, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=49003 epoch 030: 898 / 1689 loss=3.974, nll_loss=2.334, ppl=5.04, wps=458550, ups=1.05, wpb=435418, bsz=16369.6, num_updates=49800, lr=0.00028341, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=49003 epoch 030: 898 / 1689 loss=3.974, nll_loss=2.334, ppl=5.04, wps=458550, ups=1.05, wpb=435418, bsz=16369.6, num_updates=49800, lr=0.00028341, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=49003 epoch 030: 898 / 1689 loss=3.974, nll_loss=2.334, ppl=5.04, wps=458550, ups=1.05, wpb=435418, bsz=16369.6, num_updates=49800, lr=0.00028341, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=49003 epoch 030: 898 / 1689 loss=3.974, nll_loss=2.334, ppl=5.04, wps=458550, ups=1.05, wpb=435418, bsz=16369.6, num_updates=49800, lr=0.00028341, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=49003 epoch 030: 898 / 1689 loss=3.974, nll_loss=2.334, ppl=5.04, wps=458550, ups=1.05, wpb=435418, bsz=16369.6, num_updates=49800, lr=0.00028341, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=49003 epoch 030: 898 / 1689 loss=3.974, nll_loss=2.334, ppl=5.04, wps=458550, ups=1.05, wpb=435418, bsz=16369.6, num_updates=49800, lr=0.00028341, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=49003 epoch 030: 898 / 1689 loss=3.974, nll_loss=2.334, ppl=5.04, wps=458550, ups=1.05, wpb=435418, bsz=16369.6, num_updates=49800, lr=0.00028341, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=49003 epoch 030: 898 / 1689 loss=3.974, nll_loss=2.334, ppl=5.04, wps=458550, ups=1.05, wpb=435418, bsz=16369.6, num_updates=49800, lr=0.00028341, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=49003 epoch 030: 898 / 1689 loss=3.974, nll_loss=2.334, ppl=5.04, wps=458550, ups=1.05, wpb=435418, bsz=16369.6, num_updates=49800, lr=0.00028341, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=49003 epoch 030: 898 / 1689 loss=3.974, nll_loss=2.334, ppl=5.04, wps=458550, ups=1.05, wpb=435418, bsz=16369.6, num_updates=49800, lr=0.00028341, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=49003 epoch 030: 898 / 1689 loss=3.974, nll_loss=2.334, ppl=5.04, wps=458550, ups=1.05, wpb=435418, bsz=16369.6, num_updates=49800, lr=0.00028341, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=49003 epoch 030: 898 / 1689 loss=3.974, nll_loss=2.334, ppl=5.04, wps=458550, ups=1.05, wpb=435418, bsz=16369.6, num_updates=49800, lr=0.00028341, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=49003 epoch 030: 898 / 1689 loss=3.974, nll_loss=2.334, ppl=5.04, wps=458550, ups=1.05, wpb=435418, bsz=16369.6, num_updates=49800, lr=0.00028341, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=49003 epoch 030: 898 / 1689 loss=3.974, nll_loss=2.334, ppl=5.04, wps=458550, ups=1.05, wpb=435418, bsz=16369.6, num_updates=49800, lr=0.00028341, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=49003 epoch 030: 898 / 1689 loss=3.974, nll_loss=2.334, ppl=5.04, wps=458550, ups=1.05, wpb=435418, bsz=16369.6, num_updates=49800, lr=0.00028341, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=49003 epoch 030: 898 / 1689 loss=3.974, nll_loss=2.334, ppl=5.04, wps=458550, ups=1.05, wpb=435418, bsz=16369.6, num_updates=49800, lr=0.00028341, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=49003 epoch 030: 898 / 1689 loss=3.974, nll_loss=2.334, ppl=5.04, wps=458550, ups=1.05, wpb=435418, bsz=16369.6, num_updates=49800, lr=0.00028341, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=49003 epoch 030: 898 / 1689 loss=3.974, nll_loss=2.334, ppl=5.04, wps=458550, ups=1.05, wpb=435418, bsz=16369.6, num_updates=49800, lr=0.00028341, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=49003 epoch 030: 898 / 1689 loss=3.974, nll_loss=2.334, ppl=5.04, wps=458550, ups=1.05, wpb=435418, bsz=16369.6, num_updates=49800, lr=0.00028341, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=49003 epoch 030: 898 / 1689 loss=3.974, nll_loss=2.334, ppl=5.04, wps=458550, ups=1.05, wpb=435418, bsz=16369.6, num_updates=49800, lr=0.00028341, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=49003 epoch 030: 898 / 1689 loss=3.974, nll_loss=2.334, ppl=5.04, wps=458550, ups=1.05, wpb=435418, bsz=16369.6, num_updates=49800, lr=0.00028341, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=49003 epoch 030: 898 / 1689 loss=3.974, nll_loss=2.334, ppl=5.04, wps=458550, ups=1.05, wpb=435418, bsz=16369.6, num_updates=49800, lr=0.00028341, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=49003 epoch 030: 898 / 1689 loss=3.974, nll_loss=2.334, ppl=5.04, wps=458550, ups=1.05, wpb=435418, bsz=16369.6, num_updates=49800, lr=0.00028341, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=49003 epoch 030: 898 / 1689 loss=3.974, nll_loss=2.334, ppl=5.04, wps=458550, ups=1.05, wpb=435418, bsz=16369.6, num_updates=49800, lr=0.00028341, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=49003 epoch 030: 898 / 1689 loss=3.974, nll_loss=2.334, ppl=5.04, wps=458550, ups=1.05, wpb=435418, bsz=16369.6, num_updates=49800, lr=0.00028341, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=49003 epoch 030: 898 / 1689 loss=3.974, nll_loss=2.334, ppl=5.04, wps=458550, ups=1.05, wpb=435418, bsz=16369.6, num_updates=49800, lr=0.00028341, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=49003 epoch 030: 898 / 1689 loss=3.974, nll_loss=2.334, ppl=5.04, wps=458550, ups=1.05, wpb=435418, bsz=16369.6, num_updates=49800, lr=0.00028341, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=49003 epoch 030: 898 / 1689 loss=3.974, nll_loss=2.334, ppl=5.04, wps=458550, ups=1.05, wpb=435418, bsz=16369.6, num_updates=49800, lr=0.00028341, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=49003 epoch 030: 898 / 1689 loss=3.974, nll_loss=2.334, ppl=5.04, wps=458550, ups=1.05, wpb=435418, bsz=16369.6, num_updates=49800, lr=0.00028341, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=49003 epoch 030: 998 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=461931, ups=1.07, wpb=432720, bsz=16584.4, num_updates=49900, lr=0.000283126, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=49097 epoch 030: 998 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=461931, ups=1.07, wpb=432720, bsz=16584.4, num_updates=49900, lr=0.000283126, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=49097 epoch 030: 998 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=461931, ups=1.07, wpb=432720, bsz=16584.4, num_updates=49900, lr=0.000283126, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=49097 epoch 030: 998 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=461931, ups=1.07, wpb=432720, bsz=16584.4, num_updates=49900, lr=0.000283126, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=49097 epoch 030: 998 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=461931, ups=1.07, wpb=432720, bsz=16584.4, num_updates=49900, lr=0.000283126, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=49097 epoch 030: 998 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=461931, ups=1.07, wpb=432720, bsz=16584.4, num_updates=49900, lr=0.000283126, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=49097 epoch 030: 998 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=461931, ups=1.07, wpb=432720, bsz=16584.4, num_updates=49900, lr=0.000283126, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=49097 epoch 030: 998 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=461931, ups=1.07, wpb=432720, bsz=16584.4, num_updates=49900, lr=0.000283126, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=49097 epoch 030: 998 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=461931, ups=1.07, wpb=432720, bsz=16584.4, num_updates=49900, lr=0.000283126, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=49097 epoch 030: 998 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=461931, ups=1.07, wpb=432720, bsz=16584.4, num_updates=49900, lr=0.000283126, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=49097 epoch 030: 998 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=461931, ups=1.07, wpb=432720, bsz=16584.4, num_updates=49900, lr=0.000283126, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=49097 epoch 030: 998 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=461931, ups=1.07, wpb=432720, bsz=16584.4, num_updates=49900, lr=0.000283126, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=49097 epoch 030: 998 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=461931, ups=1.07, wpb=432720, bsz=16584.4, num_updates=49900, lr=0.000283126, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=49097 epoch 030: 998 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=461931, ups=1.07, wpb=432720, bsz=16584.4, num_updates=49900, lr=0.000283126, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=49097 epoch 030: 998 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=461931, ups=1.07, wpb=432720, bsz=16584.4, num_updates=49900, lr=0.000283126, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=49097 epoch 030: 998 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=461931, ups=1.07, wpb=432720, bsz=16584.4, num_updates=49900, lr=0.000283126, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=49097 epoch 030: 998 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=461931, ups=1.07, wpb=432720, bsz=16584.4, num_updates=49900, lr=0.000283126, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=49097 epoch 030: 998 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=461931, ups=1.07, wpb=432720, bsz=16584.4, num_updates=49900, lr=0.000283126, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=49097 epoch 030: 998 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=461931, ups=1.07, wpb=432720, bsz=16584.4, num_updates=49900, lr=0.000283126, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=49097 epoch 030: 998 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=461931, ups=1.07, wpb=432720, bsz=16584.4, num_updates=49900, lr=0.000283126, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=49097 epoch 030: 998 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=461931, ups=1.07, wpb=432720, bsz=16584.4, num_updates=49900, lr=0.000283126, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=49097 epoch 030: 998 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=461931, ups=1.07, wpb=432720, bsz=16584.4, num_updates=49900, lr=0.000283126, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=49097 epoch 030: 998 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=461931, ups=1.07, wpb=432720, bsz=16584.4, num_updates=49900, lr=0.000283126, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=49097 epoch 030: 998 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=461931, ups=1.07, wpb=432720, bsz=16584.4, num_updates=49900, lr=0.000283126, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=49097 epoch 030: 998 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=461931, ups=1.07, wpb=432720, bsz=16584.4, num_updates=49900, lr=0.000283126, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=49097 epoch 030: 998 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=461931, ups=1.07, wpb=432720, bsz=16584.4, num_updates=49900, lr=0.000283126, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=49097 epoch 030: 998 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=461931, ups=1.07, wpb=432720, bsz=16584.4, num_updates=49900, lr=0.000283126, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=49097 epoch 030: 998 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=461931, ups=1.07, wpb=432720, bsz=16584.4, num_updates=49900, lr=0.000283126, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=49097 epoch 030: 998 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=461931, ups=1.07, wpb=432720, bsz=16584.4, num_updates=49900, lr=0.000283126, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=49097 epoch 030: 998 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=461931, ups=1.07, wpb=432720, bsz=16584.4, num_updates=49900, lr=0.000283126, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=49097 epoch 030: 1098 / 1689 loss=3.959, nll_loss=2.318, ppl=4.98, wps=460620, ups=1.07, wpb=431590, bsz=16614.4, num_updates=50000, lr=0.000282843, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.1, wall=49190 epoch 030: 1098 / 1689 loss=3.959, nll_loss=2.318, ppl=4.98, wps=460620, ups=1.07, wpb=431590, bsz=16614.4, num_updates=50000, lr=0.000282843, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.1, wall=49190 epoch 030: 1098 / 1689 loss=3.959, nll_loss=2.318, ppl=4.98, wps=460620, ups=1.07, wpb=431590, bsz=16614.4, num_updates=50000, lr=0.000282843, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.1, wall=49190 epoch 030: 1098 / 1689 loss=3.959, nll_loss=2.318, ppl=4.98, wps=460620, ups=1.07, wpb=431590, bsz=16614.4, num_updates=50000, lr=0.000282843, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.1, wall=49190 epoch 030: 1098 / 1689 loss=3.959, nll_loss=2.318, ppl=4.98, wps=460620, ups=1.07, wpb=431590, bsz=16614.4, num_updates=50000, lr=0.000282843, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.1, wall=49190 epoch 030: 1098 / 1689 loss=3.959, nll_loss=2.318, ppl=4.98, wps=460620, ups=1.07, wpb=431590, bsz=16614.4, num_updates=50000, lr=0.000282843, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.1, wall=49190 epoch 030: 1098 / 1689 loss=3.959, nll_loss=2.318, ppl=4.98, wps=460620, ups=1.07, wpb=431590, bsz=16614.4, num_updates=50000, lr=0.000282843, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.1, wall=49190 epoch 030: 1098 / 1689 loss=3.959, nll_loss=2.318, ppl=4.98, wps=460620, ups=1.07, wpb=431590, bsz=16614.4, num_updates=50000, lr=0.000282843, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.1, wall=49190 epoch 030: 1098 / 1689 loss=3.959, nll_loss=2.318, ppl=4.98, wps=460620, ups=1.07, wpb=431590, bsz=16614.4, num_updates=50000, lr=0.000282843, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.1, wall=49190 epoch 030: 1098 / 1689 loss=3.959, nll_loss=2.318, ppl=4.98, wps=460620, ups=1.07, wpb=431590, bsz=16614.4, num_updates=50000, lr=0.000282843, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.1, wall=49190 epoch 030: 1098 / 1689 loss=3.959, nll_loss=2.318, ppl=4.98, wps=460620, ups=1.07, wpb=431590, bsz=16614.4, num_updates=50000, lr=0.000282843, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.1, wall=49190 epoch 030: 1098 / 1689 loss=3.959, nll_loss=2.318, ppl=4.98, wps=460620, ups=1.07, wpb=431590, bsz=16614.4, num_updates=50000, lr=0.000282843, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.1, wall=49190 epoch 030: 1098 / 1689 loss=3.959, nll_loss=2.318, ppl=4.98, wps=460620, ups=1.07, wpb=431590, bsz=16614.4, num_updates=50000, lr=0.000282843, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.1, wall=49190 epoch 030: 1098 / 1689 loss=3.959, nll_loss=2.318, ppl=4.98, wps=460620, ups=1.07, wpb=431590, bsz=16614.4, num_updates=50000, lr=0.000282843, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.1, wall=49190 epoch 030: 1098 / 1689 loss=3.959, nll_loss=2.318, ppl=4.98, wps=460620, ups=1.07, wpb=431590, bsz=16614.4, num_updates=50000, lr=0.000282843, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.1, wall=49190 epoch 030: 1098 / 1689 loss=3.959, nll_loss=2.318, ppl=4.98, wps=460620, ups=1.07, wpb=431590, bsz=16614.4, num_updates=50000, lr=0.000282843, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.1, wall=49190 epoch 030: 1098 / 1689 loss=3.959, nll_loss=2.318, ppl=4.98, wps=460620, ups=1.07, wpb=431590, bsz=16614.4, num_updates=50000, lr=0.000282843, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.1, wall=49190 epoch 030: 1098 / 1689 loss=3.959, nll_loss=2.318, ppl=4.98, wps=460620, ups=1.07, wpb=431590, bsz=16614.4, num_updates=50000, lr=0.000282843, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.1, wall=49190 epoch 030: 1098 / 1689 loss=3.959, nll_loss=2.318, ppl=4.98, wps=460620, ups=1.07, wpb=431590, bsz=16614.4, num_updates=50000, lr=0.000282843, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.1, wall=49190 epoch 030: 1098 / 1689 loss=3.959, nll_loss=2.318, ppl=4.98, wps=460620, ups=1.07, wpb=431590, bsz=16614.4, num_updates=50000, lr=0.000282843, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.1, wall=49190 epoch 030: 1098 / 1689 loss=3.959, nll_loss=2.318, ppl=4.98, wps=460620, ups=1.07, wpb=431590, bsz=16614.4, num_updates=50000, lr=0.000282843, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.1, wall=49190 epoch 030: 1098 / 1689 loss=3.959, nll_loss=2.318, ppl=4.98, wps=460620, ups=1.07, wpb=431590, bsz=16614.4, num_updates=50000, lr=0.000282843, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.1, wall=49190 epoch 030: 1098 / 1689 loss=3.959, nll_loss=2.318, ppl=4.98, wps=460620, ups=1.07, wpb=431590, bsz=16614.4, num_updates=50000, lr=0.000282843, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.1, wall=49190 epoch 030: 1098 / 1689 loss=3.959, nll_loss=2.318, ppl=4.98, wps=460620, ups=1.07, wpb=431590, bsz=16614.4, num_updates=50000, lr=0.000282843, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.1, wall=49190 epoch 030: 1098 / 1689 loss=3.959, nll_loss=2.318, ppl=4.98, wps=460620, ups=1.07, wpb=431590, bsz=16614.4, num_updates=50000, lr=0.000282843, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.1, wall=49190 epoch 030: 1098 / 1689 loss=3.959, nll_loss=2.318, ppl=4.98, wps=460620, ups=1.07, wpb=431590, bsz=16614.4, num_updates=50000, lr=0.000282843, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.1, wall=49190 epoch 030: 1098 / 1689 loss=3.959, nll_loss=2.318, ppl=4.98, wps=460620, ups=1.07, wpb=431590, bsz=16614.4, num_updates=50000, lr=0.000282843, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.1, wall=49190 epoch 030: 1098 / 1689 loss=3.959, nll_loss=2.318, ppl=4.98, wps=460620, ups=1.07, wpb=431590, bsz=16614.4, num_updates=50000, lr=0.000282843, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.1, wall=49190 epoch 030: 1098 / 1689 loss=3.959, nll_loss=2.318, ppl=4.98, wps=460620, ups=1.07, wpb=431590, bsz=16614.4, num_updates=50000, lr=0.000282843, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.1, wall=49190 epoch 030: 1098 / 1689 loss=3.959, nll_loss=2.318, ppl=4.98, wps=460620, ups=1.07, wpb=431590, bsz=16614.4, num_updates=50000, lr=0.000282843, gnorm=0.202, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.1, wall=49190 begin validation on "valid" subset epoch 030 | valid on 'valid' subset | loss 4.243 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.238 epoch 030 | valid on 'valid' subset | loss 4.243 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.238 epoch 030 | valid on 'valid' subset | loss 4.243 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.238 epoch 030 | valid on 'valid' subset | loss 4.243 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.238 epoch 030 | valid on 'valid' subset | loss 4.243 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.238 epoch 030 | valid on 'valid' subset | loss 4.243 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.238 epoch 030 | valid on 'valid' subset | loss 4.243 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.238 epoch 030 | valid on 'valid' subset | loss 4.243 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.238 epoch 030 | valid on 'valid' subset | loss 4.243 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.238 epoch 030 | valid on 'valid' subset | loss 4.243 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.238 epoch 030 | valid on 'valid' subset | loss 4.243 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.238 epoch 030 | valid on 'valid' subset | loss 4.243 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.238 epoch 030 | valid on 'valid' subset | loss 4.243 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.238 epoch 030 | valid on 'valid' subset | loss 4.243 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.238 epoch 030 | valid on 'valid' subset | loss 4.243 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.238 epoch 030 | valid on 'valid' subset | loss 4.243 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.238 epoch 030 | valid on 'valid' subset | loss 4.243 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.238 epoch 030 | valid on 'valid' subset | loss 4.243 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.238 epoch 030 | valid on 'valid' subset | loss 4.243 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.238 epoch 030 | valid on 'valid' subset | loss 4.243 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.238 epoch 030 | valid on 'valid' subset | loss 4.243 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.238 epoch 030 | valid on 'valid' subset | loss 4.243 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.238 epoch 030 | valid on 'valid' subset | loss 4.243 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.238 epoch 030 | valid on 'valid' subset | loss 4.243 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.238 epoch 030 | valid on 'valid' subset | loss 4.243 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.238 epoch 030 | valid on 'valid' subset | loss 4.243 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.238 epoch 030 | valid on 'valid' subset | loss 4.243 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.238 epoch 030 | valid on 'valid' subset | loss 4.243 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.238 epoch 030 | valid on 'valid' subset | loss 4.243 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.238 epoch 030 | valid on 'valid' subset | loss 4.243 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 50000 | best_loss 4.238 epoch 030: 1198 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=321121, ups=0.74, wpb=432087, bsz=16977.2, num_updates=50100, lr=0.00028256, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=49325 epoch 030: 1198 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=321121, ups=0.74, wpb=432087, bsz=16977.2, num_updates=50100, lr=0.00028256, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=49325 epoch 030: 1198 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=321121, ups=0.74, wpb=432087, bsz=16977.2, num_updates=50100, lr=0.00028256, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=49325 epoch 030: 1198 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=321121, ups=0.74, wpb=432087, bsz=16977.2, num_updates=50100, lr=0.00028256, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=49325 epoch 030: 1198 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=321121, ups=0.74, wpb=432087, bsz=16977.2, num_updates=50100, lr=0.00028256, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=49325 epoch 030: 1198 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=321121, ups=0.74, wpb=432087, bsz=16977.2, num_updates=50100, lr=0.00028256, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=49325 epoch 030: 1198 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=321121, ups=0.74, wpb=432087, bsz=16977.2, num_updates=50100, lr=0.00028256, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=49325 epoch 030: 1198 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=321121, ups=0.74, wpb=432087, bsz=16977.2, num_updates=50100, lr=0.00028256, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=49325 epoch 030: 1198 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=321121, ups=0.74, wpb=432087, bsz=16977.2, num_updates=50100, lr=0.00028256, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=49325 epoch 030: 1198 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=321121, ups=0.74, wpb=432087, bsz=16977.2, num_updates=50100, lr=0.00028256, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=49325 epoch 030: 1198 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=321121, ups=0.74, wpb=432087, bsz=16977.2, num_updates=50100, lr=0.00028256, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=49325 epoch 030: 1198 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=321121, ups=0.74, wpb=432087, bsz=16977.2, num_updates=50100, lr=0.00028256, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=49325 epoch 030: 1198 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=321121, ups=0.74, wpb=432087, bsz=16977.2, num_updates=50100, lr=0.00028256, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=49325 epoch 030: 1198 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=321121, ups=0.74, wpb=432087, bsz=16977.2, num_updates=50100, lr=0.00028256, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=49325 epoch 030: 1198 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=321121, ups=0.74, wpb=432087, bsz=16977.2, num_updates=50100, lr=0.00028256, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=49325 epoch 030: 1198 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=321121, ups=0.74, wpb=432087, bsz=16977.2, num_updates=50100, lr=0.00028256, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=49325 epoch 030: 1198 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=321121, ups=0.74, wpb=432087, bsz=16977.2, num_updates=50100, lr=0.00028256, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=49325 epoch 030: 1198 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=321121, ups=0.74, wpb=432087, bsz=16977.2, num_updates=50100, lr=0.00028256, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=49325 epoch 030: 1198 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=321121, ups=0.74, wpb=432087, bsz=16977.2, num_updates=50100, lr=0.00028256, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=49325 epoch 030: 1198 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=321121, ups=0.74, wpb=432087, bsz=16977.2, num_updates=50100, lr=0.00028256, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=49325 epoch 030: 1198 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=321121, ups=0.74, wpb=432087, bsz=16977.2, num_updates=50100, lr=0.00028256, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=49325 epoch 030: 1198 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=321121, ups=0.74, wpb=432087, bsz=16977.2, num_updates=50100, lr=0.00028256, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=49325 epoch 030: 1198 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=321121, ups=0.74, wpb=432087, bsz=16977.2, num_updates=50100, lr=0.00028256, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=49325 epoch 030: 1198 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=321121, ups=0.74, wpb=432087, bsz=16977.2, num_updates=50100, lr=0.00028256, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=49325 epoch 030: 1198 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=321121, ups=0.74, wpb=432087, bsz=16977.2, num_updates=50100, lr=0.00028256, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=49325 epoch 030: 1198 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=321121, ups=0.74, wpb=432087, bsz=16977.2, num_updates=50100, lr=0.00028256, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=49325 epoch 030: 1198 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=321121, ups=0.74, wpb=432087, bsz=16977.2, num_updates=50100, lr=0.00028256, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=49325 epoch 030: 1198 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=321121, ups=0.74, wpb=432087, bsz=16977.2, num_updates=50100, lr=0.00028256, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=49325 epoch 030: 1198 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=321121, ups=0.74, wpb=432087, bsz=16977.2, num_updates=50100, lr=0.00028256, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=49325 epoch 030: 1198 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=321121, ups=0.74, wpb=432087, bsz=16977.2, num_updates=50100, lr=0.00028256, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=49325 epoch 030: 1298 / 1689 loss=3.981, nll_loss=2.342, ppl=5.07, wps=469038, ups=1.08, wpb=435966, bsz=16262.1, num_updates=50200, lr=0.000282279, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=92, gb_free=18, wall=49418 epoch 030: 1298 / 1689 loss=3.981, nll_loss=2.342, ppl=5.07, wps=469038, ups=1.08, wpb=435966, bsz=16262.1, num_updates=50200, lr=0.000282279, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=92, gb_free=18, wall=49418 epoch 030: 1298 / 1689 loss=3.981, nll_loss=2.342, ppl=5.07, wps=469038, ups=1.08, wpb=435966, bsz=16262.1, num_updates=50200, lr=0.000282279, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=92, gb_free=18, wall=49418 epoch 030: 1298 / 1689 loss=3.981, nll_loss=2.342, ppl=5.07, wps=469038, ups=1.08, wpb=435966, bsz=16262.1, num_updates=50200, lr=0.000282279, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=92, gb_free=18, wall=49418 epoch 030: 1298 / 1689 loss=3.981, nll_loss=2.342, ppl=5.07, wps=469038, ups=1.08, wpb=435966, bsz=16262.1, num_updates=50200, lr=0.000282279, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=92, gb_free=18, wall=49418 epoch 030: 1298 / 1689 loss=3.981, nll_loss=2.342, ppl=5.07, wps=469038, ups=1.08, wpb=435966, bsz=16262.1, num_updates=50200, lr=0.000282279, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=92, gb_free=18, wall=49418 epoch 030: 1298 / 1689 loss=3.981, nll_loss=2.342, ppl=5.07, wps=469038, ups=1.08, wpb=435966, bsz=16262.1, num_updates=50200, lr=0.000282279, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=92, gb_free=18, wall=49418 epoch 030: 1298 / 1689 loss=3.981, nll_loss=2.342, ppl=5.07, wps=469038, ups=1.08, wpb=435966, bsz=16262.1, num_updates=50200, lr=0.000282279, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=92, gb_free=18, wall=49418 epoch 030: 1298 / 1689 loss=3.981, nll_loss=2.342, ppl=5.07, wps=469038, ups=1.08, wpb=435966, bsz=16262.1, num_updates=50200, lr=0.000282279, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=92, gb_free=18, wall=49418 epoch 030: 1298 / 1689 loss=3.981, nll_loss=2.342, ppl=5.07, wps=469038, ups=1.08, wpb=435966, bsz=16262.1, num_updates=50200, lr=0.000282279, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=92, gb_free=18, wall=49418 epoch 030: 1298 / 1689 loss=3.981, nll_loss=2.342, ppl=5.07, wps=469038, ups=1.08, wpb=435966, bsz=16262.1, num_updates=50200, lr=0.000282279, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=92, gb_free=18, wall=49418 epoch 030: 1298 / 1689 loss=3.981, nll_loss=2.342, ppl=5.07, wps=469038, ups=1.08, wpb=435966, bsz=16262.1, num_updates=50200, lr=0.000282279, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=92, gb_free=18, wall=49418 epoch 030: 1298 / 1689 loss=3.981, nll_loss=2.342, ppl=5.07, wps=469038, ups=1.08, wpb=435966, bsz=16262.1, num_updates=50200, lr=0.000282279, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=92, gb_free=18, wall=49418 epoch 030: 1298 / 1689 loss=3.981, nll_loss=2.342, ppl=5.07, wps=469038, ups=1.08, wpb=435966, bsz=16262.1, num_updates=50200, lr=0.000282279, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=92, gb_free=18, wall=49418 epoch 030: 1298 / 1689 loss=3.981, nll_loss=2.342, ppl=5.07, wps=469038, ups=1.08, wpb=435966, bsz=16262.1, num_updates=50200, lr=0.000282279, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=92, gb_free=18, wall=49418 epoch 030: 1298 / 1689 loss=3.981, nll_loss=2.342, ppl=5.07, wps=469038, ups=1.08, wpb=435966, bsz=16262.1, num_updates=50200, lr=0.000282279, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=92, gb_free=18, wall=49418 epoch 030: 1298 / 1689 loss=3.981, nll_loss=2.342, ppl=5.07, wps=469038, ups=1.08, wpb=435966, bsz=16262.1, num_updates=50200, lr=0.000282279, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=92, gb_free=18, wall=49418 epoch 030: 1298 / 1689 loss=3.981, nll_loss=2.342, ppl=5.07, wps=469038, ups=1.08, wpb=435966, bsz=16262.1, num_updates=50200, lr=0.000282279, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=92, gb_free=18, wall=49418 epoch 030: 1298 / 1689 loss=3.981, nll_loss=2.342, ppl=5.07, wps=469038, ups=1.08, wpb=435966, bsz=16262.1, num_updates=50200, lr=0.000282279, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=92, gb_free=18, wall=49418 epoch 030: 1298 / 1689 loss=3.981, nll_loss=2.342, ppl=5.07, wps=469038, ups=1.08, wpb=435966, bsz=16262.1, num_updates=50200, lr=0.000282279, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=92, gb_free=18, wall=49418 epoch 030: 1298 / 1689 loss=3.981, nll_loss=2.342, ppl=5.07, wps=469038, ups=1.08, wpb=435966, bsz=16262.1, num_updates=50200, lr=0.000282279, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=92, gb_free=18, wall=49418 epoch 030: 1298 / 1689 loss=3.981, nll_loss=2.342, ppl=5.07, wps=469038, ups=1.08, wpb=435966, bsz=16262.1, num_updates=50200, lr=0.000282279, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=92, gb_free=18, wall=49418 epoch 030: 1298 / 1689 loss=3.981, nll_loss=2.342, ppl=5.07, wps=469038, ups=1.08, wpb=435966, bsz=16262.1, num_updates=50200, lr=0.000282279, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=92, gb_free=18, wall=49418 epoch 030: 1298 / 1689 loss=3.981, nll_loss=2.342, ppl=5.07, wps=469038, ups=1.08, wpb=435966, bsz=16262.1, num_updates=50200, lr=0.000282279, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=92, gb_free=18, wall=49418 epoch 030: 1298 / 1689 loss=3.981, nll_loss=2.342, ppl=5.07, wps=469038, ups=1.08, wpb=435966, bsz=16262.1, num_updates=50200, lr=0.000282279, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=92, gb_free=18, wall=49418 epoch 030: 1298 / 1689 loss=3.981, nll_loss=2.342, ppl=5.07, wps=469038, ups=1.08, wpb=435966, bsz=16262.1, num_updates=50200, lr=0.000282279, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=92, gb_free=18, wall=49418 epoch 030: 1298 / 1689 loss=3.981, nll_loss=2.342, ppl=5.07, wps=469038, ups=1.08, wpb=435966, bsz=16262.1, num_updates=50200, lr=0.000282279, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=92, gb_free=18, wall=49418 epoch 030: 1298 / 1689 loss=3.981, nll_loss=2.342, ppl=5.07, wps=469038, ups=1.08, wpb=435966, bsz=16262.1, num_updates=50200, lr=0.000282279, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=92, gb_free=18, wall=49418 epoch 030: 1298 / 1689 loss=3.981, nll_loss=2.342, ppl=5.07, wps=469038, ups=1.08, wpb=435966, bsz=16262.1, num_updates=50200, lr=0.000282279, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=92, gb_free=18, wall=49418 epoch 030: 1298 / 1689 loss=3.981, nll_loss=2.342, ppl=5.07, wps=469038, ups=1.08, wpb=435966, bsz=16262.1, num_updates=50200, lr=0.000282279, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=92, gb_free=18, wall=49418 epoch 030: 1398 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=463903, ups=1.07, wpb=433166, bsz=16644.2, num_updates=50300, lr=0.000281998, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=49511 epoch 030: 1398 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=463903, ups=1.07, wpb=433166, bsz=16644.2, num_updates=50300, lr=0.000281998, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=49511 epoch 030: 1398 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=463903, ups=1.07, wpb=433166, bsz=16644.2, num_updates=50300, lr=0.000281998, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=49511 epoch 030: 1398 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=463903, ups=1.07, wpb=433166, bsz=16644.2, num_updates=50300, lr=0.000281998, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=49511 epoch 030: 1398 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=463903, ups=1.07, wpb=433166, bsz=16644.2, num_updates=50300, lr=0.000281998, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=49511 epoch 030: 1398 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=463903, ups=1.07, wpb=433166, bsz=16644.2, num_updates=50300, lr=0.000281998, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=49511 epoch 030: 1398 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=463903, ups=1.07, wpb=433166, bsz=16644.2, num_updates=50300, lr=0.000281998, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=49511 epoch 030: 1398 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=463903, ups=1.07, wpb=433166, bsz=16644.2, num_updates=50300, lr=0.000281998, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=49511 epoch 030: 1398 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=463903, ups=1.07, wpb=433166, bsz=16644.2, num_updates=50300, lr=0.000281998, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=49511 epoch 030: 1398 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=463903, ups=1.07, wpb=433166, bsz=16644.2, num_updates=50300, lr=0.000281998, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=49511 epoch 030: 1398 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=463903, ups=1.07, wpb=433166, bsz=16644.2, num_updates=50300, lr=0.000281998, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=49511 epoch 030: 1398 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=463903, ups=1.07, wpb=433166, bsz=16644.2, num_updates=50300, lr=0.000281998, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=49511 epoch 030: 1398 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=463903, ups=1.07, wpb=433166, bsz=16644.2, num_updates=50300, lr=0.000281998, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=49511 epoch 030: 1398 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=463903, ups=1.07, wpb=433166, bsz=16644.2, num_updates=50300, lr=0.000281998, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=49511 epoch 030: 1398 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=463903, ups=1.07, wpb=433166, bsz=16644.2, num_updates=50300, lr=0.000281998, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=49511 epoch 030: 1398 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=463903, ups=1.07, wpb=433166, bsz=16644.2, num_updates=50300, lr=0.000281998, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=49511 epoch 030: 1398 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=463903, ups=1.07, wpb=433166, bsz=16644.2, num_updates=50300, lr=0.000281998, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=49511 epoch 030: 1398 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=463903, ups=1.07, wpb=433166, bsz=16644.2, num_updates=50300, lr=0.000281998, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=49511 epoch 030: 1398 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=463903, ups=1.07, wpb=433166, bsz=16644.2, num_updates=50300, lr=0.000281998, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=49511 epoch 030: 1398 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=463903, ups=1.07, wpb=433166, bsz=16644.2, num_updates=50300, lr=0.000281998, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=49511 epoch 030: 1398 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=463903, ups=1.07, wpb=433166, bsz=16644.2, num_updates=50300, lr=0.000281998, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=49511 epoch 030: 1398 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=463903, ups=1.07, wpb=433166, bsz=16644.2, num_updates=50300, lr=0.000281998, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=49511 epoch 030: 1398 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=463903, ups=1.07, wpb=433166, bsz=16644.2, num_updates=50300, lr=0.000281998, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=49511 epoch 030: 1398 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=463903, ups=1.07, wpb=433166, bsz=16644.2, num_updates=50300, lr=0.000281998, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=49511 epoch 030: 1398 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=463903, ups=1.07, wpb=433166, bsz=16644.2, num_updates=50300, lr=0.000281998, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=49511 epoch 030: 1398 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=463903, ups=1.07, wpb=433166, bsz=16644.2, num_updates=50300, lr=0.000281998, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=49511 epoch 030: 1398 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=463903, ups=1.07, wpb=433166, bsz=16644.2, num_updates=50300, lr=0.000281998, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=49511 epoch 030: 1398 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=463903, ups=1.07, wpb=433166, bsz=16644.2, num_updates=50300, lr=0.000281998, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=49511 epoch 030: 1398 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=463903, ups=1.07, wpb=433166, bsz=16644.2, num_updates=50300, lr=0.000281998, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=49511 epoch 030: 1398 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=463903, ups=1.07, wpb=433166, bsz=16644.2, num_updates=50300, lr=0.000281998, gnorm=0.227, clip=0, loss_scale=1, train_wall=93, gb_free=19.1, wall=49511 epoch 030: 1499 / 1689 loss=3.972, nll_loss=2.332, ppl=5.03, wps=459677, ups=1.06, wpb=432910, bsz=16668.6, num_updates=50400, lr=0.000281718, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=49605 epoch 030: 1499 / 1689 loss=3.972, nll_loss=2.332, ppl=5.03, wps=459677, ups=1.06, wpb=432910, bsz=16668.6, num_updates=50400, lr=0.000281718, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=49605 epoch 030: 1499 / 1689 loss=3.972, nll_loss=2.332, ppl=5.03, wps=459677, ups=1.06, wpb=432910, bsz=16668.6, num_updates=50400, lr=0.000281718, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=49605 epoch 030: 1499 / 1689 loss=3.972, nll_loss=2.332, ppl=5.03, wps=459677, ups=1.06, wpb=432910, bsz=16668.6, num_updates=50400, lr=0.000281718, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=49605 epoch 030: 1499 / 1689 loss=3.972, nll_loss=2.332, ppl=5.03, wps=459677, ups=1.06, wpb=432910, bsz=16668.6, num_updates=50400, lr=0.000281718, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=49605 epoch 030: 1499 / 1689 loss=3.972, nll_loss=2.332, ppl=5.03, wps=459677, ups=1.06, wpb=432910, bsz=16668.6, num_updates=50400, lr=0.000281718, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=49605 epoch 030: 1499 / 1689 loss=3.972, nll_loss=2.332, ppl=5.03, wps=459677, ups=1.06, wpb=432910, bsz=16668.6, num_updates=50400, lr=0.000281718, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=49605 epoch 030: 1499 / 1689 loss=3.972, nll_loss=2.332, ppl=5.03, wps=459677, ups=1.06, wpb=432910, bsz=16668.6, num_updates=50400, lr=0.000281718, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=49605 epoch 030: 1499 / 1689 loss=3.972, nll_loss=2.332, ppl=5.03, wps=459677, ups=1.06, wpb=432910, bsz=16668.6, num_updates=50400, lr=0.000281718, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=49605 epoch 030: 1499 / 1689 loss=3.972, nll_loss=2.332, ppl=5.03, wps=459677, ups=1.06, wpb=432910, bsz=16668.6, num_updates=50400, lr=0.000281718, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=49605 epoch 030: 1499 / 1689 loss=3.972, nll_loss=2.332, ppl=5.03, wps=459677, ups=1.06, wpb=432910, bsz=16668.6, num_updates=50400, lr=0.000281718, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=49605 epoch 030: 1499 / 1689 loss=3.972, nll_loss=2.332, ppl=5.03, wps=459677, ups=1.06, wpb=432910, bsz=16668.6, num_updates=50400, lr=0.000281718, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=49605 epoch 030: 1499 / 1689 loss=3.972, nll_loss=2.332, ppl=5.03, wps=459677, ups=1.06, wpb=432910, bsz=16668.6, num_updates=50400, lr=0.000281718, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=49605 epoch 030: 1499 / 1689 loss=3.972, nll_loss=2.332, ppl=5.03, wps=459677, ups=1.06, wpb=432910, bsz=16668.6, num_updates=50400, lr=0.000281718, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=49605 epoch 030: 1499 / 1689 loss=3.972, nll_loss=2.332, ppl=5.03, wps=459677, ups=1.06, wpb=432910, bsz=16668.6, num_updates=50400, lr=0.000281718, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=49605 epoch 030: 1499 / 1689 loss=3.972, nll_loss=2.332, ppl=5.03, wps=459677, ups=1.06, wpb=432910, bsz=16668.6, num_updates=50400, lr=0.000281718, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=49605 epoch 030: 1499 / 1689 loss=3.972, nll_loss=2.332, ppl=5.03, wps=459677, ups=1.06, wpb=432910, bsz=16668.6, num_updates=50400, lr=0.000281718, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=49605 epoch 030: 1499 / 1689 loss=3.972, nll_loss=2.332, ppl=5.03, wps=459677, ups=1.06, wpb=432910, bsz=16668.6, num_updates=50400, lr=0.000281718, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=49605 epoch 030: 1499 / 1689 loss=3.972, nll_loss=2.332, ppl=5.03, wps=459677, ups=1.06, wpb=432910, bsz=16668.6, num_updates=50400, lr=0.000281718, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=49605 epoch 030: 1499 / 1689 loss=3.972, nll_loss=2.332, ppl=5.03, wps=459677, ups=1.06, wpb=432910, bsz=16668.6, num_updates=50400, lr=0.000281718, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=49605 epoch 030: 1499 / 1689 loss=3.972, nll_loss=2.332, ppl=5.03, wps=459677, ups=1.06, wpb=432910, bsz=16668.6, num_updates=50400, lr=0.000281718, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=49605 epoch 030: 1499 / 1689 loss=3.972, nll_loss=2.332, ppl=5.03, wps=459677, ups=1.06, wpb=432910, bsz=16668.6, num_updates=50400, lr=0.000281718, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=49605 epoch 030: 1499 / 1689 loss=3.972, nll_loss=2.332, ppl=5.03, wps=459677, ups=1.06, wpb=432910, bsz=16668.6, num_updates=50400, lr=0.000281718, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=49605 epoch 030: 1499 / 1689 loss=3.972, nll_loss=2.332, ppl=5.03, wps=459677, ups=1.06, wpb=432910, bsz=16668.6, num_updates=50400, lr=0.000281718, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=49605 epoch 030: 1499 / 1689 loss=3.972, nll_loss=2.332, ppl=5.03, wps=459677, ups=1.06, wpb=432910, bsz=16668.6, num_updates=50400, lr=0.000281718, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=49605 epoch 030: 1499 / 1689 loss=3.972, nll_loss=2.332, ppl=5.03, wps=459677, ups=1.06, wpb=432910, bsz=16668.6, num_updates=50400, lr=0.000281718, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=49605 epoch 030: 1499 / 1689 loss=3.972, nll_loss=2.332, ppl=5.03, wps=459677, ups=1.06, wpb=432910, bsz=16668.6, num_updates=50400, lr=0.000281718, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=49605 epoch 030: 1499 / 1689 loss=3.972, nll_loss=2.332, ppl=5.03, wps=459677, ups=1.06, wpb=432910, bsz=16668.6, num_updates=50400, lr=0.000281718, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=49605 epoch 030: 1499 / 1689 loss=3.972, nll_loss=2.332, ppl=5.03, wps=459677, ups=1.06, wpb=432910, bsz=16668.6, num_updates=50400, lr=0.000281718, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=49605 epoch 030: 1499 / 1689 loss=3.972, nll_loss=2.332, ppl=5.03, wps=459677, ups=1.06, wpb=432910, bsz=16668.6, num_updates=50400, lr=0.000281718, gnorm=0.228, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=49605 epoch 030: 1599 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464756, ups=1.06, wpb=437611, bsz=16518.6, num_updates=50500, lr=0.000281439, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=49700 epoch 030: 1599 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464756, ups=1.06, wpb=437611, bsz=16518.6, num_updates=50500, lr=0.000281439, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=49700 epoch 030: 1599 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464756, ups=1.06, wpb=437611, bsz=16518.6, num_updates=50500, lr=0.000281439, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=49700 epoch 030: 1599 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464756, ups=1.06, wpb=437611, bsz=16518.6, num_updates=50500, lr=0.000281439, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=49700 epoch 030: 1599 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464756, ups=1.06, wpb=437611, bsz=16518.6, num_updates=50500, lr=0.000281439, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=49700 epoch 030: 1599 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464756, ups=1.06, wpb=437611, bsz=16518.6, num_updates=50500, lr=0.000281439, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=49700 epoch 030: 1599 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464756, ups=1.06, wpb=437611, bsz=16518.6, num_updates=50500, lr=0.000281439, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=49700 epoch 030: 1599 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464756, ups=1.06, wpb=437611, bsz=16518.6, num_updates=50500, lr=0.000281439, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=49700 epoch 030: 1599 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464756, ups=1.06, wpb=437611, bsz=16518.6, num_updates=50500, lr=0.000281439, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=49700 epoch 030: 1599 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464756, ups=1.06, wpb=437611, bsz=16518.6, num_updates=50500, lr=0.000281439, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=49700 epoch 030: 1599 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464756, ups=1.06, wpb=437611, bsz=16518.6, num_updates=50500, lr=0.000281439, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=49700 epoch 030: 1599 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464756, ups=1.06, wpb=437611, bsz=16518.6, num_updates=50500, lr=0.000281439, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=49700 epoch 030: 1599 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464756, ups=1.06, wpb=437611, bsz=16518.6, num_updates=50500, lr=0.000281439, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=49700 epoch 030: 1599 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464756, ups=1.06, wpb=437611, bsz=16518.6, num_updates=50500, lr=0.000281439, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=49700 epoch 030: 1599 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464756, ups=1.06, wpb=437611, bsz=16518.6, num_updates=50500, lr=0.000281439, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=49700 epoch 030: 1599 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464756, ups=1.06, wpb=437611, bsz=16518.6, num_updates=50500, lr=0.000281439, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=49700 epoch 030: 1599 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464756, ups=1.06, wpb=437611, bsz=16518.6, num_updates=50500, lr=0.000281439, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=49700 epoch 030: 1599 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464756, ups=1.06, wpb=437611, bsz=16518.6, num_updates=50500, lr=0.000281439, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=49700 epoch 030: 1599 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464756, ups=1.06, wpb=437611, bsz=16518.6, num_updates=50500, lr=0.000281439, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=49700 epoch 030: 1599 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464756, ups=1.06, wpb=437611, bsz=16518.6, num_updates=50500, lr=0.000281439, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=49700 epoch 030: 1599 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464756, ups=1.06, wpb=437611, bsz=16518.6, num_updates=50500, lr=0.000281439, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=49700 epoch 030: 1599 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464756, ups=1.06, wpb=437611, bsz=16518.6, num_updates=50500, lr=0.000281439, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=49700 epoch 030: 1599 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464756, ups=1.06, wpb=437611, bsz=16518.6, num_updates=50500, lr=0.000281439, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=49700 epoch 030: 1599 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464756, ups=1.06, wpb=437611, bsz=16518.6, num_updates=50500, lr=0.000281439, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=49700 epoch 030: 1599 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464756, ups=1.06, wpb=437611, bsz=16518.6, num_updates=50500, lr=0.000281439, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=49700 epoch 030: 1599 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464756, ups=1.06, wpb=437611, bsz=16518.6, num_updates=50500, lr=0.000281439, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=49700 epoch 030: 1599 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464756, ups=1.06, wpb=437611, bsz=16518.6, num_updates=50500, lr=0.000281439, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=49700 epoch 030: 1599 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464756, ups=1.06, wpb=437611, bsz=16518.6, num_updates=50500, lr=0.000281439, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=49700 epoch 030: 1599 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464756, ups=1.06, wpb=437611, bsz=16518.6, num_updates=50500, lr=0.000281439, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=49700 epoch 030: 1599 / 1689 loss=3.978, nll_loss=2.339, ppl=5.06, wps=464756, ups=1.06, wpb=437611, bsz=16518.6, num_updates=50500, lr=0.000281439, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=49700 end of epoch 30 (average epoch stats below) epoch 030 | loss 3.961 | nll_loss 2.32 | ppl 4.99 | wps 445849 | ups 1.03 | wpb 433544 | bsz 16502.7 | num_updates 50590 | lr 0.000281189 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1566 | gb_free 20.7 | wall 49783 epoch 030 | loss 3.961 | nll_loss 2.32 | ppl 4.99 | wps 445849 | ups 1.03 | wpb 433544 | bsz 16502.7 | num_updates 50590 | lr 0.000281189 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1566 | gb_free 20.7 | wall 49783 epoch 030 | loss 3.961 | nll_loss 2.32 | ppl 4.99 | wps 445849 | ups 1.03 | wpb 433544 | bsz 16502.7 | num_updates 50590 | lr 0.000281189 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1566 | gb_free 20.7 | wall 49783 epoch 030 | loss 3.961 | nll_loss 2.32 | ppl 4.99 | wps 445849 | ups 1.03 | wpb 433544 | bsz 16502.7 | num_updates 50590 | lr 0.000281189 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1566 | gb_free 20.7 | wall 49783 epoch 030 | loss 3.961 | nll_loss 2.32 | ppl 4.99 | wps 445849 | ups 1.03 | wpb 433544 | bsz 16502.7 | num_updates 50590 | lr 0.000281189 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1566 | gb_free 20.7 | wall 49783 epoch 030 | loss 3.961 | nll_loss 2.32 | ppl 4.99 | wps 445849 | ups 1.03 | wpb 433544 | bsz 16502.7 | num_updates 50590 | lr 0.000281189 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1566 | gb_free 20.7 | wall 49783 epoch 030 | loss 3.961 | nll_loss 2.32 | ppl 4.99 | wps 445849 | ups 1.03 | wpb 433544 | bsz 16502.7 | num_updates 50590 | lr 0.000281189 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1566 | gb_free 20.7 | wall 49783 epoch 030 | loss 3.961 | nll_loss 2.32 | ppl 4.99 | wps 445849 | ups 1.03 | wpb 433544 | bsz 16502.7 | num_updates 50590 | lr 0.000281189 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1566 | gb_free 20.7 | wall 49783 epoch 030 | loss 3.961 | nll_loss 2.32 | ppl 4.99 | wps 445849 | ups 1.03 | wpb 433544 | bsz 16502.7 | num_updates 50590 | lr 0.000281189 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1566 | gb_free 20.7 | wall 49783 epoch 030 | loss 3.961 | nll_loss 2.32 | ppl 4.99 | wps 445849 | ups 1.03 | wpb 433544 | bsz 16502.7 | num_updates 50590 | lr 0.000281189 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1566 | gb_free 20.7 | wall 49783 epoch 030 | loss 3.961 | nll_loss 2.32 | ppl 4.99 | wps 445849 | ups 1.03 | wpb 433544 | bsz 16502.7 | num_updates 50590 | lr 0.000281189 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1566 | gb_free 20.7 | wall 49783 epoch 030 | loss 3.961 | nll_loss 2.32 | ppl 4.99 | wps 445849 | ups 1.03 | wpb 433544 | bsz 16502.7 | num_updates 50590 | lr 0.000281189 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1566 | gb_free 20.7 | wall 49783 epoch 030 | loss 3.961 | nll_loss 2.32 | ppl 4.99 | wps 445849 | ups 1.03 | wpb 433544 | bsz 16502.7 | num_updates 50590 | lr 0.000281189 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1566 | gb_free 20.7 | wall 49783 epoch 030 | loss 3.961 | nll_loss 2.32 | ppl 4.99 | wps 445849 | ups 1.03 | wpb 433544 | bsz 16502.7 | num_updates 50590 | lr 0.000281189 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1566 | gb_free 20.7 | wall 49783 epoch 030 | loss 3.961 | nll_loss 2.32 | ppl 4.99 | wps 445849 | ups 1.03 | wpb 433544 | bsz 16502.7 | num_updates 50590 | lr 0.000281189 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1566 | gb_free 20.7 | wall 49783 epoch 030 | loss 3.961 | nll_loss 2.32 | ppl 4.99 | wps 445849 | ups 1.03 | wpb 433544 | bsz 16502.7 | num_updates 50590 | lr 0.000281189 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1566 | gb_free 20.7 | wall 49783 epoch 030 | loss 3.961 | nll_loss 2.32 | ppl 4.99 | wps 445849 | ups 1.03 | wpb 433544 | bsz 16502.7 | num_updates 50590 | lr 0.000281189 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1566 | gb_free 20.7 | wall 49783 epoch 030 | loss 3.961 | nll_loss 2.32 | ppl 4.99 | wps 445849 | ups 1.03 | wpb 433544 | bsz 16502.7 | num_updates 50590 | lr 0.000281189 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1566 | gb_free 20.7 | wall 49783 epoch 030 | loss 3.961 | nll_loss 2.32 | ppl 4.99 | wps 445849 | ups 1.03 | wpb 433544 | bsz 16502.7 | num_updates 50590 | lr 0.000281189 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1566 | gb_free 20.7 | wall 49783 epoch 030 | loss 3.961 | nll_loss 2.32 | ppl 4.99 | wps 445849 | ups 1.03 | wpb 433544 | bsz 16502.7 | num_updates 50590 | lr 0.000281189 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1566 | gb_free 20.7 | wall 49783 epoch 030 | loss 3.961 | nll_loss 2.32 | ppl 4.99 | wps 445849 | ups 1.03 | wpb 433544 | bsz 16502.7 | num_updates 50590 | lr 0.000281189 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1566 | gb_free 20.7 | wall 49783 epoch 030 | loss 3.961 | nll_loss 2.32 | ppl 4.99 | wps 445849 | ups 1.03 | wpb 433544 | bsz 16502.7 | num_updates 50590 | lr 0.000281189 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1566 | gb_free 20.7 | wall 49783 epoch 030 | loss 3.961 | nll_loss 2.32 | ppl 4.99 | wps 445849 | ups 1.03 | wpb 433544 | bsz 16502.7 | num_updates 50590 | lr 0.000281189 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1566 | gb_free 20.7 | wall 49783 epoch 030 | loss 3.961 | nll_loss 2.32 | ppl 4.99 | wps 445849 | ups 1.03 | wpb 433544 | bsz 16502.7 | num_updates 50590 | lr 0.000281189 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1566 | gb_free 20.7 | wall 49783 epoch 030 | loss 3.961 | nll_loss 2.32 | ppl 4.99 | wps 445849 | ups 1.03 | wpb 433544 | bsz 16502.7 | num_updates 50590 | lr 0.000281189 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1566 | gb_free 20.7 | wall 49783 epoch 030 | loss 3.961 | nll_loss 2.32 | ppl 4.99 | wps 445849 | ups 1.03 | wpb 433544 | bsz 16502.7 | num_updates 50590 | lr 0.000281189 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1566 | gb_free 20.7 | wall 49783 epoch 030 | loss 3.961 | nll_loss 2.32 | ppl 4.99 | wps 445849 | ups 1.03 | wpb 433544 | bsz 16502.7 | num_updates 50590 | lr 0.000281189 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1566 | gb_free 20.7 | wall 49783 epoch 030 | loss 3.961 | nll_loss 2.32 | ppl 4.99 | wps 445849 | ups 1.03 | wpb 433544 | bsz 16502.7 | num_updates 50590 | lr 0.000281189 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1566 | gb_free 20.7 | wall 49783 epoch 030 | loss 3.961 | nll_loss 2.32 | ppl 4.99 | wps 445849 | ups 1.03 | wpb 433544 | bsz 16502.7 | num_updates 50590 | lr 0.000281189 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1566 | gb_free 20.7 | wall 49783 epoch 030 | loss 3.961 | nll_loss 2.32 | ppl 4.99 | wps 445849 | ups 1.03 | wpb 433544 | bsz 16502.7 | num_updates 50590 | lr 0.000281189 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1566 | gb_free 20.7 | wall 49783 Start iterating over samples epoch 031: 10 / 1689 loss=3.968, nll_loss=2.328, ppl=5.02, wps=456597, ups=1.06, wpb=430359, bsz=16428.4, num_updates=50600, lr=0.000281161, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=49794 epoch 031: 10 / 1689 loss=3.968, nll_loss=2.328, ppl=5.02, wps=456597, ups=1.06, wpb=430359, bsz=16428.4, num_updates=50600, lr=0.000281161, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=49794 epoch 031: 10 / 1689 loss=3.968, nll_loss=2.328, ppl=5.02, wps=456597, ups=1.06, wpb=430359, bsz=16428.4, num_updates=50600, lr=0.000281161, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=49794 epoch 031: 10 / 1689 loss=3.968, nll_loss=2.328, ppl=5.02, wps=456597, ups=1.06, wpb=430359, bsz=16428.4, num_updates=50600, lr=0.000281161, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=49794 epoch 031: 10 / 1689 loss=3.968, nll_loss=2.328, ppl=5.02, wps=456597, ups=1.06, wpb=430359, bsz=16428.4, num_updates=50600, lr=0.000281161, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=49794 epoch 031: 10 / 1689 loss=3.968, nll_loss=2.328, ppl=5.02, wps=456597, ups=1.06, wpb=430359, bsz=16428.4, num_updates=50600, lr=0.000281161, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=49794 epoch 031: 10 / 1689 loss=3.968, nll_loss=2.328, ppl=5.02, wps=456597, ups=1.06, wpb=430359, bsz=16428.4, num_updates=50600, lr=0.000281161, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=49794 epoch 031: 10 / 1689 loss=3.968, nll_loss=2.328, ppl=5.02, wps=456597, ups=1.06, wpb=430359, bsz=16428.4, num_updates=50600, lr=0.000281161, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=49794 epoch 031: 10 / 1689 loss=3.968, nll_loss=2.328, ppl=5.02, wps=456597, ups=1.06, wpb=430359, bsz=16428.4, num_updates=50600, lr=0.000281161, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=49794 epoch 031: 10 / 1689 loss=3.968, nll_loss=2.328, ppl=5.02, wps=456597, ups=1.06, wpb=430359, bsz=16428.4, num_updates=50600, lr=0.000281161, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=49794 epoch 031: 10 / 1689 loss=3.968, nll_loss=2.328, ppl=5.02, wps=456597, ups=1.06, wpb=430359, bsz=16428.4, num_updates=50600, lr=0.000281161, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=49794 epoch 031: 10 / 1689 loss=3.968, nll_loss=2.328, ppl=5.02, wps=456597, ups=1.06, wpb=430359, bsz=16428.4, num_updates=50600, lr=0.000281161, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=49794 epoch 031: 10 / 1689 loss=3.968, nll_loss=2.328, ppl=5.02, wps=456597, ups=1.06, wpb=430359, bsz=16428.4, num_updates=50600, lr=0.000281161, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=49794 epoch 031: 10 / 1689 loss=3.968, nll_loss=2.328, ppl=5.02, wps=456597, ups=1.06, wpb=430359, bsz=16428.4, num_updates=50600, lr=0.000281161, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=49794 epoch 031: 10 / 1689 loss=3.968, nll_loss=2.328, ppl=5.02, wps=456597, ups=1.06, wpb=430359, bsz=16428.4, num_updates=50600, lr=0.000281161, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=49794 epoch 031: 10 / 1689 loss=3.968, nll_loss=2.328, ppl=5.02, wps=456597, ups=1.06, wpb=430359, bsz=16428.4, num_updates=50600, lr=0.000281161, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=49794 epoch 031: 10 / 1689 loss=3.968, nll_loss=2.328, ppl=5.02, wps=456597, ups=1.06, wpb=430359, bsz=16428.4, num_updates=50600, lr=0.000281161, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=49794 epoch 031: 10 / 1689 loss=3.968, nll_loss=2.328, ppl=5.02, wps=456597, ups=1.06, wpb=430359, bsz=16428.4, num_updates=50600, lr=0.000281161, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=49794 epoch 031: 10 / 1689 loss=3.968, nll_loss=2.328, ppl=5.02, wps=456597, ups=1.06, wpb=430359, bsz=16428.4, num_updates=50600, lr=0.000281161, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=49794 epoch 031: 10 / 1689 loss=3.968, nll_loss=2.328, ppl=5.02, wps=456597, ups=1.06, wpb=430359, bsz=16428.4, num_updates=50600, lr=0.000281161, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=49794 epoch 031: 10 / 1689 loss=3.968, nll_loss=2.328, ppl=5.02, wps=456597, ups=1.06, wpb=430359, bsz=16428.4, num_updates=50600, lr=0.000281161, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=49794 epoch 031: 10 / 1689 loss=3.968, nll_loss=2.328, ppl=5.02, wps=456597, ups=1.06, wpb=430359, bsz=16428.4, num_updates=50600, lr=0.000281161, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=49794 epoch 031: 10 / 1689 loss=3.968, nll_loss=2.328, ppl=5.02, wps=456597, ups=1.06, wpb=430359, bsz=16428.4, num_updates=50600, lr=0.000281161, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=49794 epoch 031: 10 / 1689 loss=3.968, nll_loss=2.328, ppl=5.02, wps=456597, ups=1.06, wpb=430359, bsz=16428.4, num_updates=50600, lr=0.000281161, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=49794 epoch 031: 10 / 1689 loss=3.968, nll_loss=2.328, ppl=5.02, wps=456597, ups=1.06, wpb=430359, bsz=16428.4, num_updates=50600, lr=0.000281161, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=49794 epoch 031: 10 / 1689 loss=3.968, nll_loss=2.328, ppl=5.02, wps=456597, ups=1.06, wpb=430359, bsz=16428.4, num_updates=50600, lr=0.000281161, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=49794 epoch 031: 10 / 1689 loss=3.968, nll_loss=2.328, ppl=5.02, wps=456597, ups=1.06, wpb=430359, bsz=16428.4, num_updates=50600, lr=0.000281161, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=49794 epoch 031: 10 / 1689 loss=3.968, nll_loss=2.328, ppl=5.02, wps=456597, ups=1.06, wpb=430359, bsz=16428.4, num_updates=50600, lr=0.000281161, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=49794 epoch 031: 10 / 1689 loss=3.968, nll_loss=2.328, ppl=5.02, wps=456597, ups=1.06, wpb=430359, bsz=16428.4, num_updates=50600, lr=0.000281161, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=49794 epoch 031: 10 / 1689 loss=3.968, nll_loss=2.328, ppl=5.02, wps=456597, ups=1.06, wpb=430359, bsz=16428.4, num_updates=50600, lr=0.000281161, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=49794 epoch 031: 10 / 1689 loss=3.968, nll_loss=2.328, ppl=5.02, wps=456597, ups=1.06, wpb=430359, bsz=16428.4, num_updates=50600, lr=0.000281161, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.8, wall=49794 epoch 031: 110 / 1689 loss=3.929, nll_loss=2.282, ppl=4.86, wps=459785, ups=1.07, wpb=431663, bsz=16432.2, num_updates=50700, lr=0.000280883, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=49888 epoch 031: 110 / 1689 loss=3.929, nll_loss=2.282, ppl=4.86, wps=459785, ups=1.07, wpb=431663, bsz=16432.2, num_updates=50700, lr=0.000280883, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=49888 epoch 031: 110 / 1689 loss=3.929, nll_loss=2.282, ppl=4.86, wps=459785, ups=1.07, wpb=431663, bsz=16432.2, num_updates=50700, lr=0.000280883, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=49888 epoch 031: 110 / 1689 loss=3.929, nll_loss=2.282, ppl=4.86, wps=459785, ups=1.07, wpb=431663, bsz=16432.2, num_updates=50700, lr=0.000280883, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=49888 epoch 031: 110 / 1689 loss=3.929, nll_loss=2.282, ppl=4.86, wps=459785, ups=1.07, wpb=431663, bsz=16432.2, num_updates=50700, lr=0.000280883, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=49888 epoch 031: 110 / 1689 loss=3.929, nll_loss=2.282, ppl=4.86, wps=459785, ups=1.07, wpb=431663, bsz=16432.2, num_updates=50700, lr=0.000280883, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=49888 epoch 031: 110 / 1689 loss=3.929, nll_loss=2.282, ppl=4.86, wps=459785, ups=1.07, wpb=431663, bsz=16432.2, num_updates=50700, lr=0.000280883, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=49888 epoch 031: 110 / 1689 loss=3.929, nll_loss=2.282, ppl=4.86, wps=459785, ups=1.07, wpb=431663, bsz=16432.2, num_updates=50700, lr=0.000280883, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=49888 epoch 031: 110 / 1689 loss=3.929, nll_loss=2.282, ppl=4.86, wps=459785, ups=1.07, wpb=431663, bsz=16432.2, num_updates=50700, lr=0.000280883, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=49888 epoch 031: 110 / 1689 loss=3.929, nll_loss=2.282, ppl=4.86, wps=459785, ups=1.07, wpb=431663, bsz=16432.2, num_updates=50700, lr=0.000280883, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=49888 epoch 031: 110 / 1689 loss=3.929, nll_loss=2.282, ppl=4.86, wps=459785, ups=1.07, wpb=431663, bsz=16432.2, num_updates=50700, lr=0.000280883, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=49888 epoch 031: 110 / 1689 loss=3.929, nll_loss=2.282, ppl=4.86, wps=459785, ups=1.07, wpb=431663, bsz=16432.2, num_updates=50700, lr=0.000280883, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=49888 epoch 031: 110 / 1689 loss=3.929, nll_loss=2.282, ppl=4.86, wps=459785, ups=1.07, wpb=431663, bsz=16432.2, num_updates=50700, lr=0.000280883, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=49888 epoch 031: 110 / 1689 loss=3.929, nll_loss=2.282, ppl=4.86, wps=459785, ups=1.07, wpb=431663, bsz=16432.2, num_updates=50700, lr=0.000280883, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=49888 epoch 031: 110 / 1689 loss=3.929, nll_loss=2.282, ppl=4.86, wps=459785, ups=1.07, wpb=431663, bsz=16432.2, num_updates=50700, lr=0.000280883, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=49888 epoch 031: 110 / 1689 loss=3.929, nll_loss=2.282, ppl=4.86, wps=459785, ups=1.07, wpb=431663, bsz=16432.2, num_updates=50700, lr=0.000280883, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=49888 epoch 031: 110 / 1689 loss=3.929, nll_loss=2.282, ppl=4.86, wps=459785, ups=1.07, wpb=431663, bsz=16432.2, num_updates=50700, lr=0.000280883, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=49888 epoch 031: 110 / 1689 loss=3.929, nll_loss=2.282, ppl=4.86, wps=459785, ups=1.07, wpb=431663, bsz=16432.2, num_updates=50700, lr=0.000280883, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=49888 epoch 031: 110 / 1689 loss=3.929, nll_loss=2.282, ppl=4.86, wps=459785, ups=1.07, wpb=431663, bsz=16432.2, num_updates=50700, lr=0.000280883, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=49888 epoch 031: 110 / 1689 loss=3.929, nll_loss=2.282, ppl=4.86, wps=459785, ups=1.07, wpb=431663, bsz=16432.2, num_updates=50700, lr=0.000280883, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=49888 epoch 031: 110 / 1689 loss=3.929, nll_loss=2.282, ppl=4.86, wps=459785, ups=1.07, wpb=431663, bsz=16432.2, num_updates=50700, lr=0.000280883, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=49888 epoch 031: 110 / 1689 loss=3.929, nll_loss=2.282, ppl=4.86, wps=459785, ups=1.07, wpb=431663, bsz=16432.2, num_updates=50700, lr=0.000280883, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=49888 epoch 031: 110 / 1689 loss=3.929, nll_loss=2.282, ppl=4.86, wps=459785, ups=1.07, wpb=431663, bsz=16432.2, num_updates=50700, lr=0.000280883, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=49888 epoch 031: 110 / 1689 loss=3.929, nll_loss=2.282, ppl=4.86, wps=459785, ups=1.07, wpb=431663, bsz=16432.2, num_updates=50700, lr=0.000280883, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=49888 epoch 031: 110 / 1689 loss=3.929, nll_loss=2.282, ppl=4.86, wps=459785, ups=1.07, wpb=431663, bsz=16432.2, num_updates=50700, lr=0.000280883, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=49888 epoch 031: 110 / 1689 loss=3.929, nll_loss=2.282, ppl=4.86, wps=459785, ups=1.07, wpb=431663, bsz=16432.2, num_updates=50700, lr=0.000280883, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=49888 epoch 031: 110 / 1689 loss=3.929, nll_loss=2.282, ppl=4.86, wps=459785, ups=1.07, wpb=431663, bsz=16432.2, num_updates=50700, lr=0.000280883, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=49888 epoch 031: 110 / 1689 loss=3.929, nll_loss=2.282, ppl=4.86, wps=459785, ups=1.07, wpb=431663, bsz=16432.2, num_updates=50700, lr=0.000280883, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=49888 epoch 031: 110 / 1689 loss=3.929, nll_loss=2.282, ppl=4.86, wps=459785, ups=1.07, wpb=431663, bsz=16432.2, num_updates=50700, lr=0.000280883, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=49888 epoch 031: 110 / 1689 loss=3.929, nll_loss=2.282, ppl=4.86, wps=459785, ups=1.07, wpb=431663, bsz=16432.2, num_updates=50700, lr=0.000280883, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=49888 epoch 031: 110 / 1689 loss=3.929, nll_loss=2.282, ppl=4.86, wps=459785, ups=1.07, wpb=431663, bsz=16432.2, num_updates=50700, lr=0.000280883, gnorm=0.229, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=49888 epoch 031: 210 / 1689 loss=3.938, nll_loss=2.293, ppl=4.9, wps=460184, ups=1.06, wpb=433952, bsz=16740.4, num_updates=50800, lr=0.000280607, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=49982 epoch 031: 210 / 1689 loss=3.938, nll_loss=2.293, ppl=4.9, wps=460184, ups=1.06, wpb=433952, bsz=16740.4, num_updates=50800, lr=0.000280607, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=49982 epoch 031: 210 / 1689 loss=3.938, nll_loss=2.293, ppl=4.9, wps=460184, ups=1.06, wpb=433952, bsz=16740.4, num_updates=50800, lr=0.000280607, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=49982 epoch 031: 210 / 1689 loss=3.938, nll_loss=2.293, ppl=4.9, wps=460184, ups=1.06, wpb=433952, bsz=16740.4, num_updates=50800, lr=0.000280607, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=49982 epoch 031: 210 / 1689 loss=3.938, nll_loss=2.293, ppl=4.9, wps=460184, ups=1.06, wpb=433952, bsz=16740.4, num_updates=50800, lr=0.000280607, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=49982 epoch 031: 210 / 1689 loss=3.938, nll_loss=2.293, ppl=4.9, wps=460184, ups=1.06, wpb=433952, bsz=16740.4, num_updates=50800, lr=0.000280607, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=49982 epoch 031: 210 / 1689 loss=3.938, nll_loss=2.293, ppl=4.9, wps=460184, ups=1.06, wpb=433952, bsz=16740.4, num_updates=50800, lr=0.000280607, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=49982 epoch 031: 210 / 1689 loss=3.938, nll_loss=2.293, ppl=4.9, wps=460184, ups=1.06, wpb=433952, bsz=16740.4, num_updates=50800, lr=0.000280607, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=49982 epoch 031: 210 / 1689 loss=3.938, nll_loss=2.293, ppl=4.9, wps=460184, ups=1.06, wpb=433952, bsz=16740.4, num_updates=50800, lr=0.000280607, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=49982 epoch 031: 210 / 1689 loss=3.938, nll_loss=2.293, ppl=4.9, wps=460184, ups=1.06, wpb=433952, bsz=16740.4, num_updates=50800, lr=0.000280607, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=49982 epoch 031: 210 / 1689 loss=3.938, nll_loss=2.293, ppl=4.9, wps=460184, ups=1.06, wpb=433952, bsz=16740.4, num_updates=50800, lr=0.000280607, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=49982 epoch 031: 210 / 1689 loss=3.938, nll_loss=2.293, ppl=4.9, wps=460184, ups=1.06, wpb=433952, bsz=16740.4, num_updates=50800, lr=0.000280607, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=49982 epoch 031: 210 / 1689 loss=3.938, nll_loss=2.293, ppl=4.9, wps=460184, ups=1.06, wpb=433952, bsz=16740.4, num_updates=50800, lr=0.000280607, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=49982 epoch 031: 210 / 1689 loss=3.938, nll_loss=2.293, ppl=4.9, wps=460184, ups=1.06, wpb=433952, bsz=16740.4, num_updates=50800, lr=0.000280607, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=49982 epoch 031: 210 / 1689 loss=3.938, nll_loss=2.293, ppl=4.9, wps=460184, ups=1.06, wpb=433952, bsz=16740.4, num_updates=50800, lr=0.000280607, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=49982 epoch 031: 210 / 1689 loss=3.938, nll_loss=2.293, ppl=4.9, wps=460184, ups=1.06, wpb=433952, bsz=16740.4, num_updates=50800, lr=0.000280607, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=49982 epoch 031: 210 / 1689 loss=3.938, nll_loss=2.293, ppl=4.9, wps=460184, ups=1.06, wpb=433952, bsz=16740.4, num_updates=50800, lr=0.000280607, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=49982 epoch 031: 210 / 1689 loss=3.938, nll_loss=2.293, ppl=4.9, wps=460184, ups=1.06, wpb=433952, bsz=16740.4, num_updates=50800, lr=0.000280607, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=49982 epoch 031: 210 / 1689 loss=3.938, nll_loss=2.293, ppl=4.9, wps=460184, ups=1.06, wpb=433952, bsz=16740.4, num_updates=50800, lr=0.000280607, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=49982 epoch 031: 210 / 1689 loss=3.938, nll_loss=2.293, ppl=4.9, wps=460184, ups=1.06, wpb=433952, bsz=16740.4, num_updates=50800, lr=0.000280607, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=49982 epoch 031: 210 / 1689 loss=3.938, nll_loss=2.293, ppl=4.9, wps=460184, ups=1.06, wpb=433952, bsz=16740.4, num_updates=50800, lr=0.000280607, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=49982 epoch 031: 210 / 1689 loss=3.938, nll_loss=2.293, ppl=4.9, wps=460184, ups=1.06, wpb=433952, bsz=16740.4, num_updates=50800, lr=0.000280607, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=49982 epoch 031: 210 / 1689 loss=3.938, nll_loss=2.293, ppl=4.9, wps=460184, ups=1.06, wpb=433952, bsz=16740.4, num_updates=50800, lr=0.000280607, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=49982 epoch 031: 210 / 1689 loss=3.938, nll_loss=2.293, ppl=4.9, wps=460184, ups=1.06, wpb=433952, bsz=16740.4, num_updates=50800, lr=0.000280607, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=49982 epoch 031: 210 / 1689 loss=3.938, nll_loss=2.293, ppl=4.9, wps=460184, ups=1.06, wpb=433952, bsz=16740.4, num_updates=50800, lr=0.000280607, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=49982 epoch 031: 210 / 1689 loss=3.938, nll_loss=2.293, ppl=4.9, wps=460184, ups=1.06, wpb=433952, bsz=16740.4, num_updates=50800, lr=0.000280607, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=49982 epoch 031: 210 / 1689 loss=3.938, nll_loss=2.293, ppl=4.9, wps=460184, ups=1.06, wpb=433952, bsz=16740.4, num_updates=50800, lr=0.000280607, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=49982 epoch 031: 210 / 1689 loss=3.938, nll_loss=2.293, ppl=4.9, wps=460184, ups=1.06, wpb=433952, bsz=16740.4, num_updates=50800, lr=0.000280607, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=49982 epoch 031: 210 / 1689 loss=3.938, nll_loss=2.293, ppl=4.9, wps=460184, ups=1.06, wpb=433952, bsz=16740.4, num_updates=50800, lr=0.000280607, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=49982 epoch 031: 210 / 1689 loss=3.938, nll_loss=2.293, ppl=4.9, wps=460184, ups=1.06, wpb=433952, bsz=16740.4, num_updates=50800, lr=0.000280607, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=49982 epoch 031: 210 / 1689 loss=3.938, nll_loss=2.293, ppl=4.9, wps=460184, ups=1.06, wpb=433952, bsz=16740.4, num_updates=50800, lr=0.000280607, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=93, gb_free=21.1, wall=49982 epoch 031: 310 / 1689 loss=3.944, nll_loss=2.299, ppl=4.92, wps=461648, ups=1.06, wpb=434464, bsz=16261.1, num_updates=50900, lr=0.000280331, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=50076 epoch 031: 310 / 1689 loss=3.944, nll_loss=2.299, ppl=4.92, wps=461648, ups=1.06, wpb=434464, bsz=16261.1, num_updates=50900, lr=0.000280331, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=50076 epoch 031: 310 / 1689 loss=3.944, nll_loss=2.299, ppl=4.92, wps=461648, ups=1.06, wpb=434464, bsz=16261.1, num_updates=50900, lr=0.000280331, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=50076 epoch 031: 310 / 1689 loss=3.944, nll_loss=2.299, ppl=4.92, wps=461648, ups=1.06, wpb=434464, bsz=16261.1, num_updates=50900, lr=0.000280331, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=50076 epoch 031: 310 / 1689 loss=3.944, nll_loss=2.299, ppl=4.92, wps=461648, ups=1.06, wpb=434464, bsz=16261.1, num_updates=50900, lr=0.000280331, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=50076 epoch 031: 310 / 1689 loss=3.944, nll_loss=2.299, ppl=4.92, wps=461648, ups=1.06, wpb=434464, bsz=16261.1, num_updates=50900, lr=0.000280331, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=50076 epoch 031: 310 / 1689 loss=3.944, nll_loss=2.299, ppl=4.92, wps=461648, ups=1.06, wpb=434464, bsz=16261.1, num_updates=50900, lr=0.000280331, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=50076 epoch 031: 310 / 1689 loss=3.944, nll_loss=2.299, ppl=4.92, wps=461648, ups=1.06, wpb=434464, bsz=16261.1, num_updates=50900, lr=0.000280331, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=50076 epoch 031: 310 / 1689 loss=3.944, nll_loss=2.299, ppl=4.92, wps=461648, ups=1.06, wpb=434464, bsz=16261.1, num_updates=50900, lr=0.000280331, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=50076 epoch 031: 310 / 1689 loss=3.944, nll_loss=2.299, ppl=4.92, wps=461648, ups=1.06, wpb=434464, bsz=16261.1, num_updates=50900, lr=0.000280331, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=50076 epoch 031: 310 / 1689 loss=3.944, nll_loss=2.299, ppl=4.92, wps=461648, ups=1.06, wpb=434464, bsz=16261.1, num_updates=50900, lr=0.000280331, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=50076 epoch 031: 310 / 1689 loss=3.944, nll_loss=2.299, ppl=4.92, wps=461648, ups=1.06, wpb=434464, bsz=16261.1, num_updates=50900, lr=0.000280331, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=50076 epoch 031: 310 / 1689 loss=3.944, nll_loss=2.299, ppl=4.92, wps=461648, ups=1.06, wpb=434464, bsz=16261.1, num_updates=50900, lr=0.000280331, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=50076 epoch 031: 310 / 1689 loss=3.944, nll_loss=2.299, ppl=4.92, wps=461648, ups=1.06, wpb=434464, bsz=16261.1, num_updates=50900, lr=0.000280331, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=50076 epoch 031: 310 / 1689 loss=3.944, nll_loss=2.299, ppl=4.92, wps=461648, ups=1.06, wpb=434464, bsz=16261.1, num_updates=50900, lr=0.000280331, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=50076 epoch 031: 310 / 1689 loss=3.944, nll_loss=2.299, ppl=4.92, wps=461648, ups=1.06, wpb=434464, bsz=16261.1, num_updates=50900, lr=0.000280331, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=50076 epoch 031: 310 / 1689 loss=3.944, nll_loss=2.299, ppl=4.92, wps=461648, ups=1.06, wpb=434464, bsz=16261.1, num_updates=50900, lr=0.000280331, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=50076 epoch 031: 310 / 1689 loss=3.944, nll_loss=2.299, ppl=4.92, wps=461648, ups=1.06, wpb=434464, bsz=16261.1, num_updates=50900, lr=0.000280331, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=50076 epoch 031: 310 / 1689 loss=3.944, nll_loss=2.299, ppl=4.92, wps=461648, ups=1.06, wpb=434464, bsz=16261.1, num_updates=50900, lr=0.000280331, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=50076 epoch 031: 310 / 1689 loss=3.944, nll_loss=2.299, ppl=4.92, wps=461648, ups=1.06, wpb=434464, bsz=16261.1, num_updates=50900, lr=0.000280331, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=50076 epoch 031: 310 / 1689 loss=3.944, nll_loss=2.299, ppl=4.92, wps=461648, ups=1.06, wpb=434464, bsz=16261.1, num_updates=50900, lr=0.000280331, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=50076 epoch 031: 310 / 1689 loss=3.944, nll_loss=2.299, ppl=4.92, wps=461648, ups=1.06, wpb=434464, bsz=16261.1, num_updates=50900, lr=0.000280331, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=50076 epoch 031: 310 / 1689 loss=3.944, nll_loss=2.299, ppl=4.92, wps=461648, ups=1.06, wpb=434464, bsz=16261.1, num_updates=50900, lr=0.000280331, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=50076 epoch 031: 310 / 1689 loss=3.944, nll_loss=2.299, ppl=4.92, wps=461648, ups=1.06, wpb=434464, bsz=16261.1, num_updates=50900, lr=0.000280331, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=50076 epoch 031: 310 / 1689 loss=3.944, nll_loss=2.299, ppl=4.92, wps=461648, ups=1.06, wpb=434464, bsz=16261.1, num_updates=50900, lr=0.000280331, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=50076 epoch 031: 310 / 1689 loss=3.944, nll_loss=2.299, ppl=4.92, wps=461648, ups=1.06, wpb=434464, bsz=16261.1, num_updates=50900, lr=0.000280331, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=50076 epoch 031: 310 / 1689 loss=3.944, nll_loss=2.299, ppl=4.92, wps=461648, ups=1.06, wpb=434464, bsz=16261.1, num_updates=50900, lr=0.000280331, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=50076 epoch 031: 310 / 1689 loss=3.944, nll_loss=2.299, ppl=4.92, wps=461648, ups=1.06, wpb=434464, bsz=16261.1, num_updates=50900, lr=0.000280331, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=50076 epoch 031: 310 / 1689 loss=3.944, nll_loss=2.299, ppl=4.92, wps=461648, ups=1.06, wpb=434464, bsz=16261.1, num_updates=50900, lr=0.000280331, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=50076 epoch 031: 310 / 1689 loss=3.944, nll_loss=2.299, ppl=4.92, wps=461648, ups=1.06, wpb=434464, bsz=16261.1, num_updates=50900, lr=0.000280331, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=50076 epoch 031: 310 / 1689 loss=3.944, nll_loss=2.299, ppl=4.92, wps=461648, ups=1.06, wpb=434464, bsz=16261.1, num_updates=50900, lr=0.000280331, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=50076 epoch 031: 411 / 1689 loss=3.943, nll_loss=2.298, ppl=4.92, wps=455044, ups=1.05, wpb=432656, bsz=16396.1, num_updates=51000, lr=0.000280056, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.2, wall=50171 epoch 031: 411 / 1689 loss=3.943, nll_loss=2.298, ppl=4.92, wps=455044, ups=1.05, wpb=432656, bsz=16396.1, num_updates=51000, lr=0.000280056, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.2, wall=50171 epoch 031: 411 / 1689 loss=3.943, nll_loss=2.298, ppl=4.92, wps=455044, ups=1.05, wpb=432656, bsz=16396.1, num_updates=51000, lr=0.000280056, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.2, wall=50171 epoch 031: 411 / 1689 loss=3.943, nll_loss=2.298, ppl=4.92, wps=455044, ups=1.05, wpb=432656, bsz=16396.1, num_updates=51000, lr=0.000280056, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.2, wall=50171 epoch 031: 411 / 1689 loss=3.943, nll_loss=2.298, ppl=4.92, wps=455044, ups=1.05, wpb=432656, bsz=16396.1, num_updates=51000, lr=0.000280056, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.2, wall=50171 epoch 031: 411 / 1689 loss=3.943, nll_loss=2.298, ppl=4.92, wps=455044, ups=1.05, wpb=432656, bsz=16396.1, num_updates=51000, lr=0.000280056, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.2, wall=50171 epoch 031: 411 / 1689 loss=3.943, nll_loss=2.298, ppl=4.92, wps=455044, ups=1.05, wpb=432656, bsz=16396.1, num_updates=51000, lr=0.000280056, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.2, wall=50171 epoch 031: 411 / 1689 loss=3.943, nll_loss=2.298, ppl=4.92, wps=455044, ups=1.05, wpb=432656, bsz=16396.1, num_updates=51000, lr=0.000280056, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.2, wall=50171 epoch 031: 411 / 1689 loss=3.943, nll_loss=2.298, ppl=4.92, wps=455044, ups=1.05, wpb=432656, bsz=16396.1, num_updates=51000, lr=0.000280056, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.2, wall=50171 epoch 031: 411 / 1689 loss=3.943, nll_loss=2.298, ppl=4.92, wps=455044, ups=1.05, wpb=432656, bsz=16396.1, num_updates=51000, lr=0.000280056, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.2, wall=50171 epoch 031: 411 / 1689 loss=3.943, nll_loss=2.298, ppl=4.92, wps=455044, ups=1.05, wpb=432656, bsz=16396.1, num_updates=51000, lr=0.000280056, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.2, wall=50171 epoch 031: 411 / 1689 loss=3.943, nll_loss=2.298, ppl=4.92, wps=455044, ups=1.05, wpb=432656, bsz=16396.1, num_updates=51000, lr=0.000280056, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.2, wall=50171 epoch 031: 411 / 1689 loss=3.943, nll_loss=2.298, ppl=4.92, wps=455044, ups=1.05, wpb=432656, bsz=16396.1, num_updates=51000, lr=0.000280056, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.2, wall=50171 epoch 031: 411 / 1689 loss=3.943, nll_loss=2.298, ppl=4.92, wps=455044, ups=1.05, wpb=432656, bsz=16396.1, num_updates=51000, lr=0.000280056, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.2, wall=50171 epoch 031: 411 / 1689 loss=3.943, nll_loss=2.298, ppl=4.92, wps=455044, ups=1.05, wpb=432656, bsz=16396.1, num_updates=51000, lr=0.000280056, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.2, wall=50171 epoch 031: 411 / 1689 loss=3.943, nll_loss=2.298, ppl=4.92, wps=455044, ups=1.05, wpb=432656, bsz=16396.1, num_updates=51000, lr=0.000280056, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.2, wall=50171 epoch 031: 411 / 1689 loss=3.943, nll_loss=2.298, ppl=4.92, wps=455044, ups=1.05, wpb=432656, bsz=16396.1, num_updates=51000, lr=0.000280056, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.2, wall=50171 epoch 031: 411 / 1689 loss=3.943, nll_loss=2.298, ppl=4.92, wps=455044, ups=1.05, wpb=432656, bsz=16396.1, num_updates=51000, lr=0.000280056, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.2, wall=50171 epoch 031: 411 / 1689 loss=3.943, nll_loss=2.298, ppl=4.92, wps=455044, ups=1.05, wpb=432656, bsz=16396.1, num_updates=51000, lr=0.000280056, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.2, wall=50171 epoch 031: 411 / 1689 loss=3.943, nll_loss=2.298, ppl=4.92, wps=455044, ups=1.05, wpb=432656, bsz=16396.1, num_updates=51000, lr=0.000280056, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.2, wall=50171 epoch 031: 411 / 1689 loss=3.943, nll_loss=2.298, ppl=4.92, wps=455044, ups=1.05, wpb=432656, bsz=16396.1, num_updates=51000, lr=0.000280056, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.2, wall=50171 epoch 031: 411 / 1689 loss=3.943, nll_loss=2.298, ppl=4.92, wps=455044, ups=1.05, wpb=432656, bsz=16396.1, num_updates=51000, lr=0.000280056, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.2, wall=50171 epoch 031: 411 / 1689 loss=3.943, nll_loss=2.298, ppl=4.92, wps=455044, ups=1.05, wpb=432656, bsz=16396.1, num_updates=51000, lr=0.000280056, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.2, wall=50171 epoch 031: 411 / 1689 loss=3.943, nll_loss=2.298, ppl=4.92, wps=455044, ups=1.05, wpb=432656, bsz=16396.1, num_updates=51000, lr=0.000280056, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.2, wall=50171 epoch 031: 411 / 1689 loss=3.943, nll_loss=2.298, ppl=4.92, wps=455044, ups=1.05, wpb=432656, bsz=16396.1, num_updates=51000, lr=0.000280056, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.2, wall=50171 epoch 031: 411 / 1689 loss=3.943, nll_loss=2.298, ppl=4.92, wps=455044, ups=1.05, wpb=432656, bsz=16396.1, num_updates=51000, lr=0.000280056, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.2, wall=50171 epoch 031: 411 / 1689 loss=3.943, nll_loss=2.298, ppl=4.92, wps=455044, ups=1.05, wpb=432656, bsz=16396.1, num_updates=51000, lr=0.000280056, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.2, wall=50171 epoch 031: 411 / 1689 loss=3.943, nll_loss=2.298, ppl=4.92, wps=455044, ups=1.05, wpb=432656, bsz=16396.1, num_updates=51000, lr=0.000280056, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.2, wall=50171 epoch 031: 411 / 1689 loss=3.943, nll_loss=2.298, ppl=4.92, wps=455044, ups=1.05, wpb=432656, bsz=16396.1, num_updates=51000, lr=0.000280056, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.2, wall=50171 epoch 031: 411 / 1689 loss=3.943, nll_loss=2.298, ppl=4.92, wps=455044, ups=1.05, wpb=432656, bsz=16396.1, num_updates=51000, lr=0.000280056, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.2, wall=50171 epoch 031: 411 / 1689 loss=3.943, nll_loss=2.298, ppl=4.92, wps=455044, ups=1.05, wpb=432656, bsz=16396.1, num_updates=51000, lr=0.000280056, gnorm=0.201, clip=0, loss_scale=0.5, train_wall=94, gb_free=20.2, wall=50171 begin validation on "valid" subset epoch 031 | valid on 'valid' subset | loss 4.246 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.238 epoch 031 | valid on 'valid' subset | loss 4.246 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.238 epoch 031 | valid on 'valid' subset | loss 4.246 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.238 epoch 031 | valid on 'valid' subset | loss 4.246 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.238 epoch 031 | valid on 'valid' subset | loss 4.246 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.238 epoch 031 | valid on 'valid' subset | loss 4.246 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.238 epoch 031 | valid on 'valid' subset | loss 4.246 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.238 epoch 031 | valid on 'valid' subset | loss 4.246 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.238 epoch 031 | valid on 'valid' subset | loss 4.246 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.238 epoch 031 | valid on 'valid' subset | loss 4.246 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.238 epoch 031 | valid on 'valid' subset | loss 4.246 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.238 epoch 031 | valid on 'valid' subset | loss 4.246 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.238 epoch 031 | valid on 'valid' subset | loss 4.246 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.238 epoch 031 | valid on 'valid' subset | loss 4.246 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.238 epoch 031 | valid on 'valid' subset | loss 4.246 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.238 epoch 031 | valid on 'valid' subset | loss 4.246 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.238 epoch 031 | valid on 'valid' subset | loss 4.246 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.238 epoch 031 | valid on 'valid' subset | loss 4.246 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.238 epoch 031 | valid on 'valid' subset | loss 4.246 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.238 epoch 031 | valid on 'valid' subset | loss 4.246 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.238 epoch 031 | valid on 'valid' subset | loss 4.246 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.238 epoch 031 | valid on 'valid' subset | loss 4.246 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.238 epoch 031 | valid on 'valid' subset | loss 4.246 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.238 epoch 031 | valid on 'valid' subset | loss 4.246 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.238 epoch 031 | valid on 'valid' subset | loss 4.246 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.238 epoch 031 | valid on 'valid' subset | loss 4.246 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.238 epoch 031 | valid on 'valid' subset | loss 4.246 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.238 epoch 031 | valid on 'valid' subset | loss 4.246 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.238 epoch 031 | valid on 'valid' subset | loss 4.246 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.238 epoch 031 | valid on 'valid' subset | loss 4.246 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.238 epoch 031 | valid on 'valid' subset | loss 4.246 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 51000 | best_loss 4.238 epoch 031: 511 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=407002, ups=0.94, wpb=435165, bsz=16759.4, num_updates=51100, lr=0.000279782, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.2, wall=50278 epoch 031: 511 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=407002, ups=0.94, wpb=435165, bsz=16759.4, num_updates=51100, lr=0.000279782, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.2, wall=50278 epoch 031: 511 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=407002, ups=0.94, wpb=435165, bsz=16759.4, num_updates=51100, lr=0.000279782, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.2, wall=50278 epoch 031: 511 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=407002, ups=0.94, wpb=435165, bsz=16759.4, num_updates=51100, lr=0.000279782, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.2, wall=50278 epoch 031: 511 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=407002, ups=0.94, wpb=435165, bsz=16759.4, num_updates=51100, lr=0.000279782, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.2, wall=50278 epoch 031: 511 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=407002, ups=0.94, wpb=435165, bsz=16759.4, num_updates=51100, lr=0.000279782, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.2, wall=50278 epoch 031: 511 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=407002, ups=0.94, wpb=435165, bsz=16759.4, num_updates=51100, lr=0.000279782, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.2, wall=50278 epoch 031: 511 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=407002, ups=0.94, wpb=435165, bsz=16759.4, num_updates=51100, lr=0.000279782, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.2, wall=50278 epoch 031: 511 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=407002, ups=0.94, wpb=435165, bsz=16759.4, num_updates=51100, lr=0.000279782, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.2, wall=50278 epoch 031: 511 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=407002, ups=0.94, wpb=435165, bsz=16759.4, num_updates=51100, lr=0.000279782, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.2, wall=50278 epoch 031: 511 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=407002, ups=0.94, wpb=435165, bsz=16759.4, num_updates=51100, lr=0.000279782, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.2, wall=50278 epoch 031: 511 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=407002, ups=0.94, wpb=435165, bsz=16759.4, num_updates=51100, lr=0.000279782, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.2, wall=50278 epoch 031: 511 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=407002, ups=0.94, wpb=435165, bsz=16759.4, num_updates=51100, lr=0.000279782, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.2, wall=50278 epoch 031: 511 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=407002, ups=0.94, wpb=435165, bsz=16759.4, num_updates=51100, lr=0.000279782, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.2, wall=50278 epoch 031: 511 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=407002, ups=0.94, wpb=435165, bsz=16759.4, num_updates=51100, lr=0.000279782, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.2, wall=50278 epoch 031: 511 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=407002, ups=0.94, wpb=435165, bsz=16759.4, num_updates=51100, lr=0.000279782, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.2, wall=50278 epoch 031: 511 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=407002, ups=0.94, wpb=435165, bsz=16759.4, num_updates=51100, lr=0.000279782, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.2, wall=50278 epoch 031: 511 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=407002, ups=0.94, wpb=435165, bsz=16759.4, num_updates=51100, lr=0.000279782, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.2, wall=50278 epoch 031: 511 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=407002, ups=0.94, wpb=435165, bsz=16759.4, num_updates=51100, lr=0.000279782, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.2, wall=50278 epoch 031: 511 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=407002, ups=0.94, wpb=435165, bsz=16759.4, num_updates=51100, lr=0.000279782, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.2, wall=50278 epoch 031: 511 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=407002, ups=0.94, wpb=435165, bsz=16759.4, num_updates=51100, lr=0.000279782, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.2, wall=50278 epoch 031: 511 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=407002, ups=0.94, wpb=435165, bsz=16759.4, num_updates=51100, lr=0.000279782, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.2, wall=50278 epoch 031: 511 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=407002, ups=0.94, wpb=435165, bsz=16759.4, num_updates=51100, lr=0.000279782, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.2, wall=50278 epoch 031: 511 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=407002, ups=0.94, wpb=435165, bsz=16759.4, num_updates=51100, lr=0.000279782, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.2, wall=50278 epoch 031: 511 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=407002, ups=0.94, wpb=435165, bsz=16759.4, num_updates=51100, lr=0.000279782, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.2, wall=50278 epoch 031: 511 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=407002, ups=0.94, wpb=435165, bsz=16759.4, num_updates=51100, lr=0.000279782, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.2, wall=50278 epoch 031: 511 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=407002, ups=0.94, wpb=435165, bsz=16759.4, num_updates=51100, lr=0.000279782, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.2, wall=50278 epoch 031: 511 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=407002, ups=0.94, wpb=435165, bsz=16759.4, num_updates=51100, lr=0.000279782, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.2, wall=50278 epoch 031: 511 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=407002, ups=0.94, wpb=435165, bsz=16759.4, num_updates=51100, lr=0.000279782, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.2, wall=50278 epoch 031: 511 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=407002, ups=0.94, wpb=435165, bsz=16759.4, num_updates=51100, lr=0.000279782, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.2, wall=50278 epoch 031: 511 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=407002, ups=0.94, wpb=435165, bsz=16759.4, num_updates=51100, lr=0.000279782, gnorm=0.225, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.2, wall=50278 epoch 031: 611 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=457917, ups=1.06, wpb=433776, bsz=16553.5, num_updates=51200, lr=0.000279508, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=50373 epoch 031: 611 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=457917, ups=1.06, wpb=433776, bsz=16553.5, num_updates=51200, lr=0.000279508, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=50373 epoch 031: 611 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=457917, ups=1.06, wpb=433776, bsz=16553.5, num_updates=51200, lr=0.000279508, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=50373 epoch 031: 611 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=457917, ups=1.06, wpb=433776, bsz=16553.5, num_updates=51200, lr=0.000279508, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=50373 epoch 031: 611 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=457917, ups=1.06, wpb=433776, bsz=16553.5, num_updates=51200, lr=0.000279508, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=50373 epoch 031: 611 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=457917, ups=1.06, wpb=433776, bsz=16553.5, num_updates=51200, lr=0.000279508, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=50373 epoch 031: 611 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=457917, ups=1.06, wpb=433776, bsz=16553.5, num_updates=51200, lr=0.000279508, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=50373 epoch 031: 611 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=457917, ups=1.06, wpb=433776, bsz=16553.5, num_updates=51200, lr=0.000279508, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=50373 epoch 031: 611 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=457917, ups=1.06, wpb=433776, bsz=16553.5, num_updates=51200, lr=0.000279508, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=50373 epoch 031: 611 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=457917, ups=1.06, wpb=433776, bsz=16553.5, num_updates=51200, lr=0.000279508, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=50373 epoch 031: 611 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=457917, ups=1.06, wpb=433776, bsz=16553.5, num_updates=51200, lr=0.000279508, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=50373 epoch 031: 611 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=457917, ups=1.06, wpb=433776, bsz=16553.5, num_updates=51200, lr=0.000279508, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=50373 epoch 031: 611 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=457917, ups=1.06, wpb=433776, bsz=16553.5, num_updates=51200, lr=0.000279508, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=50373 epoch 031: 611 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=457917, ups=1.06, wpb=433776, bsz=16553.5, num_updates=51200, lr=0.000279508, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=50373 epoch 031: 611 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=457917, ups=1.06, wpb=433776, bsz=16553.5, num_updates=51200, lr=0.000279508, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=50373 epoch 031: 611 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=457917, ups=1.06, wpb=433776, bsz=16553.5, num_updates=51200, lr=0.000279508, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=50373 epoch 031: 611 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=457917, ups=1.06, wpb=433776, bsz=16553.5, num_updates=51200, lr=0.000279508, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=50373 epoch 031: 611 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=457917, ups=1.06, wpb=433776, bsz=16553.5, num_updates=51200, lr=0.000279508, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=50373 epoch 031: 611 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=457917, ups=1.06, wpb=433776, bsz=16553.5, num_updates=51200, lr=0.000279508, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=50373 epoch 031: 611 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=457917, ups=1.06, wpb=433776, bsz=16553.5, num_updates=51200, lr=0.000279508, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=50373 epoch 031: 611 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=457917, ups=1.06, wpb=433776, bsz=16553.5, num_updates=51200, lr=0.000279508, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=50373 epoch 031: 611 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=457917, ups=1.06, wpb=433776, bsz=16553.5, num_updates=51200, lr=0.000279508, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=50373 epoch 031: 611 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=457917, ups=1.06, wpb=433776, bsz=16553.5, num_updates=51200, lr=0.000279508, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=50373 epoch 031: 611 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=457917, ups=1.06, wpb=433776, bsz=16553.5, num_updates=51200, lr=0.000279508, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=50373 epoch 031: 611 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=457917, ups=1.06, wpb=433776, bsz=16553.5, num_updates=51200, lr=0.000279508, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=50373 epoch 031: 611 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=457917, ups=1.06, wpb=433776, bsz=16553.5, num_updates=51200, lr=0.000279508, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=50373 epoch 031: 611 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=457917, ups=1.06, wpb=433776, bsz=16553.5, num_updates=51200, lr=0.000279508, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=50373 epoch 031: 611 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=457917, ups=1.06, wpb=433776, bsz=16553.5, num_updates=51200, lr=0.000279508, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=50373 epoch 031: 611 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=457917, ups=1.06, wpb=433776, bsz=16553.5, num_updates=51200, lr=0.000279508, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=50373 epoch 031: 611 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=457917, ups=1.06, wpb=433776, bsz=16553.5, num_updates=51200, lr=0.000279508, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=50373 epoch 031: 611 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=457917, ups=1.06, wpb=433776, bsz=16553.5, num_updates=51200, lr=0.000279508, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=50373 epoch 031: 711 / 1689 loss=3.96, nll_loss=2.318, ppl=4.99, wps=463899, ups=1.07, wpb=435104, bsz=16586.1, num_updates=51300, lr=0.000279236, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=50467 epoch 031: 711 / 1689 loss=3.96, nll_loss=2.318, ppl=4.99, wps=463899, ups=1.07, wpb=435104, bsz=16586.1, num_updates=51300, lr=0.000279236, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=50467 epoch 031: 711 / 1689 loss=3.96, nll_loss=2.318, ppl=4.99, wps=463899, ups=1.07, wpb=435104, bsz=16586.1, num_updates=51300, lr=0.000279236, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=50467 epoch 031: 711 / 1689 loss=3.96, nll_loss=2.318, ppl=4.99, wps=463899, ups=1.07, wpb=435104, bsz=16586.1, num_updates=51300, lr=0.000279236, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=50467 epoch 031: 711 / 1689 loss=3.96, nll_loss=2.318, ppl=4.99, wps=463899, ups=1.07, wpb=435104, bsz=16586.1, num_updates=51300, lr=0.000279236, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=50467 epoch 031: 711 / 1689 loss=3.96, nll_loss=2.318, ppl=4.99, wps=463899, ups=1.07, wpb=435104, bsz=16586.1, num_updates=51300, lr=0.000279236, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=50467 epoch 031: 711 / 1689 loss=3.96, nll_loss=2.318, ppl=4.99, wps=463899, ups=1.07, wpb=435104, bsz=16586.1, num_updates=51300, lr=0.000279236, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=50467 epoch 031: 711 / 1689 loss=3.96, nll_loss=2.318, ppl=4.99, wps=463899, ups=1.07, wpb=435104, bsz=16586.1, num_updates=51300, lr=0.000279236, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=50467 epoch 031: 711 / 1689 loss=3.96, nll_loss=2.318, ppl=4.99, wps=463899, ups=1.07, wpb=435104, bsz=16586.1, num_updates=51300, lr=0.000279236, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=50467 epoch 031: 711 / 1689 loss=3.96, nll_loss=2.318, ppl=4.99, wps=463899, ups=1.07, wpb=435104, bsz=16586.1, num_updates=51300, lr=0.000279236, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=50467 epoch 031: 711 / 1689 loss=3.96, nll_loss=2.318, ppl=4.99, wps=463899, ups=1.07, wpb=435104, bsz=16586.1, num_updates=51300, lr=0.000279236, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=50467 epoch 031: 711 / 1689 loss=3.96, nll_loss=2.318, ppl=4.99, wps=463899, ups=1.07, wpb=435104, bsz=16586.1, num_updates=51300, lr=0.000279236, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=50467 epoch 031: 711 / 1689 loss=3.96, nll_loss=2.318, ppl=4.99, wps=463899, ups=1.07, wpb=435104, bsz=16586.1, num_updates=51300, lr=0.000279236, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=50467 epoch 031: 711 / 1689 loss=3.96, nll_loss=2.318, ppl=4.99, wps=463899, ups=1.07, wpb=435104, bsz=16586.1, num_updates=51300, lr=0.000279236, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=50467 epoch 031: 711 / 1689 loss=3.96, nll_loss=2.318, ppl=4.99, wps=463899, ups=1.07, wpb=435104, bsz=16586.1, num_updates=51300, lr=0.000279236, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=50467 epoch 031: 711 / 1689 loss=3.96, nll_loss=2.318, ppl=4.99, wps=463899, ups=1.07, wpb=435104, bsz=16586.1, num_updates=51300, lr=0.000279236, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=50467 epoch 031: 711 / 1689 loss=3.96, nll_loss=2.318, ppl=4.99, wps=463899, ups=1.07, wpb=435104, bsz=16586.1, num_updates=51300, lr=0.000279236, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=50467 epoch 031: 711 / 1689 loss=3.96, nll_loss=2.318, ppl=4.99, wps=463899, ups=1.07, wpb=435104, bsz=16586.1, num_updates=51300, lr=0.000279236, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=50467 epoch 031: 711 / 1689 loss=3.96, nll_loss=2.318, ppl=4.99, wps=463899, ups=1.07, wpb=435104, bsz=16586.1, num_updates=51300, lr=0.000279236, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=50467 epoch 031: 711 / 1689 loss=3.96, nll_loss=2.318, ppl=4.99, wps=463899, ups=1.07, wpb=435104, bsz=16586.1, num_updates=51300, lr=0.000279236, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=50467 epoch 031: 711 / 1689 loss=3.96, nll_loss=2.318, ppl=4.99, wps=463899, ups=1.07, wpb=435104, bsz=16586.1, num_updates=51300, lr=0.000279236, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=50467 epoch 031: 711 / 1689 loss=3.96, nll_loss=2.318, ppl=4.99, wps=463899, ups=1.07, wpb=435104, bsz=16586.1, num_updates=51300, lr=0.000279236, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=50467 epoch 031: 711 / 1689 loss=3.96, nll_loss=2.318, ppl=4.99, wps=463899, ups=1.07, wpb=435104, bsz=16586.1, num_updates=51300, lr=0.000279236, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=50467 epoch 031: 711 / 1689 loss=3.96, nll_loss=2.318, ppl=4.99, wps=463899, ups=1.07, wpb=435104, bsz=16586.1, num_updates=51300, lr=0.000279236, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=50467 epoch 031: 711 / 1689 loss=3.96, nll_loss=2.318, ppl=4.99, wps=463899, ups=1.07, wpb=435104, bsz=16586.1, num_updates=51300, lr=0.000279236, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=50467 epoch 031: 711 / 1689 loss=3.96, nll_loss=2.318, ppl=4.99, wps=463899, ups=1.07, wpb=435104, bsz=16586.1, num_updates=51300, lr=0.000279236, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=50467 epoch 031: 711 / 1689 loss=3.96, nll_loss=2.318, ppl=4.99, wps=463899, ups=1.07, wpb=435104, bsz=16586.1, num_updates=51300, lr=0.000279236, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=50467 epoch 031: 711 / 1689 loss=3.96, nll_loss=2.318, ppl=4.99, wps=463899, ups=1.07, wpb=435104, bsz=16586.1, num_updates=51300, lr=0.000279236, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=50467 epoch 031: 711 / 1689 loss=3.96, nll_loss=2.318, ppl=4.99, wps=463899, ups=1.07, wpb=435104, bsz=16586.1, num_updates=51300, lr=0.000279236, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=50467 epoch 031: 711 / 1689 loss=3.96, nll_loss=2.318, ppl=4.99, wps=463899, ups=1.07, wpb=435104, bsz=16586.1, num_updates=51300, lr=0.000279236, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=50467 epoch 031: 711 / 1689 loss=3.96, nll_loss=2.318, ppl=4.99, wps=463899, ups=1.07, wpb=435104, bsz=16586.1, num_updates=51300, lr=0.000279236, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=50467 epoch 031: 811 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=467165, ups=1.07, wpb=435906, bsz=16163.2, num_updates=51400, lr=0.000278964, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=50560 epoch 031: 811 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=467165, ups=1.07, wpb=435906, bsz=16163.2, num_updates=51400, lr=0.000278964, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=50560 epoch 031: 811 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=467165, ups=1.07, wpb=435906, bsz=16163.2, num_updates=51400, lr=0.000278964, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=50560 epoch 031: 811 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=467165, ups=1.07, wpb=435906, bsz=16163.2, num_updates=51400, lr=0.000278964, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=50560 epoch 031: 811 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=467165, ups=1.07, wpb=435906, bsz=16163.2, num_updates=51400, lr=0.000278964, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=50560 epoch 031: 811 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=467165, ups=1.07, wpb=435906, bsz=16163.2, num_updates=51400, lr=0.000278964, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=50560 epoch 031: 811 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=467165, ups=1.07, wpb=435906, bsz=16163.2, num_updates=51400, lr=0.000278964, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=50560 epoch 031: 811 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=467165, ups=1.07, wpb=435906, bsz=16163.2, num_updates=51400, lr=0.000278964, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=50560 epoch 031: 811 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=467165, ups=1.07, wpb=435906, bsz=16163.2, num_updates=51400, lr=0.000278964, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=50560 epoch 031: 811 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=467165, ups=1.07, wpb=435906, bsz=16163.2, num_updates=51400, lr=0.000278964, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=50560 epoch 031: 811 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=467165, ups=1.07, wpb=435906, bsz=16163.2, num_updates=51400, lr=0.000278964, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=50560 epoch 031: 811 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=467165, ups=1.07, wpb=435906, bsz=16163.2, num_updates=51400, lr=0.000278964, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=50560 epoch 031: 811 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=467165, ups=1.07, wpb=435906, bsz=16163.2, num_updates=51400, lr=0.000278964, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=50560 epoch 031: 811 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=467165, ups=1.07, wpb=435906, bsz=16163.2, num_updates=51400, lr=0.000278964, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=50560 epoch 031: 811 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=467165, ups=1.07, wpb=435906, bsz=16163.2, num_updates=51400, lr=0.000278964, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=50560 epoch 031: 811 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=467165, ups=1.07, wpb=435906, bsz=16163.2, num_updates=51400, lr=0.000278964, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=50560 epoch 031: 811 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=467165, ups=1.07, wpb=435906, bsz=16163.2, num_updates=51400, lr=0.000278964, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=50560 epoch 031: 811 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=467165, ups=1.07, wpb=435906, bsz=16163.2, num_updates=51400, lr=0.000278964, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=50560 epoch 031: 811 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=467165, ups=1.07, wpb=435906, bsz=16163.2, num_updates=51400, lr=0.000278964, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=50560 epoch 031: 811 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=467165, ups=1.07, wpb=435906, bsz=16163.2, num_updates=51400, lr=0.000278964, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=50560 epoch 031: 811 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=467165, ups=1.07, wpb=435906, bsz=16163.2, num_updates=51400, lr=0.000278964, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=50560 epoch 031: 811 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=467165, ups=1.07, wpb=435906, bsz=16163.2, num_updates=51400, lr=0.000278964, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=50560 epoch 031: 811 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=467165, ups=1.07, wpb=435906, bsz=16163.2, num_updates=51400, lr=0.000278964, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=50560 epoch 031: 811 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=467165, ups=1.07, wpb=435906, bsz=16163.2, num_updates=51400, lr=0.000278964, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=50560 epoch 031: 811 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=467165, ups=1.07, wpb=435906, bsz=16163.2, num_updates=51400, lr=0.000278964, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=50560 epoch 031: 811 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=467165, ups=1.07, wpb=435906, bsz=16163.2, num_updates=51400, lr=0.000278964, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=50560 epoch 031: 811 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=467165, ups=1.07, wpb=435906, bsz=16163.2, num_updates=51400, lr=0.000278964, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=50560 epoch 031: 811 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=467165, ups=1.07, wpb=435906, bsz=16163.2, num_updates=51400, lr=0.000278964, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=50560 epoch 031: 811 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=467165, ups=1.07, wpb=435906, bsz=16163.2, num_updates=51400, lr=0.000278964, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=50560 epoch 031: 811 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=467165, ups=1.07, wpb=435906, bsz=16163.2, num_updates=51400, lr=0.000278964, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=50560 epoch 031: 811 / 1689 loss=3.968, nll_loss=2.327, ppl=5.02, wps=467165, ups=1.07, wpb=435906, bsz=16163.2, num_updates=51400, lr=0.000278964, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=92, gb_free=20.2, wall=50560 epoch 031: 911 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=460150, ups=1.06, wpb=435361, bsz=16430, num_updates=51500, lr=0.000278693, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=50655 epoch 031: 911 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=460150, ups=1.06, wpb=435361, bsz=16430, num_updates=51500, lr=0.000278693, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=50655 epoch 031: 911 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=460150, ups=1.06, wpb=435361, bsz=16430, num_updates=51500, lr=0.000278693, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=50655 epoch 031: 911 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=460150, ups=1.06, wpb=435361, bsz=16430, num_updates=51500, lr=0.000278693, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=50655 epoch 031: 911 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=460150, ups=1.06, wpb=435361, bsz=16430, num_updates=51500, lr=0.000278693, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=50655 epoch 031: 911 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=460150, ups=1.06, wpb=435361, bsz=16430, num_updates=51500, lr=0.000278693, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=50655 epoch 031: 911 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=460150, ups=1.06, wpb=435361, bsz=16430, num_updates=51500, lr=0.000278693, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=50655 epoch 031: 911 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=460150, ups=1.06, wpb=435361, bsz=16430, num_updates=51500, lr=0.000278693, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=50655 epoch 031: 911 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=460150, ups=1.06, wpb=435361, bsz=16430, num_updates=51500, lr=0.000278693, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=50655 epoch 031: 911 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=460150, ups=1.06, wpb=435361, bsz=16430, num_updates=51500, lr=0.000278693, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=50655 epoch 031: 911 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=460150, ups=1.06, wpb=435361, bsz=16430, num_updates=51500, lr=0.000278693, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=50655 epoch 031: 911 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=460150, ups=1.06, wpb=435361, bsz=16430, num_updates=51500, lr=0.000278693, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=50655 epoch 031: 911 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=460150, ups=1.06, wpb=435361, bsz=16430, num_updates=51500, lr=0.000278693, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=50655 epoch 031: 911 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=460150, ups=1.06, wpb=435361, bsz=16430, num_updates=51500, lr=0.000278693, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=50655 epoch 031: 911 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=460150, ups=1.06, wpb=435361, bsz=16430, num_updates=51500, lr=0.000278693, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=50655 epoch 031: 911 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=460150, ups=1.06, wpb=435361, bsz=16430, num_updates=51500, lr=0.000278693, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=50655 epoch 031: 911 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=460150, ups=1.06, wpb=435361, bsz=16430, num_updates=51500, lr=0.000278693, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=50655 epoch 031: 911 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=460150, ups=1.06, wpb=435361, bsz=16430, num_updates=51500, lr=0.000278693, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=50655 epoch 031: 911 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=460150, ups=1.06, wpb=435361, bsz=16430, num_updates=51500, lr=0.000278693, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=50655 epoch 031: 911 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=460150, ups=1.06, wpb=435361, bsz=16430, num_updates=51500, lr=0.000278693, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=50655 epoch 031: 911 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=460150, ups=1.06, wpb=435361, bsz=16430, num_updates=51500, lr=0.000278693, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=50655 epoch 031: 911 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=460150, ups=1.06, wpb=435361, bsz=16430, num_updates=51500, lr=0.000278693, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=50655 epoch 031: 911 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=460150, ups=1.06, wpb=435361, bsz=16430, num_updates=51500, lr=0.000278693, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=50655 epoch 031: 911 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=460150, ups=1.06, wpb=435361, bsz=16430, num_updates=51500, lr=0.000278693, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=50655 epoch 031: 911 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=460150, ups=1.06, wpb=435361, bsz=16430, num_updates=51500, lr=0.000278693, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=50655 epoch 031: 911 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=460150, ups=1.06, wpb=435361, bsz=16430, num_updates=51500, lr=0.000278693, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=50655 epoch 031: 911 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=460150, ups=1.06, wpb=435361, bsz=16430, num_updates=51500, lr=0.000278693, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=50655 epoch 031: 911 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=460150, ups=1.06, wpb=435361, bsz=16430, num_updates=51500, lr=0.000278693, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=50655 epoch 031: 911 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=460150, ups=1.06, wpb=435361, bsz=16430, num_updates=51500, lr=0.000278693, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=50655 epoch 031: 911 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=460150, ups=1.06, wpb=435361, bsz=16430, num_updates=51500, lr=0.000278693, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=50655 epoch 031: 911 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=460150, ups=1.06, wpb=435361, bsz=16430, num_updates=51500, lr=0.000278693, gnorm=0.231, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=50655 epoch 031: 1011 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=456058, ups=1.06, wpb=430705, bsz=16778.2, num_updates=51600, lr=0.000278423, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=50749 epoch 031: 1011 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=456058, ups=1.06, wpb=430705, bsz=16778.2, num_updates=51600, lr=0.000278423, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=50749 epoch 031: 1011 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=456058, ups=1.06, wpb=430705, bsz=16778.2, num_updates=51600, lr=0.000278423, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=50749 epoch 031: 1011 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=456058, ups=1.06, wpb=430705, bsz=16778.2, num_updates=51600, lr=0.000278423, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=50749 epoch 031: 1011 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=456058, ups=1.06, wpb=430705, bsz=16778.2, num_updates=51600, lr=0.000278423, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=50749 epoch 031: 1011 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=456058, ups=1.06, wpb=430705, bsz=16778.2, num_updates=51600, lr=0.000278423, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=50749 epoch 031: 1011 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=456058, ups=1.06, wpb=430705, bsz=16778.2, num_updates=51600, lr=0.000278423, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=50749 epoch 031: 1011 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=456058, ups=1.06, wpb=430705, bsz=16778.2, num_updates=51600, lr=0.000278423, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=50749 epoch 031: 1011 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=456058, ups=1.06, wpb=430705, bsz=16778.2, num_updates=51600, lr=0.000278423, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=50749 epoch 031: 1011 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=456058, ups=1.06, wpb=430705, bsz=16778.2, num_updates=51600, lr=0.000278423, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=50749 epoch 031: 1011 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=456058, ups=1.06, wpb=430705, bsz=16778.2, num_updates=51600, lr=0.000278423, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=50749 epoch 031: 1011 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=456058, ups=1.06, wpb=430705, bsz=16778.2, num_updates=51600, lr=0.000278423, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=50749 epoch 031: 1011 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=456058, ups=1.06, wpb=430705, bsz=16778.2, num_updates=51600, lr=0.000278423, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=50749 epoch 031: 1011 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=456058, ups=1.06, wpb=430705, bsz=16778.2, num_updates=51600, lr=0.000278423, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=50749 epoch 031: 1011 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=456058, ups=1.06, wpb=430705, bsz=16778.2, num_updates=51600, lr=0.000278423, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=50749 epoch 031: 1011 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=456058, ups=1.06, wpb=430705, bsz=16778.2, num_updates=51600, lr=0.000278423, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=50749 epoch 031: 1011 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=456058, ups=1.06, wpb=430705, bsz=16778.2, num_updates=51600, lr=0.000278423, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=50749 epoch 031: 1011 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=456058, ups=1.06, wpb=430705, bsz=16778.2, num_updates=51600, lr=0.000278423, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=50749 epoch 031: 1011 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=456058, ups=1.06, wpb=430705, bsz=16778.2, num_updates=51600, lr=0.000278423, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=50749 epoch 031: 1011 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=456058, ups=1.06, wpb=430705, bsz=16778.2, num_updates=51600, lr=0.000278423, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=50749 epoch 031: 1011 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=456058, ups=1.06, wpb=430705, bsz=16778.2, num_updates=51600, lr=0.000278423, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=50749 epoch 031: 1011 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=456058, ups=1.06, wpb=430705, bsz=16778.2, num_updates=51600, lr=0.000278423, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=50749 epoch 031: 1011 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=456058, ups=1.06, wpb=430705, bsz=16778.2, num_updates=51600, lr=0.000278423, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=50749 epoch 031: 1011 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=456058, ups=1.06, wpb=430705, bsz=16778.2, num_updates=51600, lr=0.000278423, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=50749 epoch 031: 1011 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=456058, ups=1.06, wpb=430705, bsz=16778.2, num_updates=51600, lr=0.000278423, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=50749 epoch 031: 1011 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=456058, ups=1.06, wpb=430705, bsz=16778.2, num_updates=51600, lr=0.000278423, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=50749 epoch 031: 1011 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=456058, ups=1.06, wpb=430705, bsz=16778.2, num_updates=51600, lr=0.000278423, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=50749 epoch 031: 1011 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=456058, ups=1.06, wpb=430705, bsz=16778.2, num_updates=51600, lr=0.000278423, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=50749 epoch 031: 1011 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=456058, ups=1.06, wpb=430705, bsz=16778.2, num_updates=51600, lr=0.000278423, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=50749 epoch 031: 1011 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=456058, ups=1.06, wpb=430705, bsz=16778.2, num_updates=51600, lr=0.000278423, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=50749 epoch 031: 1011 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=456058, ups=1.06, wpb=430705, bsz=16778.2, num_updates=51600, lr=0.000278423, gnorm=0.217, clip=0, loss_scale=1, train_wall=93, gb_free=19.8, wall=50749 epoch 031: 1111 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=461010, ups=1.07, wpb=432802, bsz=16017.6, num_updates=51700, lr=0.000278154, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=50843 epoch 031: 1111 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=461010, ups=1.07, wpb=432802, bsz=16017.6, num_updates=51700, lr=0.000278154, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=50843 epoch 031: 1111 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=461010, ups=1.07, wpb=432802, bsz=16017.6, num_updates=51700, lr=0.000278154, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=50843 epoch 031: 1111 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=461010, ups=1.07, wpb=432802, bsz=16017.6, num_updates=51700, lr=0.000278154, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=50843 epoch 031: 1111 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=461010, ups=1.07, wpb=432802, bsz=16017.6, num_updates=51700, lr=0.000278154, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=50843 epoch 031: 1111 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=461010, ups=1.07, wpb=432802, bsz=16017.6, num_updates=51700, lr=0.000278154, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=50843 epoch 031: 1111 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=461010, ups=1.07, wpb=432802, bsz=16017.6, num_updates=51700, lr=0.000278154, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=50843 epoch 031: 1111 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=461010, ups=1.07, wpb=432802, bsz=16017.6, num_updates=51700, lr=0.000278154, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=50843 epoch 031: 1111 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=461010, ups=1.07, wpb=432802, bsz=16017.6, num_updates=51700, lr=0.000278154, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=50843 epoch 031: 1111 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=461010, ups=1.07, wpb=432802, bsz=16017.6, num_updates=51700, lr=0.000278154, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=50843 epoch 031: 1111 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=461010, ups=1.07, wpb=432802, bsz=16017.6, num_updates=51700, lr=0.000278154, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=50843 epoch 031: 1111 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=461010, ups=1.07, wpb=432802, bsz=16017.6, num_updates=51700, lr=0.000278154, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=50843 epoch 031: 1111 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=461010, ups=1.07, wpb=432802, bsz=16017.6, num_updates=51700, lr=0.000278154, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=50843 epoch 031: 1111 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=461010, ups=1.07, wpb=432802, bsz=16017.6, num_updates=51700, lr=0.000278154, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=50843 epoch 031: 1111 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=461010, ups=1.07, wpb=432802, bsz=16017.6, num_updates=51700, lr=0.000278154, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=50843 epoch 031: 1111 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=461010, ups=1.07, wpb=432802, bsz=16017.6, num_updates=51700, lr=0.000278154, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=50843 epoch 031: 1111 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=461010, ups=1.07, wpb=432802, bsz=16017.6, num_updates=51700, lr=0.000278154, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=50843 epoch 031: 1111 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=461010, ups=1.07, wpb=432802, bsz=16017.6, num_updates=51700, lr=0.000278154, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=50843 epoch 031: 1111 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=461010, ups=1.07, wpb=432802, bsz=16017.6, num_updates=51700, lr=0.000278154, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=50843 epoch 031: 1111 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=461010, ups=1.07, wpb=432802, bsz=16017.6, num_updates=51700, lr=0.000278154, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=50843 epoch 031: 1111 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=461010, ups=1.07, wpb=432802, bsz=16017.6, num_updates=51700, lr=0.000278154, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=50843 epoch 031: 1111 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=461010, ups=1.07, wpb=432802, bsz=16017.6, num_updates=51700, lr=0.000278154, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=50843 epoch 031: 1111 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=461010, ups=1.07, wpb=432802, bsz=16017.6, num_updates=51700, lr=0.000278154, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=50843 epoch 031: 1111 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=461010, ups=1.07, wpb=432802, bsz=16017.6, num_updates=51700, lr=0.000278154, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=50843 epoch 031: 1111 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=461010, ups=1.07, wpb=432802, bsz=16017.6, num_updates=51700, lr=0.000278154, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=50843 epoch 031: 1111 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=461010, ups=1.07, wpb=432802, bsz=16017.6, num_updates=51700, lr=0.000278154, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=50843 epoch 031: 1111 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=461010, ups=1.07, wpb=432802, bsz=16017.6, num_updates=51700, lr=0.000278154, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=50843 epoch 031: 1111 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=461010, ups=1.07, wpb=432802, bsz=16017.6, num_updates=51700, lr=0.000278154, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=50843 epoch 031: 1111 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=461010, ups=1.07, wpb=432802, bsz=16017.6, num_updates=51700, lr=0.000278154, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=50843 epoch 031: 1111 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=461010, ups=1.07, wpb=432802, bsz=16017.6, num_updates=51700, lr=0.000278154, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=50843 epoch 031: 1111 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=461010, ups=1.07, wpb=432802, bsz=16017.6, num_updates=51700, lr=0.000278154, gnorm=0.246, clip=0, loss_scale=1, train_wall=92, gb_free=19.4, wall=50843 epoch 031: 1211 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=456198, ups=1.05, wpb=433933, bsz=16657.2, num_updates=51800, lr=0.000277885, gnorm=0.206, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=50938 epoch 031: 1211 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=456198, ups=1.05, wpb=433933, bsz=16657.2, num_updates=51800, lr=0.000277885, gnorm=0.206, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=50938 epoch 031: 1211 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=456198, ups=1.05, wpb=433933, bsz=16657.2, num_updates=51800, lr=0.000277885, gnorm=0.206, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=50938 epoch 031: 1211 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=456198, ups=1.05, wpb=433933, bsz=16657.2, num_updates=51800, lr=0.000277885, gnorm=0.206, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=50938 epoch 031: 1211 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=456198, ups=1.05, wpb=433933, bsz=16657.2, num_updates=51800, lr=0.000277885, gnorm=0.206, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=50938 epoch 031: 1211 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=456198, ups=1.05, wpb=433933, bsz=16657.2, num_updates=51800, lr=0.000277885, gnorm=0.206, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=50938 epoch 031: 1211 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=456198, ups=1.05, wpb=433933, bsz=16657.2, num_updates=51800, lr=0.000277885, gnorm=0.206, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=50938 epoch 031: 1211 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=456198, ups=1.05, wpb=433933, bsz=16657.2, num_updates=51800, lr=0.000277885, gnorm=0.206, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=50938 epoch 031: 1211 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=456198, ups=1.05, wpb=433933, bsz=16657.2, num_updates=51800, lr=0.000277885, gnorm=0.206, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=50938 epoch 031: 1211 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=456198, ups=1.05, wpb=433933, bsz=16657.2, num_updates=51800, lr=0.000277885, gnorm=0.206, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=50938 epoch 031: 1211 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=456198, ups=1.05, wpb=433933, bsz=16657.2, num_updates=51800, lr=0.000277885, gnorm=0.206, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=50938 epoch 031: 1211 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=456198, ups=1.05, wpb=433933, bsz=16657.2, num_updates=51800, lr=0.000277885, gnorm=0.206, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=50938 epoch 031: 1211 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=456198, ups=1.05, wpb=433933, bsz=16657.2, num_updates=51800, lr=0.000277885, gnorm=0.206, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=50938 epoch 031: 1211 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=456198, ups=1.05, wpb=433933, bsz=16657.2, num_updates=51800, lr=0.000277885, gnorm=0.206, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=50938 epoch 031: 1211 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=456198, ups=1.05, wpb=433933, bsz=16657.2, num_updates=51800, lr=0.000277885, gnorm=0.206, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=50938 epoch 031: 1211 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=456198, ups=1.05, wpb=433933, bsz=16657.2, num_updates=51800, lr=0.000277885, gnorm=0.206, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=50938 epoch 031: 1211 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=456198, ups=1.05, wpb=433933, bsz=16657.2, num_updates=51800, lr=0.000277885, gnorm=0.206, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=50938 epoch 031: 1211 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=456198, ups=1.05, wpb=433933, bsz=16657.2, num_updates=51800, lr=0.000277885, gnorm=0.206, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=50938 epoch 031: 1211 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=456198, ups=1.05, wpb=433933, bsz=16657.2, num_updates=51800, lr=0.000277885, gnorm=0.206, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=50938 epoch 031: 1211 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=456198, ups=1.05, wpb=433933, bsz=16657.2, num_updates=51800, lr=0.000277885, gnorm=0.206, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=50938 epoch 031: 1211 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=456198, ups=1.05, wpb=433933, bsz=16657.2, num_updates=51800, lr=0.000277885, gnorm=0.206, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=50938 epoch 031: 1211 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=456198, ups=1.05, wpb=433933, bsz=16657.2, num_updates=51800, lr=0.000277885, gnorm=0.206, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=50938 epoch 031: 1211 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=456198, ups=1.05, wpb=433933, bsz=16657.2, num_updates=51800, lr=0.000277885, gnorm=0.206, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=50938 epoch 031: 1211 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=456198, ups=1.05, wpb=433933, bsz=16657.2, num_updates=51800, lr=0.000277885, gnorm=0.206, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=50938 epoch 031: 1211 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=456198, ups=1.05, wpb=433933, bsz=16657.2, num_updates=51800, lr=0.000277885, gnorm=0.206, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=50938 epoch 031: 1211 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=456198, ups=1.05, wpb=433933, bsz=16657.2, num_updates=51800, lr=0.000277885, gnorm=0.206, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=50938 epoch 031: 1211 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=456198, ups=1.05, wpb=433933, bsz=16657.2, num_updates=51800, lr=0.000277885, gnorm=0.206, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=50938 epoch 031: 1211 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=456198, ups=1.05, wpb=433933, bsz=16657.2, num_updates=51800, lr=0.000277885, gnorm=0.206, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=50938 epoch 031: 1211 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=456198, ups=1.05, wpb=433933, bsz=16657.2, num_updates=51800, lr=0.000277885, gnorm=0.206, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=50938 epoch 031: 1211 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=456198, ups=1.05, wpb=433933, bsz=16657.2, num_updates=51800, lr=0.000277885, gnorm=0.206, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=50938 epoch 031: 1211 / 1689 loss=3.961, nll_loss=2.319, ppl=4.99, wps=456198, ups=1.05, wpb=433933, bsz=16657.2, num_updates=51800, lr=0.000277885, gnorm=0.206, clip=0, loss_scale=1, train_wall=93, gb_free=18.1, wall=50938 epoch 031: 1312 / 1689 loss=3.962, nll_loss=2.321, ppl=5, wps=452349, ups=1.05, wpb=432432, bsz=16308.8, num_updates=51900, lr=0.000277617, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.1, wall=51034 epoch 031: 1312 / 1689 loss=3.962, nll_loss=2.321, ppl=5, wps=452349, ups=1.05, wpb=432432, bsz=16308.8, num_updates=51900, lr=0.000277617, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.1, wall=51034 epoch 031: 1312 / 1689 loss=3.962, nll_loss=2.321, ppl=5, wps=452349, ups=1.05, wpb=432432, bsz=16308.8, num_updates=51900, lr=0.000277617, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.1, wall=51034 epoch 031: 1312 / 1689 loss=3.962, nll_loss=2.321, ppl=5, wps=452349, ups=1.05, wpb=432432, bsz=16308.8, num_updates=51900, lr=0.000277617, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.1, wall=51034 epoch 031: 1312 / 1689 loss=3.962, nll_loss=2.321, ppl=5, wps=452349, ups=1.05, wpb=432432, bsz=16308.8, num_updates=51900, lr=0.000277617, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.1, wall=51034 epoch 031: 1312 / 1689 loss=3.962, nll_loss=2.321, ppl=5, wps=452349, ups=1.05, wpb=432432, bsz=16308.8, num_updates=51900, lr=0.000277617, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.1, wall=51034 epoch 031: 1312 / 1689 loss=3.962, nll_loss=2.321, ppl=5, wps=452349, ups=1.05, wpb=432432, bsz=16308.8, num_updates=51900, lr=0.000277617, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.1, wall=51034 epoch 031: 1312 / 1689 loss=3.962, nll_loss=2.321, ppl=5, wps=452349, ups=1.05, wpb=432432, bsz=16308.8, num_updates=51900, lr=0.000277617, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.1, wall=51034 epoch 031: 1312 / 1689 loss=3.962, nll_loss=2.321, ppl=5, wps=452349, ups=1.05, wpb=432432, bsz=16308.8, num_updates=51900, lr=0.000277617, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.1, wall=51034 epoch 031: 1312 / 1689 loss=3.962, nll_loss=2.321, ppl=5, wps=452349, ups=1.05, wpb=432432, bsz=16308.8, num_updates=51900, lr=0.000277617, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.1, wall=51034 epoch 031: 1312 / 1689 loss=3.962, nll_loss=2.321, ppl=5, wps=452349, ups=1.05, wpb=432432, bsz=16308.8, num_updates=51900, lr=0.000277617, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.1, wall=51034 epoch 031: 1312 / 1689 loss=3.962, nll_loss=2.321, ppl=5, wps=452349, ups=1.05, wpb=432432, bsz=16308.8, num_updates=51900, lr=0.000277617, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.1, wall=51034 epoch 031: 1312 / 1689 loss=3.962, nll_loss=2.321, ppl=5, wps=452349, ups=1.05, wpb=432432, bsz=16308.8, num_updates=51900, lr=0.000277617, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.1, wall=51034 epoch 031: 1312 / 1689 loss=3.962, nll_loss=2.321, ppl=5, wps=452349, ups=1.05, wpb=432432, bsz=16308.8, num_updates=51900, lr=0.000277617, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.1, wall=51034 epoch 031: 1312 / 1689 loss=3.962, nll_loss=2.321, ppl=5, wps=452349, ups=1.05, wpb=432432, bsz=16308.8, num_updates=51900, lr=0.000277617, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.1, wall=51034 epoch 031: 1312 / 1689 loss=3.962, nll_loss=2.321, ppl=5, wps=452349, ups=1.05, wpb=432432, bsz=16308.8, num_updates=51900, lr=0.000277617, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.1, wall=51034 epoch 031: 1312 / 1689 loss=3.962, nll_loss=2.321, ppl=5, wps=452349, ups=1.05, wpb=432432, bsz=16308.8, num_updates=51900, lr=0.000277617, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.1, wall=51034 epoch 031: 1312 / 1689 loss=3.962, nll_loss=2.321, ppl=5, wps=452349, ups=1.05, wpb=432432, bsz=16308.8, num_updates=51900, lr=0.000277617, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.1, wall=51034 epoch 031: 1312 / 1689 loss=3.962, nll_loss=2.321, ppl=5, wps=452349, ups=1.05, wpb=432432, bsz=16308.8, num_updates=51900, lr=0.000277617, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.1, wall=51034 epoch 031: 1312 / 1689 loss=3.962, nll_loss=2.321, ppl=5, wps=452349, ups=1.05, wpb=432432, bsz=16308.8, num_updates=51900, lr=0.000277617, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.1, wall=51034 epoch 031: 1312 / 1689 loss=3.962, nll_loss=2.321, ppl=5, wps=452349, ups=1.05, wpb=432432, bsz=16308.8, num_updates=51900, lr=0.000277617, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.1, wall=51034 epoch 031: 1312 / 1689 loss=3.962, nll_loss=2.321, ppl=5, wps=452349, ups=1.05, wpb=432432, bsz=16308.8, num_updates=51900, lr=0.000277617, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.1, wall=51034 epoch 031: 1312 / 1689 loss=3.962, nll_loss=2.321, ppl=5, wps=452349, ups=1.05, wpb=432432, bsz=16308.8, num_updates=51900, lr=0.000277617, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.1, wall=51034 epoch 031: 1312 / 1689 loss=3.962, nll_loss=2.321, ppl=5, wps=452349, ups=1.05, wpb=432432, bsz=16308.8, num_updates=51900, lr=0.000277617, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.1, wall=51034 epoch 031: 1312 / 1689 loss=3.962, nll_loss=2.321, ppl=5, wps=452349, ups=1.05, wpb=432432, bsz=16308.8, num_updates=51900, lr=0.000277617, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.1, wall=51034 epoch 031: 1312 / 1689 loss=3.962, nll_loss=2.321, ppl=5, wps=452349, ups=1.05, wpb=432432, bsz=16308.8, num_updates=51900, lr=0.000277617, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.1, wall=51034 epoch 031: 1312 / 1689 loss=3.962, nll_loss=2.321, ppl=5, wps=452349, ups=1.05, wpb=432432, bsz=16308.8, num_updates=51900, lr=0.000277617, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.1, wall=51034 epoch 031: 1312 / 1689 loss=3.962, nll_loss=2.321, ppl=5, wps=452349, ups=1.05, wpb=432432, bsz=16308.8, num_updates=51900, lr=0.000277617, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.1, wall=51034 epoch 031: 1312 / 1689 loss=3.962, nll_loss=2.321, ppl=5, wps=452349, ups=1.05, wpb=432432, bsz=16308.8, num_updates=51900, lr=0.000277617, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.1, wall=51034 epoch 031: 1312 / 1689 loss=3.962, nll_loss=2.321, ppl=5, wps=452349, ups=1.05, wpb=432432, bsz=16308.8, num_updates=51900, lr=0.000277617, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.1, wall=51034 epoch 031: 1312 / 1689 loss=3.962, nll_loss=2.321, ppl=5, wps=452349, ups=1.05, wpb=432432, bsz=16308.8, num_updates=51900, lr=0.000277617, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.1, wall=51034 epoch 031: 1412 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=457316, ups=1.05, wpb=434696, bsz=16865.9, num_updates=52000, lr=0.00027735, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=51129 epoch 031: 1412 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=457316, ups=1.05, wpb=434696, bsz=16865.9, num_updates=52000, lr=0.00027735, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=51129 epoch 031: 1412 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=457316, ups=1.05, wpb=434696, bsz=16865.9, num_updates=52000, lr=0.00027735, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=51129 epoch 031: 1412 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=457316, ups=1.05, wpb=434696, bsz=16865.9, num_updates=52000, lr=0.00027735, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=51129 epoch 031: 1412 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=457316, ups=1.05, wpb=434696, bsz=16865.9, num_updates=52000, lr=0.00027735, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=51129 epoch 031: 1412 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=457316, ups=1.05, wpb=434696, bsz=16865.9, num_updates=52000, lr=0.00027735, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=51129 epoch 031: 1412 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=457316, ups=1.05, wpb=434696, bsz=16865.9, num_updates=52000, lr=0.00027735, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=51129 epoch 031: 1412 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=457316, ups=1.05, wpb=434696, bsz=16865.9, num_updates=52000, lr=0.00027735, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=51129 epoch 031: 1412 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=457316, ups=1.05, wpb=434696, bsz=16865.9, num_updates=52000, lr=0.00027735, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=51129 epoch 031: 1412 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=457316, ups=1.05, wpb=434696, bsz=16865.9, num_updates=52000, lr=0.00027735, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=51129 epoch 031: 1412 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=457316, ups=1.05, wpb=434696, bsz=16865.9, num_updates=52000, lr=0.00027735, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=51129 epoch 031: 1412 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=457316, ups=1.05, wpb=434696, bsz=16865.9, num_updates=52000, lr=0.00027735, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=51129 epoch 031: 1412 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=457316, ups=1.05, wpb=434696, bsz=16865.9, num_updates=52000, lr=0.00027735, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=51129 epoch 031: 1412 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=457316, ups=1.05, wpb=434696, bsz=16865.9, num_updates=52000, lr=0.00027735, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=51129 epoch 031: 1412 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=457316, ups=1.05, wpb=434696, bsz=16865.9, num_updates=52000, lr=0.00027735, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=51129 epoch 031: 1412 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=457316, ups=1.05, wpb=434696, bsz=16865.9, num_updates=52000, lr=0.00027735, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=51129 epoch 031: 1412 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=457316, ups=1.05, wpb=434696, bsz=16865.9, num_updates=52000, lr=0.00027735, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=51129 epoch 031: 1412 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=457316, ups=1.05, wpb=434696, bsz=16865.9, num_updates=52000, lr=0.00027735, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=51129 epoch 031: 1412 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=457316, ups=1.05, wpb=434696, bsz=16865.9, num_updates=52000, lr=0.00027735, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=51129 epoch 031: 1412 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=457316, ups=1.05, wpb=434696, bsz=16865.9, num_updates=52000, lr=0.00027735, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=51129 epoch 031: 1412 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=457316, ups=1.05, wpb=434696, bsz=16865.9, num_updates=52000, lr=0.00027735, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=51129 epoch 031: 1412 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=457316, ups=1.05, wpb=434696, bsz=16865.9, num_updates=52000, lr=0.00027735, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=51129 epoch 031: 1412 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=457316, ups=1.05, wpb=434696, bsz=16865.9, num_updates=52000, lr=0.00027735, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=51129 epoch 031: 1412 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=457316, ups=1.05, wpb=434696, bsz=16865.9, num_updates=52000, lr=0.00027735, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=51129 epoch 031: 1412 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=457316, ups=1.05, wpb=434696, bsz=16865.9, num_updates=52000, lr=0.00027735, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=51129 epoch 031: 1412 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=457316, ups=1.05, wpb=434696, bsz=16865.9, num_updates=52000, lr=0.00027735, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=51129 epoch 031: 1412 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=457316, ups=1.05, wpb=434696, bsz=16865.9, num_updates=52000, lr=0.00027735, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=51129 epoch 031: 1412 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=457316, ups=1.05, wpb=434696, bsz=16865.9, num_updates=52000, lr=0.00027735, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=51129 epoch 031: 1412 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=457316, ups=1.05, wpb=434696, bsz=16865.9, num_updates=52000, lr=0.00027735, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=51129 epoch 031: 1412 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=457316, ups=1.05, wpb=434696, bsz=16865.9, num_updates=52000, lr=0.00027735, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=51129 epoch 031: 1412 / 1689 loss=3.97, nll_loss=2.33, ppl=5.03, wps=457316, ups=1.05, wpb=434696, bsz=16865.9, num_updates=52000, lr=0.00027735, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=51129 begin validation on "valid" subset epoch 031 | valid on 'valid' subset | loss 4.247 | nll_loss 2.625 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.238 epoch 031 | valid on 'valid' subset | loss 4.247 | nll_loss 2.625 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.238 epoch 031 | valid on 'valid' subset | loss 4.247 | nll_loss 2.625 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.238 epoch 031 | valid on 'valid' subset | loss 4.247 | nll_loss 2.625 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.238 epoch 031 | valid on 'valid' subset | loss 4.247 | nll_loss 2.625 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.238 epoch 031 | valid on 'valid' subset | loss 4.247 | nll_loss 2.625 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.238 epoch 031 | valid on 'valid' subset | loss 4.247 | nll_loss 2.625 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.238 epoch 031 | valid on 'valid' subset | loss 4.247 | nll_loss 2.625 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.238 epoch 031 | valid on 'valid' subset | loss 4.247 | nll_loss 2.625 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.238 epoch 031 | valid on 'valid' subset | loss 4.247 | nll_loss 2.625 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.238 epoch 031 | valid on 'valid' subset | loss 4.247 | nll_loss 2.625 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.238 epoch 031 | valid on 'valid' subset | loss 4.247 | nll_loss 2.625 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.238 epoch 031 | valid on 'valid' subset | loss 4.247 | nll_loss 2.625 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.238 epoch 031 | valid on 'valid' subset | loss 4.247 | nll_loss 2.625 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.238 epoch 031 | valid on 'valid' subset | loss 4.247 | nll_loss 2.625 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.238 epoch 031 | valid on 'valid' subset | loss 4.247 | nll_loss 2.625 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.238 epoch 031 | valid on 'valid' subset | loss 4.247 | nll_loss 2.625 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.238 epoch 031 | valid on 'valid' subset | loss 4.247 | nll_loss 2.625 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.238 epoch 031 | valid on 'valid' subset | loss 4.247 | nll_loss 2.625 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.238 epoch 031 | valid on 'valid' subset | loss 4.247 | nll_loss 2.625 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.238 epoch 031 | valid on 'valid' subset | loss 4.247 | nll_loss 2.625 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.238 epoch 031 | valid on 'valid' subset | loss 4.247 | nll_loss 2.625 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.238 epoch 031 | valid on 'valid' subset | loss 4.247 | nll_loss 2.625 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.238 epoch 031 | valid on 'valid' subset | loss 4.247 | nll_loss 2.625 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.238 epoch 031 | valid on 'valid' subset | loss 4.247 | nll_loss 2.625 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.238 epoch 031 | valid on 'valid' subset | loss 4.247 | nll_loss 2.625 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.238 epoch 031 | valid on 'valid' subset | loss 4.247 | nll_loss 2.625 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.238 epoch 031 | valid on 'valid' subset | loss 4.247 | nll_loss 2.625 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.238 epoch 031 | valid on 'valid' subset | loss 4.247 | nll_loss 2.625 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.238 epoch 031 | valid on 'valid' subset | loss 4.247 | nll_loss 2.625 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.238 epoch 031 | valid on 'valid' subset | loss 4.247 | nll_loss 2.625 | ppl 6.17 | wps 0 | wpb 42662 | bsz 2032 | num_updates 52000 | best_loss 4.238 epoch 031: 1512 / 1689 loss=3.968, nll_loss=2.328, ppl=5.02, wps=377464, ups=0.87, wpb=433860, bsz=16918.2, num_updates=52100, lr=0.000277084, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=96, gb_free=19.3, wall=51244 epoch 031: 1512 / 1689 loss=3.968, nll_loss=2.328, ppl=5.02, wps=377464, ups=0.87, wpb=433860, bsz=16918.2, num_updates=52100, lr=0.000277084, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=96, gb_free=19.3, wall=51244 epoch 031: 1512 / 1689 loss=3.968, nll_loss=2.328, ppl=5.02, wps=377464, ups=0.87, wpb=433860, bsz=16918.2, num_updates=52100, lr=0.000277084, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=96, gb_free=19.3, wall=51244 epoch 031: 1512 / 1689 loss=3.968, nll_loss=2.328, ppl=5.02, wps=377464, ups=0.87, wpb=433860, bsz=16918.2, num_updates=52100, lr=0.000277084, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=96, gb_free=19.3, wall=51244 epoch 031: 1512 / 1689 loss=3.968, nll_loss=2.328, ppl=5.02, wps=377464, ups=0.87, wpb=433860, bsz=16918.2, num_updates=52100, lr=0.000277084, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=96, gb_free=19.3, wall=51244 epoch 031: 1512 / 1689 loss=3.968, nll_loss=2.328, ppl=5.02, wps=377464, ups=0.87, wpb=433860, bsz=16918.2, num_updates=52100, lr=0.000277084, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=96, gb_free=19.3, wall=51244 epoch 031: 1512 / 1689 loss=3.968, nll_loss=2.328, ppl=5.02, wps=377464, ups=0.87, wpb=433860, bsz=16918.2, num_updates=52100, lr=0.000277084, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=96, gb_free=19.3, wall=51244 epoch 031: 1512 / 1689 loss=3.968, nll_loss=2.328, ppl=5.02, wps=377464, ups=0.87, wpb=433860, bsz=16918.2, num_updates=52100, lr=0.000277084, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=96, gb_free=19.3, wall=51244 epoch 031: 1512 / 1689 loss=3.968, nll_loss=2.328, ppl=5.02, wps=377464, ups=0.87, wpb=433860, bsz=16918.2, num_updates=52100, lr=0.000277084, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=96, gb_free=19.3, wall=51244 epoch 031: 1512 / 1689 loss=3.968, nll_loss=2.328, ppl=5.02, wps=377464, ups=0.87, wpb=433860, bsz=16918.2, num_updates=52100, lr=0.000277084, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=96, gb_free=19.3, wall=51244 epoch 031: 1512 / 1689 loss=3.968, nll_loss=2.328, ppl=5.02, wps=377464, ups=0.87, wpb=433860, bsz=16918.2, num_updates=52100, lr=0.000277084, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=96, gb_free=19.3, wall=51244 epoch 031: 1512 / 1689 loss=3.968, nll_loss=2.328, ppl=5.02, wps=377464, ups=0.87, wpb=433860, bsz=16918.2, num_updates=52100, lr=0.000277084, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=96, gb_free=19.3, wall=51244 epoch 031: 1512 / 1689 loss=3.968, nll_loss=2.328, ppl=5.02, wps=377464, ups=0.87, wpb=433860, bsz=16918.2, num_updates=52100, lr=0.000277084, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=96, gb_free=19.3, wall=51244 epoch 031: 1512 / 1689 loss=3.968, nll_loss=2.328, ppl=5.02, wps=377464, ups=0.87, wpb=433860, bsz=16918.2, num_updates=52100, lr=0.000277084, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=96, gb_free=19.3, wall=51244 epoch 031: 1512 / 1689 loss=3.968, nll_loss=2.328, ppl=5.02, wps=377464, ups=0.87, wpb=433860, bsz=16918.2, num_updates=52100, lr=0.000277084, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=96, gb_free=19.3, wall=51244 epoch 031: 1512 / 1689 loss=3.968, nll_loss=2.328, ppl=5.02, wps=377464, ups=0.87, wpb=433860, bsz=16918.2, num_updates=52100, lr=0.000277084, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=96, gb_free=19.3, wall=51244 epoch 031: 1512 / 1689 loss=3.968, nll_loss=2.328, ppl=5.02, wps=377464, ups=0.87, wpb=433860, bsz=16918.2, num_updates=52100, lr=0.000277084, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=96, gb_free=19.3, wall=51244 epoch 031: 1512 / 1689 loss=3.968, nll_loss=2.328, ppl=5.02, wps=377464, ups=0.87, wpb=433860, bsz=16918.2, num_updates=52100, lr=0.000277084, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=96, gb_free=19.3, wall=51244 epoch 031: 1512 / 1689 loss=3.968, nll_loss=2.328, ppl=5.02, wps=377464, ups=0.87, wpb=433860, bsz=16918.2, num_updates=52100, lr=0.000277084, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=96, gb_free=19.3, wall=51244 epoch 031: 1512 / 1689 loss=3.968, nll_loss=2.328, ppl=5.02, wps=377464, ups=0.87, wpb=433860, bsz=16918.2, num_updates=52100, lr=0.000277084, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=96, gb_free=19.3, wall=51244 epoch 031: 1512 / 1689 loss=3.968, nll_loss=2.328, ppl=5.02, wps=377464, ups=0.87, wpb=433860, bsz=16918.2, num_updates=52100, lr=0.000277084, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=96, gb_free=19.3, wall=51244 epoch 031: 1512 / 1689 loss=3.968, nll_loss=2.328, ppl=5.02, wps=377464, ups=0.87, wpb=433860, bsz=16918.2, num_updates=52100, lr=0.000277084, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=96, gb_free=19.3, wall=51244 epoch 031: 1512 / 1689 loss=3.968, nll_loss=2.328, ppl=5.02, wps=377464, ups=0.87, wpb=433860, bsz=16918.2, num_updates=52100, lr=0.000277084, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=96, gb_free=19.3, wall=51244 epoch 031: 1512 / 1689 loss=3.968, nll_loss=2.328, ppl=5.02, wps=377464, ups=0.87, wpb=433860, bsz=16918.2, num_updates=52100, lr=0.000277084, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=96, gb_free=19.3, wall=51244 epoch 031: 1512 / 1689 loss=3.968, nll_loss=2.328, ppl=5.02, wps=377464, ups=0.87, wpb=433860, bsz=16918.2, num_updates=52100, lr=0.000277084, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=96, gb_free=19.3, wall=51244 epoch 031: 1512 / 1689 loss=3.968, nll_loss=2.328, ppl=5.02, wps=377464, ups=0.87, wpb=433860, bsz=16918.2, num_updates=52100, lr=0.000277084, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=96, gb_free=19.3, wall=51244 epoch 031: 1512 / 1689 loss=3.968, nll_loss=2.328, ppl=5.02, wps=377464, ups=0.87, wpb=433860, bsz=16918.2, num_updates=52100, lr=0.000277084, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=96, gb_free=19.3, wall=51244 epoch 031: 1512 / 1689 loss=3.968, nll_loss=2.328, ppl=5.02, wps=377464, ups=0.87, wpb=433860, bsz=16918.2, num_updates=52100, lr=0.000277084, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=96, gb_free=19.3, wall=51244 epoch 031: 1512 / 1689 loss=3.968, nll_loss=2.328, ppl=5.02, wps=377464, ups=0.87, wpb=433860, bsz=16918.2, num_updates=52100, lr=0.000277084, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=96, gb_free=19.3, wall=51244 epoch 031: 1512 / 1689 loss=3.968, nll_loss=2.328, ppl=5.02, wps=377464, ups=0.87, wpb=433860, bsz=16918.2, num_updates=52100, lr=0.000277084, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=96, gb_free=19.3, wall=51244 epoch 031: 1512 / 1689 loss=3.968, nll_loss=2.328, ppl=5.02, wps=377464, ups=0.87, wpb=433860, bsz=16918.2, num_updates=52100, lr=0.000277084, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=96, gb_free=19.3, wall=51244 epoch 031: 1613 / 1689 loss=3.976, nll_loss=2.337, ppl=5.05, wps=458300, ups=1.05, wpb=434427, bsz=16292.6, num_updates=52200, lr=0.000276818, gnorm=0.231, clip=0, loss_scale=0.25, train_wall=94, gb_free=21, wall=51338 epoch 031: 1613 / 1689 loss=3.976, nll_loss=2.337, ppl=5.05, wps=458300, ups=1.05, wpb=434427, bsz=16292.6, num_updates=52200, lr=0.000276818, gnorm=0.231, clip=0, loss_scale=0.25, train_wall=94, gb_free=21, wall=51338 epoch 031: 1613 / 1689 loss=3.976, nll_loss=2.337, ppl=5.05, wps=458300, ups=1.05, wpb=434427, bsz=16292.6, num_updates=52200, lr=0.000276818, gnorm=0.231, clip=0, loss_scale=0.25, train_wall=94, gb_free=21, wall=51338 epoch 031: 1613 / 1689 loss=3.976, nll_loss=2.337, ppl=5.05, wps=458300, ups=1.05, wpb=434427, bsz=16292.6, num_updates=52200, lr=0.000276818, gnorm=0.231, clip=0, loss_scale=0.25, train_wall=94, gb_free=21, wall=51338 epoch 031: 1613 / 1689 loss=3.976, nll_loss=2.337, ppl=5.05, wps=458300, ups=1.05, wpb=434427, bsz=16292.6, num_updates=52200, lr=0.000276818, gnorm=0.231, clip=0, loss_scale=0.25, train_wall=94, gb_free=21, wall=51338 epoch 031: 1613 / 1689 loss=3.976, nll_loss=2.337, ppl=5.05, wps=458300, ups=1.05, wpb=434427, bsz=16292.6, num_updates=52200, lr=0.000276818, gnorm=0.231, clip=0, loss_scale=0.25, train_wall=94, gb_free=21, wall=51338 epoch 031: 1613 / 1689 loss=3.976, nll_loss=2.337, ppl=5.05, wps=458300, ups=1.05, wpb=434427, bsz=16292.6, num_updates=52200, lr=0.000276818, gnorm=0.231, clip=0, loss_scale=0.25, train_wall=94, gb_free=21, wall=51338 epoch 031: 1613 / 1689 loss=3.976, nll_loss=2.337, ppl=5.05, wps=458300, ups=1.05, wpb=434427, bsz=16292.6, num_updates=52200, lr=0.000276818, gnorm=0.231, clip=0, loss_scale=0.25, train_wall=94, gb_free=21, wall=51338 epoch 031: 1613 / 1689 loss=3.976, nll_loss=2.337, ppl=5.05, wps=458300, ups=1.05, wpb=434427, bsz=16292.6, num_updates=52200, lr=0.000276818, gnorm=0.231, clip=0, loss_scale=0.25, train_wall=94, gb_free=21, wall=51338 epoch 031: 1613 / 1689 loss=3.976, nll_loss=2.337, ppl=5.05, wps=458300, ups=1.05, wpb=434427, bsz=16292.6, num_updates=52200, lr=0.000276818, gnorm=0.231, clip=0, loss_scale=0.25, train_wall=94, gb_free=21, wall=51338 epoch 031: 1613 / 1689 loss=3.976, nll_loss=2.337, ppl=5.05, wps=458300, ups=1.05, wpb=434427, bsz=16292.6, num_updates=52200, lr=0.000276818, gnorm=0.231, clip=0, loss_scale=0.25, train_wall=94, gb_free=21, wall=51338 epoch 031: 1613 / 1689 loss=3.976, nll_loss=2.337, ppl=5.05, wps=458300, ups=1.05, wpb=434427, bsz=16292.6, num_updates=52200, lr=0.000276818, gnorm=0.231, clip=0, loss_scale=0.25, train_wall=94, gb_free=21, wall=51338 epoch 031: 1613 / 1689 loss=3.976, nll_loss=2.337, ppl=5.05, wps=458300, ups=1.05, wpb=434427, bsz=16292.6, num_updates=52200, lr=0.000276818, gnorm=0.231, clip=0, loss_scale=0.25, train_wall=94, gb_free=21, wall=51338 epoch 031: 1613 / 1689 loss=3.976, nll_loss=2.337, ppl=5.05, wps=458300, ups=1.05, wpb=434427, bsz=16292.6, num_updates=52200, lr=0.000276818, gnorm=0.231, clip=0, loss_scale=0.25, train_wall=94, gb_free=21, wall=51338 epoch 031: 1613 / 1689 loss=3.976, nll_loss=2.337, ppl=5.05, wps=458300, ups=1.05, wpb=434427, bsz=16292.6, num_updates=52200, lr=0.000276818, gnorm=0.231, clip=0, loss_scale=0.25, train_wall=94, gb_free=21, wall=51338 epoch 031: 1613 / 1689 loss=3.976, nll_loss=2.337, ppl=5.05, wps=458300, ups=1.05, wpb=434427, bsz=16292.6, num_updates=52200, lr=0.000276818, gnorm=0.231, clip=0, loss_scale=0.25, train_wall=94, gb_free=21, wall=51338 epoch 031: 1613 / 1689 loss=3.976, nll_loss=2.337, ppl=5.05, wps=458300, ups=1.05, wpb=434427, bsz=16292.6, num_updates=52200, lr=0.000276818, gnorm=0.231, clip=0, loss_scale=0.25, train_wall=94, gb_free=21, wall=51338 epoch 031: 1613 / 1689 loss=3.976, nll_loss=2.337, ppl=5.05, wps=458300, ups=1.05, wpb=434427, bsz=16292.6, num_updates=52200, lr=0.000276818, gnorm=0.231, clip=0, loss_scale=0.25, train_wall=94, gb_free=21, wall=51338 epoch 031: 1613 / 1689 loss=3.976, nll_loss=2.337, ppl=5.05, wps=458300, ups=1.05, wpb=434427, bsz=16292.6, num_updates=52200, lr=0.000276818, gnorm=0.231, clip=0, loss_scale=0.25, train_wall=94, gb_free=21, wall=51338 epoch 031: 1613 / 1689 loss=3.976, nll_loss=2.337, ppl=5.05, wps=458300, ups=1.05, wpb=434427, bsz=16292.6, num_updates=52200, lr=0.000276818, gnorm=0.231, clip=0, loss_scale=0.25, train_wall=94, gb_free=21, wall=51338 epoch 031: 1613 / 1689 loss=3.976, nll_loss=2.337, ppl=5.05, wps=458300, ups=1.05, wpb=434427, bsz=16292.6, num_updates=52200, lr=0.000276818, gnorm=0.231, clip=0, loss_scale=0.25, train_wall=94, gb_free=21, wall=51338 epoch 031: 1613 / 1689 loss=3.976, nll_loss=2.337, ppl=5.05, wps=458300, ups=1.05, wpb=434427, bsz=16292.6, num_updates=52200, lr=0.000276818, gnorm=0.231, clip=0, loss_scale=0.25, train_wall=94, gb_free=21, wall=51338 epoch 031: 1613 / 1689 loss=3.976, nll_loss=2.337, ppl=5.05, wps=458300, ups=1.05, wpb=434427, bsz=16292.6, num_updates=52200, lr=0.000276818, gnorm=0.231, clip=0, loss_scale=0.25, train_wall=94, gb_free=21, wall=51338 epoch 031: 1613 / 1689 loss=3.976, nll_loss=2.337, ppl=5.05, wps=458300, ups=1.05, wpb=434427, bsz=16292.6, num_updates=52200, lr=0.000276818, gnorm=0.231, clip=0, loss_scale=0.25, train_wall=94, gb_free=21, wall=51338 epoch 031: 1613 / 1689 loss=3.976, nll_loss=2.337, ppl=5.05, wps=458300, ups=1.05, wpb=434427, bsz=16292.6, num_updates=52200, lr=0.000276818, gnorm=0.231, clip=0, loss_scale=0.25, train_wall=94, gb_free=21, wall=51338 epoch 031: 1613 / 1689 loss=3.976, nll_loss=2.337, ppl=5.05, wps=458300, ups=1.05, wpb=434427, bsz=16292.6, num_updates=52200, lr=0.000276818, gnorm=0.231, clip=0, loss_scale=0.25, train_wall=94, gb_free=21, wall=51338 epoch 031: 1613 / 1689 loss=3.976, nll_loss=2.337, ppl=5.05, wps=458300, ups=1.05, wpb=434427, bsz=16292.6, num_updates=52200, lr=0.000276818, gnorm=0.231, clip=0, loss_scale=0.25, train_wall=94, gb_free=21, wall=51338 epoch 031: 1613 / 1689 loss=3.976, nll_loss=2.337, ppl=5.05, wps=458300, ups=1.05, wpb=434427, bsz=16292.6, num_updates=52200, lr=0.000276818, gnorm=0.231, clip=0, loss_scale=0.25, train_wall=94, gb_free=21, wall=51338 epoch 031: 1613 / 1689 loss=3.976, nll_loss=2.337, ppl=5.05, wps=458300, ups=1.05, wpb=434427, bsz=16292.6, num_updates=52200, lr=0.000276818, gnorm=0.231, clip=0, loss_scale=0.25, train_wall=94, gb_free=21, wall=51338 epoch 031: 1613 / 1689 loss=3.976, nll_loss=2.337, ppl=5.05, wps=458300, ups=1.05, wpb=434427, bsz=16292.6, num_updates=52200, lr=0.000276818, gnorm=0.231, clip=0, loss_scale=0.25, train_wall=94, gb_free=21, wall=51338 epoch 031: 1613 / 1689 loss=3.976, nll_loss=2.337, ppl=5.05, wps=458300, ups=1.05, wpb=434427, bsz=16292.6, num_updates=52200, lr=0.000276818, gnorm=0.231, clip=0, loss_scale=0.25, train_wall=94, gb_free=21, wall=51338 end of epoch 31 (average epoch stats below) epoch 031 | loss 3.957 | nll_loss 2.314 | ppl 4.97 | wps 449502 | ups 1.04 | wpb 433536 | bsz 16506.8 | num_updates 52276 | lr 0.000276617 | gnorm 0.218 | clip 0 | loss_scale 0.25 | train_wall 1570 | gb_free 20.3 | wall 51409 epoch 031 | loss 3.957 | nll_loss 2.314 | ppl 4.97 | wps 449502 | ups 1.04 | wpb 433536 | bsz 16506.8 | num_updates 52276 | lr 0.000276617 | gnorm 0.218 | clip 0 | loss_scale 0.25 | train_wall 1570 | gb_free 20.3 | wall 51409 epoch 031 | loss 3.957 | nll_loss 2.314 | ppl 4.97 | wps 449502 | ups 1.04 | wpb 433536 | bsz 16506.8 | num_updates 52276 | lr 0.000276617 | gnorm 0.218 | clip 0 | loss_scale 0.25 | train_wall 1570 | gb_free 20.3 | wall 51409 epoch 031 | loss 3.957 | nll_loss 2.314 | ppl 4.97 | wps 449502 | ups 1.04 | wpb 433536 | bsz 16506.8 | num_updates 52276 | lr 0.000276617 | gnorm 0.218 | clip 0 | loss_scale 0.25 | train_wall 1570 | gb_free 20.3 | wall 51409 epoch 031 | loss 3.957 | nll_loss 2.314 | ppl 4.97 | wps 449502 | ups 1.04 | wpb 433536 | bsz 16506.8 | num_updates 52276 | lr 0.000276617 | gnorm 0.218 | clip 0 | loss_scale 0.25 | train_wall 1570 | gb_free 20.3 | wall 51409 epoch 031 | loss 3.957 | nll_loss 2.314 | ppl 4.97 | wps 449502 | ups 1.04 | wpb 433536 | bsz 16506.8 | num_updates 52276 | lr 0.000276617 | gnorm 0.218 | clip 0 | loss_scale 0.25 | train_wall 1570 | gb_free 20.3 | wall 51409 epoch 031 | loss 3.957 | nll_loss 2.314 | ppl 4.97 | wps 449502 | ups 1.04 | wpb 433536 | bsz 16506.8 | num_updates 52276 | lr 0.000276617 | gnorm 0.218 | clip 0 | loss_scale 0.25 | train_wall 1570 | gb_free 20.3 | wall 51409 epoch 031 | loss 3.957 | nll_loss 2.314 | ppl 4.97 | wps 449502 | ups 1.04 | wpb 433536 | bsz 16506.8 | num_updates 52276 | lr 0.000276617 | gnorm 0.218 | clip 0 | loss_scale 0.25 | train_wall 1570 | gb_free 20.3 | wall 51409 epoch 031 | loss 3.957 | nll_loss 2.314 | ppl 4.97 | wps 449502 | ups 1.04 | wpb 433536 | bsz 16506.8 | num_updates 52276 | lr 0.000276617 | gnorm 0.218 | clip 0 | loss_scale 0.25 | train_wall 1570 | gb_free 20.3 | wall 51409 epoch 031 | loss 3.957 | nll_loss 2.314 | ppl 4.97 | wps 449502 | ups 1.04 | wpb 433536 | bsz 16506.8 | num_updates 52276 | lr 0.000276617 | gnorm 0.218 | clip 0 | loss_scale 0.25 | train_wall 1570 | gb_free 20.3 | wall 51409 epoch 031 | loss 3.957 | nll_loss 2.314 | ppl 4.97 | wps 449502 | ups 1.04 | wpb 433536 | bsz 16506.8 | num_updates 52276 | lr 0.000276617 | gnorm 0.218 | clip 0 | loss_scale 0.25 | train_wall 1570 | gb_free 20.3 | wall 51409 epoch 031 | loss 3.957 | nll_loss 2.314 | ppl 4.97 | wps 449502 | ups 1.04 | wpb 433536 | bsz 16506.8 | num_updates 52276 | lr 0.000276617 | gnorm 0.218 | clip 0 | loss_scale 0.25 | train_wall 1570 | gb_free 20.3 | wall 51409 epoch 031 | loss 3.957 | nll_loss 2.314 | ppl 4.97 | wps 449502 | ups 1.04 | wpb 433536 | bsz 16506.8 | num_updates 52276 | lr 0.000276617 | gnorm 0.218 | clip 0 | loss_scale 0.25 | train_wall 1570 | gb_free 20.3 | wall 51409 epoch 031 | loss 3.957 | nll_loss 2.314 | ppl 4.97 | wps 449502 | ups 1.04 | wpb 433536 | bsz 16506.8 | num_updates 52276 | lr 0.000276617 | gnorm 0.218 | clip 0 | loss_scale 0.25 | train_wall 1570 | gb_free 20.3 | wall 51409 epoch 031 | loss 3.957 | nll_loss 2.314 | ppl 4.97 | wps 449502 | ups 1.04 | wpb 433536 | bsz 16506.8 | num_updates 52276 | lr 0.000276617 | gnorm 0.218 | clip 0 | loss_scale 0.25 | train_wall 1570 | gb_free 20.3 | wall 51409 epoch 031 | loss 3.957 | nll_loss 2.314 | ppl 4.97 | wps 449502 | ups 1.04 | wpb 433536 | bsz 16506.8 | num_updates 52276 | lr 0.000276617 | gnorm 0.218 | clip 0 | loss_scale 0.25 | train_wall 1570 | gb_free 20.3 | wall 51409 epoch 031 | loss 3.957 | nll_loss 2.314 | ppl 4.97 | wps 449502 | ups 1.04 | wpb 433536 | bsz 16506.8 | num_updates 52276 | lr 0.000276617 | gnorm 0.218 | clip 0 | loss_scale 0.25 | train_wall 1570 | gb_free 20.3 | wall 51409 epoch 031 | loss 3.957 | nll_loss 2.314 | ppl 4.97 | wps 449502 | ups 1.04 | wpb 433536 | bsz 16506.8 | num_updates 52276 | lr 0.000276617 | gnorm 0.218 | clip 0 | loss_scale 0.25 | train_wall 1570 | gb_free 20.3 | wall 51409 epoch 031 | loss 3.957 | nll_loss 2.314 | ppl 4.97 | wps 449502 | ups 1.04 | wpb 433536 | bsz 16506.8 | num_updates 52276 | lr 0.000276617 | gnorm 0.218 | clip 0 | loss_scale 0.25 | train_wall 1570 | gb_free 20.3 | wall 51409 epoch 031 | loss 3.957 | nll_loss 2.314 | ppl 4.97 | wps 449502 | ups 1.04 | wpb 433536 | bsz 16506.8 | num_updates 52276 | lr 0.000276617 | gnorm 0.218 | clip 0 | loss_scale 0.25 | train_wall 1570 | gb_free 20.3 | wall 51409 epoch 031 | loss 3.957 | nll_loss 2.314 | ppl 4.97 | wps 449502 | ups 1.04 | wpb 433536 | bsz 16506.8 | num_updates 52276 | lr 0.000276617 | gnorm 0.218 | clip 0 | loss_scale 0.25 | train_wall 1570 | gb_free 20.3 | wall 51409 epoch 031 | loss 3.957 | nll_loss 2.314 | ppl 4.97 | wps 449502 | ups 1.04 | wpb 433536 | bsz 16506.8 | num_updates 52276 | lr 0.000276617 | gnorm 0.218 | clip 0 | loss_scale 0.25 | train_wall 1570 | gb_free 20.3 | wall 51409 epoch 031 | loss 3.957 | nll_loss 2.314 | ppl 4.97 | wps 449502 | ups 1.04 | wpb 433536 | bsz 16506.8 | num_updates 52276 | lr 0.000276617 | gnorm 0.218 | clip 0 | loss_scale 0.25 | train_wall 1570 | gb_free 20.3 | wall 51409 epoch 031 | loss 3.957 | nll_loss 2.314 | ppl 4.97 | wps 449502 | ups 1.04 | wpb 433536 | bsz 16506.8 | num_updates 52276 | lr 0.000276617 | gnorm 0.218 | clip 0 | loss_scale 0.25 | train_wall 1570 | gb_free 20.3 | wall 51409 epoch 031 | loss 3.957 | nll_loss 2.314 | ppl 4.97 | wps 449502 | ups 1.04 | wpb 433536 | bsz 16506.8 | num_updates 52276 | lr 0.000276617 | gnorm 0.218 | clip 0 | loss_scale 0.25 | train_wall 1570 | gb_free 20.3 | wall 51409 epoch 031 | loss 3.957 | nll_loss 2.314 | ppl 4.97 | wps 449502 | ups 1.04 | wpb 433536 | bsz 16506.8 | num_updates 52276 | lr 0.000276617 | gnorm 0.218 | clip 0 | loss_scale 0.25 | train_wall 1570 | gb_free 20.3 | wall 51409 epoch 031 | loss 3.957 | nll_loss 2.314 | ppl 4.97 | wps 449502 | ups 1.04 | wpb 433536 | bsz 16506.8 | num_updates 52276 | lr 0.000276617 | gnorm 0.218 | clip 0 | loss_scale 0.25 | train_wall 1570 | gb_free 20.3 | wall 51409 epoch 031 | loss 3.957 | nll_loss 2.314 | ppl 4.97 | wps 449502 | ups 1.04 | wpb 433536 | bsz 16506.8 | num_updates 52276 | lr 0.000276617 | gnorm 0.218 | clip 0 | loss_scale 0.25 | train_wall 1570 | gb_free 20.3 | wall 51409 epoch 031 | loss 3.957 | nll_loss 2.314 | ppl 4.97 | wps 449502 | ups 1.04 | wpb 433536 | bsz 16506.8 | num_updates 52276 | lr 0.000276617 | gnorm 0.218 | clip 0 | loss_scale 0.25 | train_wall 1570 | gb_free 20.3 | wall 51409 epoch 031 | loss 3.957 | nll_loss 2.314 | ppl 4.97 | wps 449502 | ups 1.04 | wpb 433536 | bsz 16506.8 | num_updates 52276 | lr 0.000276617 | gnorm 0.218 | clip 0 | loss_scale 0.25 | train_wall 1570 | gb_free 20.3 | wall 51409 epoch 031 | loss 3.957 | nll_loss 2.314 | ppl 4.97 | wps 449502 | ups 1.04 | wpb 433536 | bsz 16506.8 | num_updates 52276 | lr 0.000276617 | gnorm 0.218 | clip 0 | loss_scale 0.25 | train_wall 1570 | gb_free 20.3 | wall 51409 Start iterating over samples epoch 032: 24 / 1689 loss=3.95, nll_loss=2.307, ppl=4.95, wps=452517, ups=1.06, wpb=428898, bsz=16617, num_updates=52300, lr=0.000276553, gnorm=0.207, clip=0, loss_scale=0.25, train_wall=92, gb_free=19, wall=51433 epoch 032: 24 / 1689 loss=3.95, nll_loss=2.307, ppl=4.95, wps=452517, ups=1.06, wpb=428898, bsz=16617, num_updates=52300, lr=0.000276553, gnorm=0.207, clip=0, loss_scale=0.25, train_wall=92, gb_free=19, wall=51433 epoch 032: 24 / 1689 loss=3.95, nll_loss=2.307, ppl=4.95, wps=452517, ups=1.06, wpb=428898, bsz=16617, num_updates=52300, lr=0.000276553, gnorm=0.207, clip=0, loss_scale=0.25, train_wall=92, gb_free=19, wall=51433 epoch 032: 24 / 1689 loss=3.95, nll_loss=2.307, ppl=4.95, wps=452517, ups=1.06, wpb=428898, bsz=16617, num_updates=52300, lr=0.000276553, gnorm=0.207, clip=0, loss_scale=0.25, train_wall=92, gb_free=19, wall=51433 epoch 032: 24 / 1689 loss=3.95, nll_loss=2.307, ppl=4.95, wps=452517, ups=1.06, wpb=428898, bsz=16617, num_updates=52300, lr=0.000276553, gnorm=0.207, clip=0, loss_scale=0.25, train_wall=92, gb_free=19, wall=51433 epoch 032: 24 / 1689 loss=3.95, nll_loss=2.307, ppl=4.95, wps=452517, ups=1.06, wpb=428898, bsz=16617, num_updates=52300, lr=0.000276553, gnorm=0.207, clip=0, loss_scale=0.25, train_wall=92, gb_free=19, wall=51433 epoch 032: 24 / 1689 loss=3.95, nll_loss=2.307, ppl=4.95, wps=452517, ups=1.06, wpb=428898, bsz=16617, num_updates=52300, lr=0.000276553, gnorm=0.207, clip=0, loss_scale=0.25, train_wall=92, gb_free=19, wall=51433 epoch 032: 24 / 1689 loss=3.95, nll_loss=2.307, ppl=4.95, wps=452517, ups=1.06, wpb=428898, bsz=16617, num_updates=52300, lr=0.000276553, gnorm=0.207, clip=0, loss_scale=0.25, train_wall=92, gb_free=19, wall=51433 epoch 032: 24 / 1689 loss=3.95, nll_loss=2.307, ppl=4.95, wps=452517, ups=1.06, wpb=428898, bsz=16617, num_updates=52300, lr=0.000276553, gnorm=0.207, clip=0, loss_scale=0.25, train_wall=92, gb_free=19, wall=51433 epoch 032: 24 / 1689 loss=3.95, nll_loss=2.307, ppl=4.95, wps=452517, ups=1.06, wpb=428898, bsz=16617, num_updates=52300, lr=0.000276553, gnorm=0.207, clip=0, loss_scale=0.25, train_wall=92, gb_free=19, wall=51433 epoch 032: 24 / 1689 loss=3.95, nll_loss=2.307, ppl=4.95, wps=452517, ups=1.06, wpb=428898, bsz=16617, num_updates=52300, lr=0.000276553, gnorm=0.207, clip=0, loss_scale=0.25, train_wall=92, gb_free=19, wall=51433 epoch 032: 24 / 1689 loss=3.95, nll_loss=2.307, ppl=4.95, wps=452517, ups=1.06, wpb=428898, bsz=16617, num_updates=52300, lr=0.000276553, gnorm=0.207, clip=0, loss_scale=0.25, train_wall=92, gb_free=19, wall=51433 epoch 032: 24 / 1689 loss=3.95, nll_loss=2.307, ppl=4.95, wps=452517, ups=1.06, wpb=428898, bsz=16617, num_updates=52300, lr=0.000276553, gnorm=0.207, clip=0, loss_scale=0.25, train_wall=92, gb_free=19, wall=51433 epoch 032: 24 / 1689 loss=3.95, nll_loss=2.307, ppl=4.95, wps=452517, ups=1.06, wpb=428898, bsz=16617, num_updates=52300, lr=0.000276553, gnorm=0.207, clip=0, loss_scale=0.25, train_wall=92, gb_free=19, wall=51433 epoch 032: 24 / 1689 loss=3.95, nll_loss=2.307, ppl=4.95, wps=452517, ups=1.06, wpb=428898, bsz=16617, num_updates=52300, lr=0.000276553, gnorm=0.207, clip=0, loss_scale=0.25, train_wall=92, gb_free=19, wall=51433 epoch 032: 24 / 1689 loss=3.95, nll_loss=2.307, ppl=4.95, wps=452517, ups=1.06, wpb=428898, bsz=16617, num_updates=52300, lr=0.000276553, gnorm=0.207, clip=0, loss_scale=0.25, train_wall=92, gb_free=19, wall=51433 epoch 032: 24 / 1689 loss=3.95, nll_loss=2.307, ppl=4.95, wps=452517, ups=1.06, wpb=428898, bsz=16617, num_updates=52300, lr=0.000276553, gnorm=0.207, clip=0, loss_scale=0.25, train_wall=92, gb_free=19, wall=51433 epoch 032: 24 / 1689 loss=3.95, nll_loss=2.307, ppl=4.95, wps=452517, ups=1.06, wpb=428898, bsz=16617, num_updates=52300, lr=0.000276553, gnorm=0.207, clip=0, loss_scale=0.25, train_wall=92, gb_free=19, wall=51433 epoch 032: 24 / 1689 loss=3.95, nll_loss=2.307, ppl=4.95, wps=452517, ups=1.06, wpb=428898, bsz=16617, num_updates=52300, lr=0.000276553, gnorm=0.207, clip=0, loss_scale=0.25, train_wall=92, gb_free=19, wall=51433 epoch 032: 24 / 1689 loss=3.95, nll_loss=2.307, ppl=4.95, wps=452517, ups=1.06, wpb=428898, bsz=16617, num_updates=52300, lr=0.000276553, gnorm=0.207, clip=0, loss_scale=0.25, train_wall=92, gb_free=19, wall=51433 epoch 032: 24 / 1689 loss=3.95, nll_loss=2.307, ppl=4.95, wps=452517, ups=1.06, wpb=428898, bsz=16617, num_updates=52300, lr=0.000276553, gnorm=0.207, clip=0, loss_scale=0.25, train_wall=92, gb_free=19, wall=51433 epoch 032: 24 / 1689 loss=3.95, nll_loss=2.307, ppl=4.95, wps=452517, ups=1.06, wpb=428898, bsz=16617, num_updates=52300, lr=0.000276553, gnorm=0.207, clip=0, loss_scale=0.25, train_wall=92, gb_free=19, wall=51433 epoch 032: 24 / 1689 loss=3.95, nll_loss=2.307, ppl=4.95, wps=452517, ups=1.06, wpb=428898, bsz=16617, num_updates=52300, lr=0.000276553, gnorm=0.207, clip=0, loss_scale=0.25, train_wall=92, gb_free=19, wall=51433 epoch 032: 24 / 1689 loss=3.95, nll_loss=2.307, ppl=4.95, wps=452517, ups=1.06, wpb=428898, bsz=16617, num_updates=52300, lr=0.000276553, gnorm=0.207, clip=0, loss_scale=0.25, train_wall=92, gb_free=19, wall=51433 epoch 032: 24 / 1689 loss=3.95, nll_loss=2.307, ppl=4.95, wps=452517, ups=1.06, wpb=428898, bsz=16617, num_updates=52300, lr=0.000276553, gnorm=0.207, clip=0, loss_scale=0.25, train_wall=92, gb_free=19, wall=51433 epoch 032: 24 / 1689 loss=3.95, nll_loss=2.307, ppl=4.95, wps=452517, ups=1.06, wpb=428898, bsz=16617, num_updates=52300, lr=0.000276553, gnorm=0.207, clip=0, loss_scale=0.25, train_wall=92, gb_free=19, wall=51433 epoch 032: 24 / 1689 loss=3.95, nll_loss=2.307, ppl=4.95, wps=452517, ups=1.06, wpb=428898, bsz=16617, num_updates=52300, lr=0.000276553, gnorm=0.207, clip=0, loss_scale=0.25, train_wall=92, gb_free=19, wall=51433 epoch 032: 24 / 1689 loss=3.95, nll_loss=2.307, ppl=4.95, wps=452517, ups=1.06, wpb=428898, bsz=16617, num_updates=52300, lr=0.000276553, gnorm=0.207, clip=0, loss_scale=0.25, train_wall=92, gb_free=19, wall=51433 epoch 032: 24 / 1689 loss=3.95, nll_loss=2.307, ppl=4.95, wps=452517, ups=1.06, wpb=428898, bsz=16617, num_updates=52300, lr=0.000276553, gnorm=0.207, clip=0, loss_scale=0.25, train_wall=92, gb_free=19, wall=51433 epoch 032: 24 / 1689 loss=3.95, nll_loss=2.307, ppl=4.95, wps=452517, ups=1.06, wpb=428898, bsz=16617, num_updates=52300, lr=0.000276553, gnorm=0.207, clip=0, loss_scale=0.25, train_wall=92, gb_free=19, wall=51433 epoch 032: 24 / 1689 loss=3.95, nll_loss=2.307, ppl=4.95, wps=452517, ups=1.06, wpb=428898, bsz=16617, num_updates=52300, lr=0.000276553, gnorm=0.207, clip=0, loss_scale=0.25, train_wall=92, gb_free=19, wall=51433 epoch 032: 24 / 1689 loss=3.95, nll_loss=2.307, ppl=4.95, wps=452517, ups=1.06, wpb=428898, bsz=16617, num_updates=52300, lr=0.000276553, gnorm=0.207, clip=0, loss_scale=0.25, train_wall=92, gb_free=19, wall=51433 epoch 032: 124 / 1689 loss=3.935, nll_loss=2.289, ppl=4.89, wps=458908, ups=1.06, wpb=432536, bsz=16715, num_updates=52400, lr=0.000276289, gnorm=0.224, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=51528 epoch 032: 124 / 1689 loss=3.935, nll_loss=2.289, ppl=4.89, wps=458908, ups=1.06, wpb=432536, bsz=16715, num_updates=52400, lr=0.000276289, gnorm=0.224, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=51528 epoch 032: 124 / 1689 loss=3.935, nll_loss=2.289, ppl=4.89, wps=458908, ups=1.06, wpb=432536, bsz=16715, num_updates=52400, lr=0.000276289, gnorm=0.224, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=51528 epoch 032: 124 / 1689 loss=3.935, nll_loss=2.289, ppl=4.89, wps=458908, ups=1.06, wpb=432536, bsz=16715, num_updates=52400, lr=0.000276289, gnorm=0.224, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=51528 epoch 032: 124 / 1689 loss=3.935, nll_loss=2.289, ppl=4.89, wps=458908, ups=1.06, wpb=432536, bsz=16715, num_updates=52400, lr=0.000276289, gnorm=0.224, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=51528 epoch 032: 124 / 1689 loss=3.935, nll_loss=2.289, ppl=4.89, wps=458908, ups=1.06, wpb=432536, bsz=16715, num_updates=52400, lr=0.000276289, gnorm=0.224, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=51528 epoch 032: 124 / 1689 loss=3.935, nll_loss=2.289, ppl=4.89, wps=458908, ups=1.06, wpb=432536, bsz=16715, num_updates=52400, lr=0.000276289, gnorm=0.224, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=51528 epoch 032: 124 / 1689 loss=3.935, nll_loss=2.289, ppl=4.89, wps=458908, ups=1.06, wpb=432536, bsz=16715, num_updates=52400, lr=0.000276289, gnorm=0.224, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=51528 epoch 032: 124 / 1689 loss=3.935, nll_loss=2.289, ppl=4.89, wps=458908, ups=1.06, wpb=432536, bsz=16715, num_updates=52400, lr=0.000276289, gnorm=0.224, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=51528 epoch 032: 124 / 1689 loss=3.935, nll_loss=2.289, ppl=4.89, wps=458908, ups=1.06, wpb=432536, bsz=16715, num_updates=52400, lr=0.000276289, gnorm=0.224, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=51528 epoch 032: 124 / 1689 loss=3.935, nll_loss=2.289, ppl=4.89, wps=458908, ups=1.06, wpb=432536, bsz=16715, num_updates=52400, lr=0.000276289, gnorm=0.224, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=51528 epoch 032: 124 / 1689 loss=3.935, nll_loss=2.289, ppl=4.89, wps=458908, ups=1.06, wpb=432536, bsz=16715, num_updates=52400, lr=0.000276289, gnorm=0.224, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=51528 epoch 032: 124 / 1689 loss=3.935, nll_loss=2.289, ppl=4.89, wps=458908, ups=1.06, wpb=432536, bsz=16715, num_updates=52400, lr=0.000276289, gnorm=0.224, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=51528 epoch 032: 124 / 1689 loss=3.935, nll_loss=2.289, ppl=4.89, wps=458908, ups=1.06, wpb=432536, bsz=16715, num_updates=52400, lr=0.000276289, gnorm=0.224, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=51528 epoch 032: 124 / 1689 loss=3.935, nll_loss=2.289, ppl=4.89, wps=458908, ups=1.06, wpb=432536, bsz=16715, num_updates=52400, lr=0.000276289, gnorm=0.224, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=51528 epoch 032: 124 / 1689 loss=3.935, nll_loss=2.289, ppl=4.89, wps=458908, ups=1.06, wpb=432536, bsz=16715, num_updates=52400, lr=0.000276289, gnorm=0.224, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=51528 epoch 032: 124 / 1689 loss=3.935, nll_loss=2.289, ppl=4.89, wps=458908, ups=1.06, wpb=432536, bsz=16715, num_updates=52400, lr=0.000276289, gnorm=0.224, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=51528 epoch 032: 124 / 1689 loss=3.935, nll_loss=2.289, ppl=4.89, wps=458908, ups=1.06, wpb=432536, bsz=16715, num_updates=52400, lr=0.000276289, gnorm=0.224, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=51528 epoch 032: 124 / 1689 loss=3.935, nll_loss=2.289, ppl=4.89, wps=458908, ups=1.06, wpb=432536, bsz=16715, num_updates=52400, lr=0.000276289, gnorm=0.224, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=51528 epoch 032: 124 / 1689 loss=3.935, nll_loss=2.289, ppl=4.89, wps=458908, ups=1.06, wpb=432536, bsz=16715, num_updates=52400, lr=0.000276289, gnorm=0.224, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=51528 epoch 032: 124 / 1689 loss=3.935, nll_loss=2.289, ppl=4.89, wps=458908, ups=1.06, wpb=432536, bsz=16715, num_updates=52400, lr=0.000276289, gnorm=0.224, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=51528 epoch 032: 124 / 1689 loss=3.935, nll_loss=2.289, ppl=4.89, wps=458908, ups=1.06, wpb=432536, bsz=16715, num_updates=52400, lr=0.000276289, gnorm=0.224, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=51528 epoch 032: 124 / 1689 loss=3.935, nll_loss=2.289, ppl=4.89, wps=458908, ups=1.06, wpb=432536, bsz=16715, num_updates=52400, lr=0.000276289, gnorm=0.224, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=51528 epoch 032: 124 / 1689 loss=3.935, nll_loss=2.289, ppl=4.89, wps=458908, ups=1.06, wpb=432536, bsz=16715, num_updates=52400, lr=0.000276289, gnorm=0.224, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=51528 epoch 032: 124 / 1689 loss=3.935, nll_loss=2.289, ppl=4.89, wps=458908, ups=1.06, wpb=432536, bsz=16715, num_updates=52400, lr=0.000276289, gnorm=0.224, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=51528 epoch 032: 124 / 1689 loss=3.935, nll_loss=2.289, ppl=4.89, wps=458908, ups=1.06, wpb=432536, bsz=16715, num_updates=52400, lr=0.000276289, gnorm=0.224, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=51528 epoch 032: 124 / 1689 loss=3.935, nll_loss=2.289, ppl=4.89, wps=458908, ups=1.06, wpb=432536, bsz=16715, num_updates=52400, lr=0.000276289, gnorm=0.224, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=51528 epoch 032: 124 / 1689 loss=3.935, nll_loss=2.289, ppl=4.89, wps=458908, ups=1.06, wpb=432536, bsz=16715, num_updates=52400, lr=0.000276289, gnorm=0.224, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=51528 epoch 032: 124 / 1689 loss=3.935, nll_loss=2.289, ppl=4.89, wps=458908, ups=1.06, wpb=432536, bsz=16715, num_updates=52400, lr=0.000276289, gnorm=0.224, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=51528 epoch 032: 124 / 1689 loss=3.935, nll_loss=2.289, ppl=4.89, wps=458908, ups=1.06, wpb=432536, bsz=16715, num_updates=52400, lr=0.000276289, gnorm=0.224, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=51528 epoch 032: 124 / 1689 loss=3.935, nll_loss=2.289, ppl=4.89, wps=458908, ups=1.06, wpb=432536, bsz=16715, num_updates=52400, lr=0.000276289, gnorm=0.224, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=51528 epoch 032: 124 / 1689 loss=3.935, nll_loss=2.289, ppl=4.89, wps=458908, ups=1.06, wpb=432536, bsz=16715, num_updates=52400, lr=0.000276289, gnorm=0.224, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=51528 epoch 032: 225 / 1689 loss=3.935, nll_loss=2.289, ppl=4.89, wps=453900, ups=1.05, wpb=433699, bsz=16490.2, num_updates=52500, lr=0.000276026, gnorm=0.248, clip=1, loss_scale=0.125, train_wall=95, gb_free=20.7, wall=51623 epoch 032: 225 / 1689 loss=3.935, nll_loss=2.289, ppl=4.89, wps=453900, ups=1.05, wpb=433699, bsz=16490.2, num_updates=52500, lr=0.000276026, gnorm=0.248, clip=1, loss_scale=0.125, train_wall=95, gb_free=20.7, wall=51623 epoch 032: 225 / 1689 loss=3.935, nll_loss=2.289, ppl=4.89, wps=453900, ups=1.05, wpb=433699, bsz=16490.2, num_updates=52500, lr=0.000276026, gnorm=0.248, clip=1, loss_scale=0.125, train_wall=95, gb_free=20.7, wall=51623 epoch 032: 225 / 1689 loss=3.935, nll_loss=2.289, ppl=4.89, wps=453900, ups=1.05, wpb=433699, bsz=16490.2, num_updates=52500, lr=0.000276026, gnorm=0.248, clip=1, loss_scale=0.125, train_wall=95, gb_free=20.7, wall=51623 epoch 032: 225 / 1689 loss=3.935, nll_loss=2.289, ppl=4.89, wps=453900, ups=1.05, wpb=433699, bsz=16490.2, num_updates=52500, lr=0.000276026, gnorm=0.248, clip=1, loss_scale=0.125, train_wall=95, gb_free=20.7, wall=51623 epoch 032: 225 / 1689 loss=3.935, nll_loss=2.289, ppl=4.89, wps=453900, ups=1.05, wpb=433699, bsz=16490.2, num_updates=52500, lr=0.000276026, gnorm=0.248, clip=1, loss_scale=0.125, train_wall=95, gb_free=20.7, wall=51623 epoch 032: 225 / 1689 loss=3.935, nll_loss=2.289, ppl=4.89, wps=453900, ups=1.05, wpb=433699, bsz=16490.2, num_updates=52500, lr=0.000276026, gnorm=0.248, clip=1, loss_scale=0.125, train_wall=95, gb_free=20.7, wall=51623 epoch 032: 225 / 1689 loss=3.935, nll_loss=2.289, ppl=4.89, wps=453900, ups=1.05, wpb=433699, bsz=16490.2, num_updates=52500, lr=0.000276026, gnorm=0.248, clip=1, loss_scale=0.125, train_wall=95, gb_free=20.7, wall=51623 epoch 032: 225 / 1689 loss=3.935, nll_loss=2.289, ppl=4.89, wps=453900, ups=1.05, wpb=433699, bsz=16490.2, num_updates=52500, lr=0.000276026, gnorm=0.248, clip=1, loss_scale=0.125, train_wall=95, gb_free=20.7, wall=51623 epoch 032: 225 / 1689 loss=3.935, nll_loss=2.289, ppl=4.89, wps=453900, ups=1.05, wpb=433699, bsz=16490.2, num_updates=52500, lr=0.000276026, gnorm=0.248, clip=1, loss_scale=0.125, train_wall=95, gb_free=20.7, wall=51623 epoch 032: 225 / 1689 loss=3.935, nll_loss=2.289, ppl=4.89, wps=453900, ups=1.05, wpb=433699, bsz=16490.2, num_updates=52500, lr=0.000276026, gnorm=0.248, clip=1, loss_scale=0.125, train_wall=95, gb_free=20.7, wall=51623 epoch 032: 225 / 1689 loss=3.935, nll_loss=2.289, ppl=4.89, wps=453900, ups=1.05, wpb=433699, bsz=16490.2, num_updates=52500, lr=0.000276026, gnorm=0.248, clip=1, loss_scale=0.125, train_wall=95, gb_free=20.7, wall=51623 epoch 032: 225 / 1689 loss=3.935, nll_loss=2.289, ppl=4.89, wps=453900, ups=1.05, wpb=433699, bsz=16490.2, num_updates=52500, lr=0.000276026, gnorm=0.248, clip=1, loss_scale=0.125, train_wall=95, gb_free=20.7, wall=51623 epoch 032: 225 / 1689 loss=3.935, nll_loss=2.289, ppl=4.89, wps=453900, ups=1.05, wpb=433699, bsz=16490.2, num_updates=52500, lr=0.000276026, gnorm=0.248, clip=1, loss_scale=0.125, train_wall=95, gb_free=20.7, wall=51623 epoch 032: 225 / 1689 loss=3.935, nll_loss=2.289, ppl=4.89, wps=453900, ups=1.05, wpb=433699, bsz=16490.2, num_updates=52500, lr=0.000276026, gnorm=0.248, clip=1, loss_scale=0.125, train_wall=95, gb_free=20.7, wall=51623 epoch 032: 225 / 1689 loss=3.935, nll_loss=2.289, ppl=4.89, wps=453900, ups=1.05, wpb=433699, bsz=16490.2, num_updates=52500, lr=0.000276026, gnorm=0.248, clip=1, loss_scale=0.125, train_wall=95, gb_free=20.7, wall=51623 epoch 032: 225 / 1689 loss=3.935, nll_loss=2.289, ppl=4.89, wps=453900, ups=1.05, wpb=433699, bsz=16490.2, num_updates=52500, lr=0.000276026, gnorm=0.248, clip=1, loss_scale=0.125, train_wall=95, gb_free=20.7, wall=51623 epoch 032: 225 / 1689 loss=3.935, nll_loss=2.289, ppl=4.89, wps=453900, ups=1.05, wpb=433699, bsz=16490.2, num_updates=52500, lr=0.000276026, gnorm=0.248, clip=1, loss_scale=0.125, train_wall=95, gb_free=20.7, wall=51623 epoch 032: 225 / 1689 loss=3.935, nll_loss=2.289, ppl=4.89, wps=453900, ups=1.05, wpb=433699, bsz=16490.2, num_updates=52500, lr=0.000276026, gnorm=0.248, clip=1, loss_scale=0.125, train_wall=95, gb_free=20.7, wall=51623 epoch 032: 225 / 1689 loss=3.935, nll_loss=2.289, ppl=4.89, wps=453900, ups=1.05, wpb=433699, bsz=16490.2, num_updates=52500, lr=0.000276026, gnorm=0.248, clip=1, loss_scale=0.125, train_wall=95, gb_free=20.7, wall=51623 epoch 032: 225 / 1689 loss=3.935, nll_loss=2.289, ppl=4.89, wps=453900, ups=1.05, wpb=433699, bsz=16490.2, num_updates=52500, lr=0.000276026, gnorm=0.248, clip=1, loss_scale=0.125, train_wall=95, gb_free=20.7, wall=51623 epoch 032: 225 / 1689 loss=3.935, nll_loss=2.289, ppl=4.89, wps=453900, ups=1.05, wpb=433699, bsz=16490.2, num_updates=52500, lr=0.000276026, gnorm=0.248, clip=1, loss_scale=0.125, train_wall=95, gb_free=20.7, wall=51623 epoch 032: 225 / 1689 loss=3.935, nll_loss=2.289, ppl=4.89, wps=453900, ups=1.05, wpb=433699, bsz=16490.2, num_updates=52500, lr=0.000276026, gnorm=0.248, clip=1, loss_scale=0.125, train_wall=95, gb_free=20.7, wall=51623 epoch 032: 225 / 1689 loss=3.935, nll_loss=2.289, ppl=4.89, wps=453900, ups=1.05, wpb=433699, bsz=16490.2, num_updates=52500, lr=0.000276026, gnorm=0.248, clip=1, loss_scale=0.125, train_wall=95, gb_free=20.7, wall=51623 epoch 032: 225 / 1689 loss=3.935, nll_loss=2.289, ppl=4.89, wps=453900, ups=1.05, wpb=433699, bsz=16490.2, num_updates=52500, lr=0.000276026, gnorm=0.248, clip=1, loss_scale=0.125, train_wall=95, gb_free=20.7, wall=51623 epoch 032: 225 / 1689 loss=3.935, nll_loss=2.289, ppl=4.89, wps=453900, ups=1.05, wpb=433699, bsz=16490.2, num_updates=52500, lr=0.000276026, gnorm=0.248, clip=1, loss_scale=0.125, train_wall=95, gb_free=20.7, wall=51623 epoch 032: 225 / 1689 loss=3.935, nll_loss=2.289, ppl=4.89, wps=453900, ups=1.05, wpb=433699, bsz=16490.2, num_updates=52500, lr=0.000276026, gnorm=0.248, clip=1, loss_scale=0.125, train_wall=95, gb_free=20.7, wall=51623 epoch 032: 225 / 1689 loss=3.935, nll_loss=2.289, ppl=4.89, wps=453900, ups=1.05, wpb=433699, bsz=16490.2, num_updates=52500, lr=0.000276026, gnorm=0.248, clip=1, loss_scale=0.125, train_wall=95, gb_free=20.7, wall=51623 epoch 032: 225 / 1689 loss=3.935, nll_loss=2.289, ppl=4.89, wps=453900, ups=1.05, wpb=433699, bsz=16490.2, num_updates=52500, lr=0.000276026, gnorm=0.248, clip=1, loss_scale=0.125, train_wall=95, gb_free=20.7, wall=51623 epoch 032: 225 / 1689 loss=3.935, nll_loss=2.289, ppl=4.89, wps=453900, ups=1.05, wpb=433699, bsz=16490.2, num_updates=52500, lr=0.000276026, gnorm=0.248, clip=1, loss_scale=0.125, train_wall=95, gb_free=20.7, wall=51623 epoch 032: 225 / 1689 loss=3.935, nll_loss=2.289, ppl=4.89, wps=453900, ups=1.05, wpb=433699, bsz=16490.2, num_updates=52500, lr=0.000276026, gnorm=0.248, clip=1, loss_scale=0.125, train_wall=95, gb_free=20.7, wall=51623 epoch 032: 225 / 1689 loss=3.935, nll_loss=2.289, ppl=4.89, wps=453900, ups=1.05, wpb=433699, bsz=16490.2, num_updates=52500, lr=0.000276026, gnorm=0.248, clip=1, loss_scale=0.125, train_wall=95, gb_free=20.7, wall=51623 epoch 032: 325 / 1689 loss=3.945, nll_loss=2.301, ppl=4.93, wps=459516, ups=1.06, wpb=432291, bsz=16466.2, num_updates=52600, lr=0.000275764, gnorm=0.24, clip=1, loss_scale=0.125, train_wall=93, gb_free=18.9, wall=51717 epoch 032: 325 / 1689 loss=3.945, nll_loss=2.301, ppl=4.93, wps=459516, ups=1.06, wpb=432291, bsz=16466.2, num_updates=52600, lr=0.000275764, gnorm=0.24, clip=1, loss_scale=0.125, train_wall=93, gb_free=18.9, wall=51717 epoch 032: 325 / 1689 loss=3.945, nll_loss=2.301, ppl=4.93, wps=459516, ups=1.06, wpb=432291, bsz=16466.2, num_updates=52600, lr=0.000275764, gnorm=0.24, clip=1, loss_scale=0.125, train_wall=93, gb_free=18.9, wall=51717 epoch 032: 325 / 1689 loss=3.945, nll_loss=2.301, ppl=4.93, wps=459516, ups=1.06, wpb=432291, bsz=16466.2, num_updates=52600, lr=0.000275764, gnorm=0.24, clip=1, loss_scale=0.125, train_wall=93, gb_free=18.9, wall=51717 epoch 032: 325 / 1689 loss=3.945, nll_loss=2.301, ppl=4.93, wps=459516, ups=1.06, wpb=432291, bsz=16466.2, num_updates=52600, lr=0.000275764, gnorm=0.24, clip=1, loss_scale=0.125, train_wall=93, gb_free=18.9, wall=51717 epoch 032: 325 / 1689 loss=3.945, nll_loss=2.301, ppl=4.93, wps=459516, ups=1.06, wpb=432291, bsz=16466.2, num_updates=52600, lr=0.000275764, gnorm=0.24, clip=1, loss_scale=0.125, train_wall=93, gb_free=18.9, wall=51717 epoch 032: 325 / 1689 loss=3.945, nll_loss=2.301, ppl=4.93, wps=459516, ups=1.06, wpb=432291, bsz=16466.2, num_updates=52600, lr=0.000275764, gnorm=0.24, clip=1, loss_scale=0.125, train_wall=93, gb_free=18.9, wall=51717 epoch 032: 325 / 1689 loss=3.945, nll_loss=2.301, ppl=4.93, wps=459516, ups=1.06, wpb=432291, bsz=16466.2, num_updates=52600, lr=0.000275764, gnorm=0.24, clip=1, loss_scale=0.125, train_wall=93, gb_free=18.9, wall=51717 epoch 032: 325 / 1689 loss=3.945, nll_loss=2.301, ppl=4.93, wps=459516, ups=1.06, wpb=432291, bsz=16466.2, num_updates=52600, lr=0.000275764, gnorm=0.24, clip=1, loss_scale=0.125, train_wall=93, gb_free=18.9, wall=51717 epoch 032: 325 / 1689 loss=3.945, nll_loss=2.301, ppl=4.93, wps=459516, ups=1.06, wpb=432291, bsz=16466.2, num_updates=52600, lr=0.000275764, gnorm=0.24, clip=1, loss_scale=0.125, train_wall=93, gb_free=18.9, wall=51717 epoch 032: 325 / 1689 loss=3.945, nll_loss=2.301, ppl=4.93, wps=459516, ups=1.06, wpb=432291, bsz=16466.2, num_updates=52600, lr=0.000275764, gnorm=0.24, clip=1, loss_scale=0.125, train_wall=93, gb_free=18.9, wall=51717 epoch 032: 325 / 1689 loss=3.945, nll_loss=2.301, ppl=4.93, wps=459516, ups=1.06, wpb=432291, bsz=16466.2, num_updates=52600, lr=0.000275764, gnorm=0.24, clip=1, loss_scale=0.125, train_wall=93, gb_free=18.9, wall=51717 epoch 032: 325 / 1689 loss=3.945, nll_loss=2.301, ppl=4.93, wps=459516, ups=1.06, wpb=432291, bsz=16466.2, num_updates=52600, lr=0.000275764, gnorm=0.24, clip=1, loss_scale=0.125, train_wall=93, gb_free=18.9, wall=51717 epoch 032: 325 / 1689 loss=3.945, nll_loss=2.301, ppl=4.93, wps=459516, ups=1.06, wpb=432291, bsz=16466.2, num_updates=52600, lr=0.000275764, gnorm=0.24, clip=1, loss_scale=0.125, train_wall=93, gb_free=18.9, wall=51717 epoch 032: 325 / 1689 loss=3.945, nll_loss=2.301, ppl=4.93, wps=459516, ups=1.06, wpb=432291, bsz=16466.2, num_updates=52600, lr=0.000275764, gnorm=0.24, clip=1, loss_scale=0.125, train_wall=93, gb_free=18.9, wall=51717 epoch 032: 325 / 1689 loss=3.945, nll_loss=2.301, ppl=4.93, wps=459516, ups=1.06, wpb=432291, bsz=16466.2, num_updates=52600, lr=0.000275764, gnorm=0.24, clip=1, loss_scale=0.125, train_wall=93, gb_free=18.9, wall=51717 epoch 032: 325 / 1689 loss=3.945, nll_loss=2.301, ppl=4.93, wps=459516, ups=1.06, wpb=432291, bsz=16466.2, num_updates=52600, lr=0.000275764, gnorm=0.24, clip=1, loss_scale=0.125, train_wall=93, gb_free=18.9, wall=51717 epoch 032: 325 / 1689 loss=3.945, nll_loss=2.301, ppl=4.93, wps=459516, ups=1.06, wpb=432291, bsz=16466.2, num_updates=52600, lr=0.000275764, gnorm=0.24, clip=1, loss_scale=0.125, train_wall=93, gb_free=18.9, wall=51717 epoch 032: 325 / 1689 loss=3.945, nll_loss=2.301, ppl=4.93, wps=459516, ups=1.06, wpb=432291, bsz=16466.2, num_updates=52600, lr=0.000275764, gnorm=0.24, clip=1, loss_scale=0.125, train_wall=93, gb_free=18.9, wall=51717 epoch 032: 325 / 1689 loss=3.945, nll_loss=2.301, ppl=4.93, wps=459516, ups=1.06, wpb=432291, bsz=16466.2, num_updates=52600, lr=0.000275764, gnorm=0.24, clip=1, loss_scale=0.125, train_wall=93, gb_free=18.9, wall=51717 epoch 032: 325 / 1689 loss=3.945, nll_loss=2.301, ppl=4.93, wps=459516, ups=1.06, wpb=432291, bsz=16466.2, num_updates=52600, lr=0.000275764, gnorm=0.24, clip=1, loss_scale=0.125, train_wall=93, gb_free=18.9, wall=51717 epoch 032: 325 / 1689 loss=3.945, nll_loss=2.301, ppl=4.93, wps=459516, ups=1.06, wpb=432291, bsz=16466.2, num_updates=52600, lr=0.000275764, gnorm=0.24, clip=1, loss_scale=0.125, train_wall=93, gb_free=18.9, wall=51717 epoch 032: 325 / 1689 loss=3.945, nll_loss=2.301, ppl=4.93, wps=459516, ups=1.06, wpb=432291, bsz=16466.2, num_updates=52600, lr=0.000275764, gnorm=0.24, clip=1, loss_scale=0.125, train_wall=93, gb_free=18.9, wall=51717 epoch 032: 325 / 1689 loss=3.945, nll_loss=2.301, ppl=4.93, wps=459516, ups=1.06, wpb=432291, bsz=16466.2, num_updates=52600, lr=0.000275764, gnorm=0.24, clip=1, loss_scale=0.125, train_wall=93, gb_free=18.9, wall=51717 epoch 032: 325 / 1689 loss=3.945, nll_loss=2.301, ppl=4.93, wps=459516, ups=1.06, wpb=432291, bsz=16466.2, num_updates=52600, lr=0.000275764, gnorm=0.24, clip=1, loss_scale=0.125, train_wall=93, gb_free=18.9, wall=51717 epoch 032: 325 / 1689 loss=3.945, nll_loss=2.301, ppl=4.93, wps=459516, ups=1.06, wpb=432291, bsz=16466.2, num_updates=52600, lr=0.000275764, gnorm=0.24, clip=1, loss_scale=0.125, train_wall=93, gb_free=18.9, wall=51717 epoch 032: 325 / 1689 loss=3.945, nll_loss=2.301, ppl=4.93, wps=459516, ups=1.06, wpb=432291, bsz=16466.2, num_updates=52600, lr=0.000275764, gnorm=0.24, clip=1, loss_scale=0.125, train_wall=93, gb_free=18.9, wall=51717 epoch 032: 325 / 1689 loss=3.945, nll_loss=2.301, ppl=4.93, wps=459516, ups=1.06, wpb=432291, bsz=16466.2, num_updates=52600, lr=0.000275764, gnorm=0.24, clip=1, loss_scale=0.125, train_wall=93, gb_free=18.9, wall=51717 epoch 032: 325 / 1689 loss=3.945, nll_loss=2.301, ppl=4.93, wps=459516, ups=1.06, wpb=432291, bsz=16466.2, num_updates=52600, lr=0.000275764, gnorm=0.24, clip=1, loss_scale=0.125, train_wall=93, gb_free=18.9, wall=51717 epoch 032: 325 / 1689 loss=3.945, nll_loss=2.301, ppl=4.93, wps=459516, ups=1.06, wpb=432291, bsz=16466.2, num_updates=52600, lr=0.000275764, gnorm=0.24, clip=1, loss_scale=0.125, train_wall=93, gb_free=18.9, wall=51717 epoch 032: 325 / 1689 loss=3.945, nll_loss=2.301, ppl=4.93, wps=459516, ups=1.06, wpb=432291, bsz=16466.2, num_updates=52600, lr=0.000275764, gnorm=0.24, clip=1, loss_scale=0.125, train_wall=93, gb_free=18.9, wall=51717 epoch 032: 325 / 1689 loss=3.945, nll_loss=2.301, ppl=4.93, wps=459516, ups=1.06, wpb=432291, bsz=16466.2, num_updates=52600, lr=0.000275764, gnorm=0.24, clip=1, loss_scale=0.125, train_wall=93, gb_free=18.9, wall=51717 epoch 032: 425 / 1689 loss=3.956, nll_loss=2.313, ppl=4.97, wps=459880, ups=1.06, wpb=435195, bsz=16636.6, num_updates=52700, lr=0.000275502, gnorm=0.218, clip=0, loss_scale=0.125, train_wall=94, gb_free=20.5, wall=51812 epoch 032: 425 / 1689 loss=3.956, nll_loss=2.313, ppl=4.97, wps=459880, ups=1.06, wpb=435195, bsz=16636.6, num_updates=52700, lr=0.000275502, gnorm=0.218, clip=0, loss_scale=0.125, train_wall=94, gb_free=20.5, wall=51812 epoch 032: 425 / 1689 loss=3.956, nll_loss=2.313, ppl=4.97, wps=459880, ups=1.06, wpb=435195, bsz=16636.6, num_updates=52700, lr=0.000275502, gnorm=0.218, clip=0, loss_scale=0.125, train_wall=94, gb_free=20.5, wall=51812 epoch 032: 425 / 1689 loss=3.956, nll_loss=2.313, ppl=4.97, wps=459880, ups=1.06, wpb=435195, bsz=16636.6, num_updates=52700, lr=0.000275502, gnorm=0.218, clip=0, loss_scale=0.125, train_wall=94, gb_free=20.5, wall=51812 epoch 032: 425 / 1689 loss=3.956, nll_loss=2.313, ppl=4.97, wps=459880, ups=1.06, wpb=435195, bsz=16636.6, num_updates=52700, lr=0.000275502, gnorm=0.218, clip=0, loss_scale=0.125, train_wall=94, gb_free=20.5, wall=51812 epoch 032: 425 / 1689 loss=3.956, nll_loss=2.313, ppl=4.97, wps=459880, ups=1.06, wpb=435195, bsz=16636.6, num_updates=52700, lr=0.000275502, gnorm=0.218, clip=0, loss_scale=0.125, train_wall=94, gb_free=20.5, wall=51812 epoch 032: 425 / 1689 loss=3.956, nll_loss=2.313, ppl=4.97, wps=459880, ups=1.06, wpb=435195, bsz=16636.6, num_updates=52700, lr=0.000275502, gnorm=0.218, clip=0, loss_scale=0.125, train_wall=94, gb_free=20.5, wall=51812 epoch 032: 425 / 1689 loss=3.956, nll_loss=2.313, ppl=4.97, wps=459880, ups=1.06, wpb=435195, bsz=16636.6, num_updates=52700, lr=0.000275502, gnorm=0.218, clip=0, loss_scale=0.125, train_wall=94, gb_free=20.5, wall=51812 epoch 032: 425 / 1689 loss=3.956, nll_loss=2.313, ppl=4.97, wps=459880, ups=1.06, wpb=435195, bsz=16636.6, num_updates=52700, lr=0.000275502, gnorm=0.218, clip=0, loss_scale=0.125, train_wall=94, gb_free=20.5, wall=51812 epoch 032: 425 / 1689 loss=3.956, nll_loss=2.313, ppl=4.97, wps=459880, ups=1.06, wpb=435195, bsz=16636.6, num_updates=52700, lr=0.000275502, gnorm=0.218, clip=0, loss_scale=0.125, train_wall=94, gb_free=20.5, wall=51812 epoch 032: 425 / 1689 loss=3.956, nll_loss=2.313, ppl=4.97, wps=459880, ups=1.06, wpb=435195, bsz=16636.6, num_updates=52700, lr=0.000275502, gnorm=0.218, clip=0, loss_scale=0.125, train_wall=94, gb_free=20.5, wall=51812 epoch 032: 425 / 1689 loss=3.956, nll_loss=2.313, ppl=4.97, wps=459880, ups=1.06, wpb=435195, bsz=16636.6, num_updates=52700, lr=0.000275502, gnorm=0.218, clip=0, loss_scale=0.125, train_wall=94, gb_free=20.5, wall=51812 epoch 032: 425 / 1689 loss=3.956, nll_loss=2.313, ppl=4.97, wps=459880, ups=1.06, wpb=435195, bsz=16636.6, num_updates=52700, lr=0.000275502, gnorm=0.218, clip=0, loss_scale=0.125, train_wall=94, gb_free=20.5, wall=51812 epoch 032: 425 / 1689 loss=3.956, nll_loss=2.313, ppl=4.97, wps=459880, ups=1.06, wpb=435195, bsz=16636.6, num_updates=52700, lr=0.000275502, gnorm=0.218, clip=0, loss_scale=0.125, train_wall=94, gb_free=20.5, wall=51812 epoch 032: 425 / 1689 loss=3.956, nll_loss=2.313, ppl=4.97, wps=459880, ups=1.06, wpb=435195, bsz=16636.6, num_updates=52700, lr=0.000275502, gnorm=0.218, clip=0, loss_scale=0.125, train_wall=94, gb_free=20.5, wall=51812 epoch 032: 425 / 1689 loss=3.956, nll_loss=2.313, ppl=4.97, wps=459880, ups=1.06, wpb=435195, bsz=16636.6, num_updates=52700, lr=0.000275502, gnorm=0.218, clip=0, loss_scale=0.125, train_wall=94, gb_free=20.5, wall=51812 epoch 032: 425 / 1689 loss=3.956, nll_loss=2.313, ppl=4.97, wps=459880, ups=1.06, wpb=435195, bsz=16636.6, num_updates=52700, lr=0.000275502, gnorm=0.218, clip=0, loss_scale=0.125, train_wall=94, gb_free=20.5, wall=51812 epoch 032: 425 / 1689 loss=3.956, nll_loss=2.313, ppl=4.97, wps=459880, ups=1.06, wpb=435195, bsz=16636.6, num_updates=52700, lr=0.000275502, gnorm=0.218, clip=0, loss_scale=0.125, train_wall=94, gb_free=20.5, wall=51812 epoch 032: 425 / 1689 loss=3.956, nll_loss=2.313, ppl=4.97, wps=459880, ups=1.06, wpb=435195, bsz=16636.6, num_updates=52700, lr=0.000275502, gnorm=0.218, clip=0, loss_scale=0.125, train_wall=94, gb_free=20.5, wall=51812 epoch 032: 425 / 1689 loss=3.956, nll_loss=2.313, ppl=4.97, wps=459880, ups=1.06, wpb=435195, bsz=16636.6, num_updates=52700, lr=0.000275502, gnorm=0.218, clip=0, loss_scale=0.125, train_wall=94, gb_free=20.5, wall=51812 epoch 032: 425 / 1689 loss=3.956, nll_loss=2.313, ppl=4.97, wps=459880, ups=1.06, wpb=435195, bsz=16636.6, num_updates=52700, lr=0.000275502, gnorm=0.218, clip=0, loss_scale=0.125, train_wall=94, gb_free=20.5, wall=51812 epoch 032: 425 / 1689 loss=3.956, nll_loss=2.313, ppl=4.97, wps=459880, ups=1.06, wpb=435195, bsz=16636.6, num_updates=52700, lr=0.000275502, gnorm=0.218, clip=0, loss_scale=0.125, train_wall=94, gb_free=20.5, wall=51812 epoch 032: 425 / 1689 loss=3.956, nll_loss=2.313, ppl=4.97, wps=459880, ups=1.06, wpb=435195, bsz=16636.6, num_updates=52700, lr=0.000275502, gnorm=0.218, clip=0, loss_scale=0.125, train_wall=94, gb_free=20.5, wall=51812 epoch 032: 425 / 1689 loss=3.956, nll_loss=2.313, ppl=4.97, wps=459880, ups=1.06, wpb=435195, bsz=16636.6, num_updates=52700, lr=0.000275502, gnorm=0.218, clip=0, loss_scale=0.125, train_wall=94, gb_free=20.5, wall=51812 epoch 032: 425 / 1689 loss=3.956, nll_loss=2.313, ppl=4.97, wps=459880, ups=1.06, wpb=435195, bsz=16636.6, num_updates=52700, lr=0.000275502, gnorm=0.218, clip=0, loss_scale=0.125, train_wall=94, gb_free=20.5, wall=51812 epoch 032: 425 / 1689 loss=3.956, nll_loss=2.313, ppl=4.97, wps=459880, ups=1.06, wpb=435195, bsz=16636.6, num_updates=52700, lr=0.000275502, gnorm=0.218, clip=0, loss_scale=0.125, train_wall=94, gb_free=20.5, wall=51812 epoch 032: 425 / 1689 loss=3.956, nll_loss=2.313, ppl=4.97, wps=459880, ups=1.06, wpb=435195, bsz=16636.6, num_updates=52700, lr=0.000275502, gnorm=0.218, clip=0, loss_scale=0.125, train_wall=94, gb_free=20.5, wall=51812 epoch 032: 425 / 1689 loss=3.956, nll_loss=2.313, ppl=4.97, wps=459880, ups=1.06, wpb=435195, bsz=16636.6, num_updates=52700, lr=0.000275502, gnorm=0.218, clip=0, loss_scale=0.125, train_wall=94, gb_free=20.5, wall=51812 epoch 032: 425 / 1689 loss=3.956, nll_loss=2.313, ppl=4.97, wps=459880, ups=1.06, wpb=435195, bsz=16636.6, num_updates=52700, lr=0.000275502, gnorm=0.218, clip=0, loss_scale=0.125, train_wall=94, gb_free=20.5, wall=51812 epoch 032: 425 / 1689 loss=3.956, nll_loss=2.313, ppl=4.97, wps=459880, ups=1.06, wpb=435195, bsz=16636.6, num_updates=52700, lr=0.000275502, gnorm=0.218, clip=0, loss_scale=0.125, train_wall=94, gb_free=20.5, wall=51812 epoch 032: 425 / 1689 loss=3.956, nll_loss=2.313, ppl=4.97, wps=459880, ups=1.06, wpb=435195, bsz=16636.6, num_updates=52700, lr=0.000275502, gnorm=0.218, clip=0, loss_scale=0.125, train_wall=94, gb_free=20.5, wall=51812 epoch 032: 425 / 1689 loss=3.956, nll_loss=2.313, ppl=4.97, wps=459880, ups=1.06, wpb=435195, bsz=16636.6, num_updates=52700, lr=0.000275502, gnorm=0.218, clip=0, loss_scale=0.125, train_wall=94, gb_free=20.5, wall=51812 epoch 032: 525 / 1689 loss=3.944, nll_loss=2.3, ppl=4.92, wps=456760, ups=1.05, wpb=434108, bsz=16778.4, num_updates=52800, lr=0.000275241, gnorm=0.21, clip=0, loss_scale=0.125, train_wall=94, gb_free=18.4, wall=51907 epoch 032: 525 / 1689 loss=3.944, nll_loss=2.3, ppl=4.92, wps=456760, ups=1.05, wpb=434108, bsz=16778.4, num_updates=52800, lr=0.000275241, gnorm=0.21, clip=0, loss_scale=0.125, train_wall=94, gb_free=18.4, wall=51907 epoch 032: 525 / 1689 loss=3.944, nll_loss=2.3, ppl=4.92, wps=456760, ups=1.05, wpb=434108, bsz=16778.4, num_updates=52800, lr=0.000275241, gnorm=0.21, clip=0, loss_scale=0.125, train_wall=94, gb_free=18.4, wall=51907 epoch 032: 525 / 1689 loss=3.944, nll_loss=2.3, ppl=4.92, wps=456760, ups=1.05, wpb=434108, bsz=16778.4, num_updates=52800, lr=0.000275241, gnorm=0.21, clip=0, loss_scale=0.125, train_wall=94, gb_free=18.4, wall=51907 epoch 032: 525 / 1689 loss=3.944, nll_loss=2.3, ppl=4.92, wps=456760, ups=1.05, wpb=434108, bsz=16778.4, num_updates=52800, lr=0.000275241, gnorm=0.21, clip=0, loss_scale=0.125, train_wall=94, gb_free=18.4, wall=51907 epoch 032: 525 / 1689 loss=3.944, nll_loss=2.3, ppl=4.92, wps=456760, ups=1.05, wpb=434108, bsz=16778.4, num_updates=52800, lr=0.000275241, gnorm=0.21, clip=0, loss_scale=0.125, train_wall=94, gb_free=18.4, wall=51907 epoch 032: 525 / 1689 loss=3.944, nll_loss=2.3, ppl=4.92, wps=456760, ups=1.05, wpb=434108, bsz=16778.4, num_updates=52800, lr=0.000275241, gnorm=0.21, clip=0, loss_scale=0.125, train_wall=94, gb_free=18.4, wall=51907 epoch 032: 525 / 1689 loss=3.944, nll_loss=2.3, ppl=4.92, wps=456760, ups=1.05, wpb=434108, bsz=16778.4, num_updates=52800, lr=0.000275241, gnorm=0.21, clip=0, loss_scale=0.125, train_wall=94, gb_free=18.4, wall=51907 epoch 032: 525 / 1689 loss=3.944, nll_loss=2.3, ppl=4.92, wps=456760, ups=1.05, wpb=434108, bsz=16778.4, num_updates=52800, lr=0.000275241, gnorm=0.21, clip=0, loss_scale=0.125, train_wall=94, gb_free=18.4, wall=51907 epoch 032: 525 / 1689 loss=3.944, nll_loss=2.3, ppl=4.92, wps=456760, ups=1.05, wpb=434108, bsz=16778.4, num_updates=52800, lr=0.000275241, gnorm=0.21, clip=0, loss_scale=0.125, train_wall=94, gb_free=18.4, wall=51907 epoch 032: 525 / 1689 loss=3.944, nll_loss=2.3, ppl=4.92, wps=456760, ups=1.05, wpb=434108, bsz=16778.4, num_updates=52800, lr=0.000275241, gnorm=0.21, clip=0, loss_scale=0.125, train_wall=94, gb_free=18.4, wall=51907 epoch 032: 525 / 1689 loss=3.944, nll_loss=2.3, ppl=4.92, wps=456760, ups=1.05, wpb=434108, bsz=16778.4, num_updates=52800, lr=0.000275241, gnorm=0.21, clip=0, loss_scale=0.125, train_wall=94, gb_free=18.4, wall=51907 epoch 032: 525 / 1689 loss=3.944, nll_loss=2.3, ppl=4.92, wps=456760, ups=1.05, wpb=434108, bsz=16778.4, num_updates=52800, lr=0.000275241, gnorm=0.21, clip=0, loss_scale=0.125, train_wall=94, gb_free=18.4, wall=51907 epoch 032: 525 / 1689 loss=3.944, nll_loss=2.3, ppl=4.92, wps=456760, ups=1.05, wpb=434108, bsz=16778.4, num_updates=52800, lr=0.000275241, gnorm=0.21, clip=0, loss_scale=0.125, train_wall=94, gb_free=18.4, wall=51907 epoch 032: 525 / 1689 loss=3.944, nll_loss=2.3, ppl=4.92, wps=456760, ups=1.05, wpb=434108, bsz=16778.4, num_updates=52800, lr=0.000275241, gnorm=0.21, clip=0, loss_scale=0.125, train_wall=94, gb_free=18.4, wall=51907 epoch 032: 525 / 1689 loss=3.944, nll_loss=2.3, ppl=4.92, wps=456760, ups=1.05, wpb=434108, bsz=16778.4, num_updates=52800, lr=0.000275241, gnorm=0.21, clip=0, loss_scale=0.125, train_wall=94, gb_free=18.4, wall=51907 epoch 032: 525 / 1689 loss=3.944, nll_loss=2.3, ppl=4.92, wps=456760, ups=1.05, wpb=434108, bsz=16778.4, num_updates=52800, lr=0.000275241, gnorm=0.21, clip=0, loss_scale=0.125, train_wall=94, gb_free=18.4, wall=51907 epoch 032: 525 / 1689 loss=3.944, nll_loss=2.3, ppl=4.92, wps=456760, ups=1.05, wpb=434108, bsz=16778.4, num_updates=52800, lr=0.000275241, gnorm=0.21, clip=0, loss_scale=0.125, train_wall=94, gb_free=18.4, wall=51907 epoch 032: 525 / 1689 loss=3.944, nll_loss=2.3, ppl=4.92, wps=456760, ups=1.05, wpb=434108, bsz=16778.4, num_updates=52800, lr=0.000275241, gnorm=0.21, clip=0, loss_scale=0.125, train_wall=94, gb_free=18.4, wall=51907 epoch 032: 525 / 1689 loss=3.944, nll_loss=2.3, ppl=4.92, wps=456760, ups=1.05, wpb=434108, bsz=16778.4, num_updates=52800, lr=0.000275241, gnorm=0.21, clip=0, loss_scale=0.125, train_wall=94, gb_free=18.4, wall=51907 epoch 032: 525 / 1689 loss=3.944, nll_loss=2.3, ppl=4.92, wps=456760, ups=1.05, wpb=434108, bsz=16778.4, num_updates=52800, lr=0.000275241, gnorm=0.21, clip=0, loss_scale=0.125, train_wall=94, gb_free=18.4, wall=51907 epoch 032: 525 / 1689 loss=3.944, nll_loss=2.3, ppl=4.92, wps=456760, ups=1.05, wpb=434108, bsz=16778.4, num_updates=52800, lr=0.000275241, gnorm=0.21, clip=0, loss_scale=0.125, train_wall=94, gb_free=18.4, wall=51907 epoch 032: 525 / 1689 loss=3.944, nll_loss=2.3, ppl=4.92, wps=456760, ups=1.05, wpb=434108, bsz=16778.4, num_updates=52800, lr=0.000275241, gnorm=0.21, clip=0, loss_scale=0.125, train_wall=94, gb_free=18.4, wall=51907 epoch 032: 525 / 1689 loss=3.944, nll_loss=2.3, ppl=4.92, wps=456760, ups=1.05, wpb=434108, bsz=16778.4, num_updates=52800, lr=0.000275241, gnorm=0.21, clip=0, loss_scale=0.125, train_wall=94, gb_free=18.4, wall=51907 epoch 032: 525 / 1689 loss=3.944, nll_loss=2.3, ppl=4.92, wps=456760, ups=1.05, wpb=434108, bsz=16778.4, num_updates=52800, lr=0.000275241, gnorm=0.21, clip=0, loss_scale=0.125, train_wall=94, gb_free=18.4, wall=51907 epoch 032: 525 / 1689 loss=3.944, nll_loss=2.3, ppl=4.92, wps=456760, ups=1.05, wpb=434108, bsz=16778.4, num_updates=52800, lr=0.000275241, gnorm=0.21, clip=0, loss_scale=0.125, train_wall=94, gb_free=18.4, wall=51907 epoch 032: 525 / 1689 loss=3.944, nll_loss=2.3, ppl=4.92, wps=456760, ups=1.05, wpb=434108, bsz=16778.4, num_updates=52800, lr=0.000275241, gnorm=0.21, clip=0, loss_scale=0.125, train_wall=94, gb_free=18.4, wall=51907 epoch 032: 525 / 1689 loss=3.944, nll_loss=2.3, ppl=4.92, wps=456760, ups=1.05, wpb=434108, bsz=16778.4, num_updates=52800, lr=0.000275241, gnorm=0.21, clip=0, loss_scale=0.125, train_wall=94, gb_free=18.4, wall=51907 epoch 032: 525 / 1689 loss=3.944, nll_loss=2.3, ppl=4.92, wps=456760, ups=1.05, wpb=434108, bsz=16778.4, num_updates=52800, lr=0.000275241, gnorm=0.21, clip=0, loss_scale=0.125, train_wall=94, gb_free=18.4, wall=51907 epoch 032: 525 / 1689 loss=3.944, nll_loss=2.3, ppl=4.92, wps=456760, ups=1.05, wpb=434108, bsz=16778.4, num_updates=52800, lr=0.000275241, gnorm=0.21, clip=0, loss_scale=0.125, train_wall=94, gb_free=18.4, wall=51907 epoch 032: 525 / 1689 loss=3.944, nll_loss=2.3, ppl=4.92, wps=456760, ups=1.05, wpb=434108, bsz=16778.4, num_updates=52800, lr=0.000275241, gnorm=0.21, clip=0, loss_scale=0.125, train_wall=94, gb_free=18.4, wall=51907 epoch 032: 525 / 1689 loss=3.944, nll_loss=2.3, ppl=4.92, wps=456760, ups=1.05, wpb=434108, bsz=16778.4, num_updates=52800, lr=0.000275241, gnorm=0.21, clip=0, loss_scale=0.125, train_wall=94, gb_free=18.4, wall=51907 epoch 032: 625 / 1689 loss=3.948, nll_loss=2.304, ppl=4.94, wps=459554, ups=1.06, wpb=432479, bsz=16380.5, num_updates=52900, lr=0.000274981, gnorm=0.228, clip=0, loss_scale=0.125, train_wall=93, gb_free=19.2, wall=52001 epoch 032: 625 / 1689 loss=3.948, nll_loss=2.304, ppl=4.94, wps=459554, ups=1.06, wpb=432479, bsz=16380.5, num_updates=52900, lr=0.000274981, gnorm=0.228, clip=0, loss_scale=0.125, train_wall=93, gb_free=19.2, wall=52001 epoch 032: 625 / 1689 loss=3.948, nll_loss=2.304, ppl=4.94, wps=459554, ups=1.06, wpb=432479, bsz=16380.5, num_updates=52900, lr=0.000274981, gnorm=0.228, clip=0, loss_scale=0.125, train_wall=93, gb_free=19.2, wall=52001 epoch 032: 625 / 1689 loss=3.948, nll_loss=2.304, ppl=4.94, wps=459554, ups=1.06, wpb=432479, bsz=16380.5, num_updates=52900, lr=0.000274981, gnorm=0.228, clip=0, loss_scale=0.125, train_wall=93, gb_free=19.2, wall=52001 epoch 032: 625 / 1689 loss=3.948, nll_loss=2.304, ppl=4.94, wps=459554, ups=1.06, wpb=432479, bsz=16380.5, num_updates=52900, lr=0.000274981, gnorm=0.228, clip=0, loss_scale=0.125, train_wall=93, gb_free=19.2, wall=52001 epoch 032: 625 / 1689 loss=3.948, nll_loss=2.304, ppl=4.94, wps=459554, ups=1.06, wpb=432479, bsz=16380.5, num_updates=52900, lr=0.000274981, gnorm=0.228, clip=0, loss_scale=0.125, train_wall=93, gb_free=19.2, wall=52001 epoch 032: 625 / 1689 loss=3.948, nll_loss=2.304, ppl=4.94, wps=459554, ups=1.06, wpb=432479, bsz=16380.5, num_updates=52900, lr=0.000274981, gnorm=0.228, clip=0, loss_scale=0.125, train_wall=93, gb_free=19.2, wall=52001 epoch 032: 625 / 1689 loss=3.948, nll_loss=2.304, ppl=4.94, wps=459554, ups=1.06, wpb=432479, bsz=16380.5, num_updates=52900, lr=0.000274981, gnorm=0.228, clip=0, loss_scale=0.125, train_wall=93, gb_free=19.2, wall=52001 epoch 032: 625 / 1689 loss=3.948, nll_loss=2.304, ppl=4.94, wps=459554, ups=1.06, wpb=432479, bsz=16380.5, num_updates=52900, lr=0.000274981, gnorm=0.228, clip=0, loss_scale=0.125, train_wall=93, gb_free=19.2, wall=52001 epoch 032: 625 / 1689 loss=3.948, nll_loss=2.304, ppl=4.94, wps=459554, ups=1.06, wpb=432479, bsz=16380.5, num_updates=52900, lr=0.000274981, gnorm=0.228, clip=0, loss_scale=0.125, train_wall=93, gb_free=19.2, wall=52001 epoch 032: 625 / 1689 loss=3.948, nll_loss=2.304, ppl=4.94, wps=459554, ups=1.06, wpb=432479, bsz=16380.5, num_updates=52900, lr=0.000274981, gnorm=0.228, clip=0, loss_scale=0.125, train_wall=93, gb_free=19.2, wall=52001 epoch 032: 625 / 1689 loss=3.948, nll_loss=2.304, ppl=4.94, wps=459554, ups=1.06, wpb=432479, bsz=16380.5, num_updates=52900, lr=0.000274981, gnorm=0.228, clip=0, loss_scale=0.125, train_wall=93, gb_free=19.2, wall=52001 epoch 032: 625 / 1689 loss=3.948, nll_loss=2.304, ppl=4.94, wps=459554, ups=1.06, wpb=432479, bsz=16380.5, num_updates=52900, lr=0.000274981, gnorm=0.228, clip=0, loss_scale=0.125, train_wall=93, gb_free=19.2, wall=52001 epoch 032: 625 / 1689 loss=3.948, nll_loss=2.304, ppl=4.94, wps=459554, ups=1.06, wpb=432479, bsz=16380.5, num_updates=52900, lr=0.000274981, gnorm=0.228, clip=0, loss_scale=0.125, train_wall=93, gb_free=19.2, wall=52001 epoch 032: 625 / 1689 loss=3.948, nll_loss=2.304, ppl=4.94, wps=459554, ups=1.06, wpb=432479, bsz=16380.5, num_updates=52900, lr=0.000274981, gnorm=0.228, clip=0, loss_scale=0.125, train_wall=93, gb_free=19.2, wall=52001 epoch 032: 625 / 1689 loss=3.948, nll_loss=2.304, ppl=4.94, wps=459554, ups=1.06, wpb=432479, bsz=16380.5, num_updates=52900, lr=0.000274981, gnorm=0.228, clip=0, loss_scale=0.125, train_wall=93, gb_free=19.2, wall=52001 epoch 032: 625 / 1689 loss=3.948, nll_loss=2.304, ppl=4.94, wps=459554, ups=1.06, wpb=432479, bsz=16380.5, num_updates=52900, lr=0.000274981, gnorm=0.228, clip=0, loss_scale=0.125, train_wall=93, gb_free=19.2, wall=52001 epoch 032: 625 / 1689 loss=3.948, nll_loss=2.304, ppl=4.94, wps=459554, ups=1.06, wpb=432479, bsz=16380.5, num_updates=52900, lr=0.000274981, gnorm=0.228, clip=0, loss_scale=0.125, train_wall=93, gb_free=19.2, wall=52001 epoch 032: 625 / 1689 loss=3.948, nll_loss=2.304, ppl=4.94, wps=459554, ups=1.06, wpb=432479, bsz=16380.5, num_updates=52900, lr=0.000274981, gnorm=0.228, clip=0, loss_scale=0.125, train_wall=93, gb_free=19.2, wall=52001 epoch 032: 625 / 1689 loss=3.948, nll_loss=2.304, ppl=4.94, wps=459554, ups=1.06, wpb=432479, bsz=16380.5, num_updates=52900, lr=0.000274981, gnorm=0.228, clip=0, loss_scale=0.125, train_wall=93, gb_free=19.2, wall=52001 epoch 032: 625 / 1689 loss=3.948, nll_loss=2.304, ppl=4.94, wps=459554, ups=1.06, wpb=432479, bsz=16380.5, num_updates=52900, lr=0.000274981, gnorm=0.228, clip=0, loss_scale=0.125, train_wall=93, gb_free=19.2, wall=52001 epoch 032: 625 / 1689 loss=3.948, nll_loss=2.304, ppl=4.94, wps=459554, ups=1.06, wpb=432479, bsz=16380.5, num_updates=52900, lr=0.000274981, gnorm=0.228, clip=0, loss_scale=0.125, train_wall=93, gb_free=19.2, wall=52001 epoch 032: 625 / 1689 loss=3.948, nll_loss=2.304, ppl=4.94, wps=459554, ups=1.06, wpb=432479, bsz=16380.5, num_updates=52900, lr=0.000274981, gnorm=0.228, clip=0, loss_scale=0.125, train_wall=93, gb_free=19.2, wall=52001 epoch 032: 625 / 1689 loss=3.948, nll_loss=2.304, ppl=4.94, wps=459554, ups=1.06, wpb=432479, bsz=16380.5, num_updates=52900, lr=0.000274981, gnorm=0.228, clip=0, loss_scale=0.125, train_wall=93, gb_free=19.2, wall=52001 epoch 032: 625 / 1689 loss=3.948, nll_loss=2.304, ppl=4.94, wps=459554, ups=1.06, wpb=432479, bsz=16380.5, num_updates=52900, lr=0.000274981, gnorm=0.228, clip=0, loss_scale=0.125, train_wall=93, gb_free=19.2, wall=52001 epoch 032: 625 / 1689 loss=3.948, nll_loss=2.304, ppl=4.94, wps=459554, ups=1.06, wpb=432479, bsz=16380.5, num_updates=52900, lr=0.000274981, gnorm=0.228, clip=0, loss_scale=0.125, train_wall=93, gb_free=19.2, wall=52001 epoch 032: 625 / 1689 loss=3.948, nll_loss=2.304, ppl=4.94, wps=459554, ups=1.06, wpb=432479, bsz=16380.5, num_updates=52900, lr=0.000274981, gnorm=0.228, clip=0, loss_scale=0.125, train_wall=93, gb_free=19.2, wall=52001 epoch 032: 625 / 1689 loss=3.948, nll_loss=2.304, ppl=4.94, wps=459554, ups=1.06, wpb=432479, bsz=16380.5, num_updates=52900, lr=0.000274981, gnorm=0.228, clip=0, loss_scale=0.125, train_wall=93, gb_free=19.2, wall=52001 epoch 032: 625 / 1689 loss=3.948, nll_loss=2.304, ppl=4.94, wps=459554, ups=1.06, wpb=432479, bsz=16380.5, num_updates=52900, lr=0.000274981, gnorm=0.228, clip=0, loss_scale=0.125, train_wall=93, gb_free=19.2, wall=52001 epoch 032: 625 / 1689 loss=3.948, nll_loss=2.304, ppl=4.94, wps=459554, ups=1.06, wpb=432479, bsz=16380.5, num_updates=52900, lr=0.000274981, gnorm=0.228, clip=0, loss_scale=0.125, train_wall=93, gb_free=19.2, wall=52001 epoch 032: 625 / 1689 loss=3.948, nll_loss=2.304, ppl=4.94, wps=459554, ups=1.06, wpb=432479, bsz=16380.5, num_updates=52900, lr=0.000274981, gnorm=0.228, clip=0, loss_scale=0.125, train_wall=93, gb_free=19.2, wall=52001 epoch 032: 625 / 1689 loss=3.948, nll_loss=2.304, ppl=4.94, wps=459554, ups=1.06, wpb=432479, bsz=16380.5, num_updates=52900, lr=0.000274981, gnorm=0.228, clip=0, loss_scale=0.125, train_wall=93, gb_free=19.2, wall=52001 epoch 032: 725 / 1689 loss=3.948, nll_loss=2.305, ppl=4.94, wps=460951, ups=1.06, wpb=433303, bsz=16640.6, num_updates=53000, lr=0.000274721, gnorm=0.221, clip=0, loss_scale=0.25, train_wall=92, gb_free=20.4, wall=52095 epoch 032: 725 / 1689 loss=3.948, nll_loss=2.305, ppl=4.94, wps=460951, ups=1.06, wpb=433303, bsz=16640.6, num_updates=53000, lr=0.000274721, gnorm=0.221, clip=0, loss_scale=0.25, train_wall=92, gb_free=20.4, wall=52095 epoch 032: 725 / 1689 loss=3.948, nll_loss=2.305, ppl=4.94, wps=460951, ups=1.06, wpb=433303, bsz=16640.6, num_updates=53000, lr=0.000274721, gnorm=0.221, clip=0, loss_scale=0.25, train_wall=92, gb_free=20.4, wall=52095 epoch 032: 725 / 1689 loss=3.948, nll_loss=2.305, ppl=4.94, wps=460951, ups=1.06, wpb=433303, bsz=16640.6, num_updates=53000, lr=0.000274721, gnorm=0.221, clip=0, loss_scale=0.25, train_wall=92, gb_free=20.4, wall=52095 epoch 032: 725 / 1689 loss=3.948, nll_loss=2.305, ppl=4.94, wps=460951, ups=1.06, wpb=433303, bsz=16640.6, num_updates=53000, lr=0.000274721, gnorm=0.221, clip=0, loss_scale=0.25, train_wall=92, gb_free=20.4, wall=52095 epoch 032: 725 / 1689 loss=3.948, nll_loss=2.305, ppl=4.94, wps=460951, ups=1.06, wpb=433303, bsz=16640.6, num_updates=53000, lr=0.000274721, gnorm=0.221, clip=0, loss_scale=0.25, train_wall=92, gb_free=20.4, wall=52095 epoch 032: 725 / 1689 loss=3.948, nll_loss=2.305, ppl=4.94, wps=460951, ups=1.06, wpb=433303, bsz=16640.6, num_updates=53000, lr=0.000274721, gnorm=0.221, clip=0, loss_scale=0.25, train_wall=92, gb_free=20.4, wall=52095 epoch 032: 725 / 1689 loss=3.948, nll_loss=2.305, ppl=4.94, wps=460951, ups=1.06, wpb=433303, bsz=16640.6, num_updates=53000, lr=0.000274721, gnorm=0.221, clip=0, loss_scale=0.25, train_wall=92, gb_free=20.4, wall=52095 epoch 032: 725 / 1689 loss=3.948, nll_loss=2.305, ppl=4.94, wps=460951, ups=1.06, wpb=433303, bsz=16640.6, num_updates=53000, lr=0.000274721, gnorm=0.221, clip=0, loss_scale=0.25, train_wall=92, gb_free=20.4, wall=52095 epoch 032: 725 / 1689 loss=3.948, nll_loss=2.305, ppl=4.94, wps=460951, ups=1.06, wpb=433303, bsz=16640.6, num_updates=53000, lr=0.000274721, gnorm=0.221, clip=0, loss_scale=0.25, train_wall=92, gb_free=20.4, wall=52095 epoch 032: 725 / 1689 loss=3.948, nll_loss=2.305, ppl=4.94, wps=460951, ups=1.06, wpb=433303, bsz=16640.6, num_updates=53000, lr=0.000274721, gnorm=0.221, clip=0, loss_scale=0.25, train_wall=92, gb_free=20.4, wall=52095 epoch 032: 725 / 1689 loss=3.948, nll_loss=2.305, ppl=4.94, wps=460951, ups=1.06, wpb=433303, bsz=16640.6, num_updates=53000, lr=0.000274721, gnorm=0.221, clip=0, loss_scale=0.25, train_wall=92, gb_free=20.4, wall=52095 epoch 032: 725 / 1689 loss=3.948, nll_loss=2.305, ppl=4.94, wps=460951, ups=1.06, wpb=433303, bsz=16640.6, num_updates=53000, lr=0.000274721, gnorm=0.221, clip=0, loss_scale=0.25, train_wall=92, gb_free=20.4, wall=52095 epoch 032: 725 / 1689 loss=3.948, nll_loss=2.305, ppl=4.94, wps=460951, ups=1.06, wpb=433303, bsz=16640.6, num_updates=53000, lr=0.000274721, gnorm=0.221, clip=0, loss_scale=0.25, train_wall=92, gb_free=20.4, wall=52095 epoch 032: 725 / 1689 loss=3.948, nll_loss=2.305, ppl=4.94, wps=460951, ups=1.06, wpb=433303, bsz=16640.6, num_updates=53000, lr=0.000274721, gnorm=0.221, clip=0, loss_scale=0.25, train_wall=92, gb_free=20.4, wall=52095 epoch 032: 725 / 1689 loss=3.948, nll_loss=2.305, ppl=4.94, wps=460951, ups=1.06, wpb=433303, bsz=16640.6, num_updates=53000, lr=0.000274721, gnorm=0.221, clip=0, loss_scale=0.25, train_wall=92, gb_free=20.4, wall=52095 epoch 032: 725 / 1689 loss=3.948, nll_loss=2.305, ppl=4.94, wps=460951, ups=1.06, wpb=433303, bsz=16640.6, num_updates=53000, lr=0.000274721, gnorm=0.221, clip=0, loss_scale=0.25, train_wall=92, gb_free=20.4, wall=52095 epoch 032: 725 / 1689 loss=3.948, nll_loss=2.305, ppl=4.94, wps=460951, ups=1.06, wpb=433303, bsz=16640.6, num_updates=53000, lr=0.000274721, gnorm=0.221, clip=0, loss_scale=0.25, train_wall=92, gb_free=20.4, wall=52095 epoch 032: 725 / 1689 loss=3.948, nll_loss=2.305, ppl=4.94, wps=460951, ups=1.06, wpb=433303, bsz=16640.6, num_updates=53000, lr=0.000274721, gnorm=0.221, clip=0, loss_scale=0.25, train_wall=92, gb_free=20.4, wall=52095 epoch 032: 725 / 1689 loss=3.948, nll_loss=2.305, ppl=4.94, wps=460951, ups=1.06, wpb=433303, bsz=16640.6, num_updates=53000, lr=0.000274721, gnorm=0.221, clip=0, loss_scale=0.25, train_wall=92, gb_free=20.4, wall=52095 epoch 032: 725 / 1689 loss=3.948, nll_loss=2.305, ppl=4.94, wps=460951, ups=1.06, wpb=433303, bsz=16640.6, num_updates=53000, lr=0.000274721, gnorm=0.221, clip=0, loss_scale=0.25, train_wall=92, gb_free=20.4, wall=52095 epoch 032: 725 / 1689 loss=3.948, nll_loss=2.305, ppl=4.94, wps=460951, ups=1.06, wpb=433303, bsz=16640.6, num_updates=53000, lr=0.000274721, gnorm=0.221, clip=0, loss_scale=0.25, train_wall=92, gb_free=20.4, wall=52095 epoch 032: 725 / 1689 loss=3.948, nll_loss=2.305, ppl=4.94, wps=460951, ups=1.06, wpb=433303, bsz=16640.6, num_updates=53000, lr=0.000274721, gnorm=0.221, clip=0, loss_scale=0.25, train_wall=92, gb_free=20.4, wall=52095 epoch 032: 725 / 1689 loss=3.948, nll_loss=2.305, ppl=4.94, wps=460951, ups=1.06, wpb=433303, bsz=16640.6, num_updates=53000, lr=0.000274721, gnorm=0.221, clip=0, loss_scale=0.25, train_wall=92, gb_free=20.4, wall=52095 epoch 032: 725 / 1689 loss=3.948, nll_loss=2.305, ppl=4.94, wps=460951, ups=1.06, wpb=433303, bsz=16640.6, num_updates=53000, lr=0.000274721, gnorm=0.221, clip=0, loss_scale=0.25, train_wall=92, gb_free=20.4, wall=52095 epoch 032: 725 / 1689 loss=3.948, nll_loss=2.305, ppl=4.94, wps=460951, ups=1.06, wpb=433303, bsz=16640.6, num_updates=53000, lr=0.000274721, gnorm=0.221, clip=0, loss_scale=0.25, train_wall=92, gb_free=20.4, wall=52095 epoch 032: 725 / 1689 loss=3.948, nll_loss=2.305, ppl=4.94, wps=460951, ups=1.06, wpb=433303, bsz=16640.6, num_updates=53000, lr=0.000274721, gnorm=0.221, clip=0, loss_scale=0.25, train_wall=92, gb_free=20.4, wall=52095 epoch 032: 725 / 1689 loss=3.948, nll_loss=2.305, ppl=4.94, wps=460951, ups=1.06, wpb=433303, bsz=16640.6, num_updates=53000, lr=0.000274721, gnorm=0.221, clip=0, loss_scale=0.25, train_wall=92, gb_free=20.4, wall=52095 epoch 032: 725 / 1689 loss=3.948, nll_loss=2.305, ppl=4.94, wps=460951, ups=1.06, wpb=433303, bsz=16640.6, num_updates=53000, lr=0.000274721, gnorm=0.221, clip=0, loss_scale=0.25, train_wall=92, gb_free=20.4, wall=52095 epoch 032: 725 / 1689 loss=3.948, nll_loss=2.305, ppl=4.94, wps=460951, ups=1.06, wpb=433303, bsz=16640.6, num_updates=53000, lr=0.000274721, gnorm=0.221, clip=0, loss_scale=0.25, train_wall=92, gb_free=20.4, wall=52095 epoch 032: 725 / 1689 loss=3.948, nll_loss=2.305, ppl=4.94, wps=460951, ups=1.06, wpb=433303, bsz=16640.6, num_updates=53000, lr=0.000274721, gnorm=0.221, clip=0, loss_scale=0.25, train_wall=92, gb_free=20.4, wall=52095 epoch 032: 725 / 1689 loss=3.948, nll_loss=2.305, ppl=4.94, wps=460951, ups=1.06, wpb=433303, bsz=16640.6, num_updates=53000, lr=0.000274721, gnorm=0.221, clip=0, loss_scale=0.25, train_wall=92, gb_free=20.4, wall=52095 begin validation on "valid" subset epoch 032 | valid on 'valid' subset | loss 4.249 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.238 epoch 032 | valid on 'valid' subset | loss 4.249 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.238 epoch 032 | valid on 'valid' subset | loss 4.249 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.238 epoch 032 | valid on 'valid' subset | loss 4.249 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.238 epoch 032 | valid on 'valid' subset | loss 4.249 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.238 epoch 032 | valid on 'valid' subset | loss 4.249 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.238 epoch 032 | valid on 'valid' subset | loss 4.249 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.238 epoch 032 | valid on 'valid' subset | loss 4.249 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.238 epoch 032 | valid on 'valid' subset | loss 4.249 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.238 epoch 032 | valid on 'valid' subset | loss 4.249 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.238 epoch 032 | valid on 'valid' subset | loss 4.249 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.238 epoch 032 | valid on 'valid' subset | loss 4.249 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.238 epoch 032 | valid on 'valid' subset | loss 4.249 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.238 epoch 032 | valid on 'valid' subset | loss 4.249 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.238 epoch 032 | valid on 'valid' subset | loss 4.249 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.238 epoch 032 | valid on 'valid' subset | loss 4.249 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.238 epoch 032 | valid on 'valid' subset | loss 4.249 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.238 epoch 032 | valid on 'valid' subset | loss 4.249 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.238 epoch 032 | valid on 'valid' subset | loss 4.249 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.238 epoch 032 | valid on 'valid' subset | loss 4.249 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.238 epoch 032 | valid on 'valid' subset | loss 4.249 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.238 epoch 032 | valid on 'valid' subset | loss 4.249 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.238 epoch 032 | valid on 'valid' subset | loss 4.249 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.238 epoch 032 | valid on 'valid' subset | loss 4.249 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.238 epoch 032 | valid on 'valid' subset | loss 4.249 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.238 epoch 032 | valid on 'valid' subset | loss 4.249 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.238 epoch 032 | valid on 'valid' subset | loss 4.249 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.238 epoch 032 | valid on 'valid' subset | loss 4.249 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.238 epoch 032 | valid on 'valid' subset | loss 4.249 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.238 epoch 032 | valid on 'valid' subset | loss 4.249 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.238 epoch 032 | valid on 'valid' subset | loss 4.249 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.238 epoch 032 | valid on 'valid' subset | loss 4.249 | nll_loss 2.627 | ppl 6.18 | wps 0 | wpb 42662 | bsz 2032 | num_updates 53000 | best_loss 4.238 epoch 032: 825 / 1689 loss=3.957, nll_loss=2.315, ppl=4.97, wps=402141, ups=0.92, wpb=435985, bsz=15989.8, num_updates=53100, lr=0.000274462, gnorm=0.224, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=52203 epoch 032: 825 / 1689 loss=3.957, nll_loss=2.315, ppl=4.97, wps=402141, ups=0.92, wpb=435985, bsz=15989.8, num_updates=53100, lr=0.000274462, gnorm=0.224, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=52203 epoch 032: 825 / 1689 loss=3.957, nll_loss=2.315, ppl=4.97, wps=402141, ups=0.92, wpb=435985, bsz=15989.8, num_updates=53100, lr=0.000274462, gnorm=0.224, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=52203 epoch 032: 825 / 1689 loss=3.957, nll_loss=2.315, ppl=4.97, wps=402141, ups=0.92, wpb=435985, bsz=15989.8, num_updates=53100, lr=0.000274462, gnorm=0.224, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=52203 epoch 032: 825 / 1689 loss=3.957, nll_loss=2.315, ppl=4.97, wps=402141, ups=0.92, wpb=435985, bsz=15989.8, num_updates=53100, lr=0.000274462, gnorm=0.224, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=52203 epoch 032: 825 / 1689 loss=3.957, nll_loss=2.315, ppl=4.97, wps=402141, ups=0.92, wpb=435985, bsz=15989.8, num_updates=53100, lr=0.000274462, gnorm=0.224, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=52203 epoch 032: 825 / 1689 loss=3.957, nll_loss=2.315, ppl=4.97, wps=402141, ups=0.92, wpb=435985, bsz=15989.8, num_updates=53100, lr=0.000274462, gnorm=0.224, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=52203 epoch 032: 825 / 1689 loss=3.957, nll_loss=2.315, ppl=4.97, wps=402141, ups=0.92, wpb=435985, bsz=15989.8, num_updates=53100, lr=0.000274462, gnorm=0.224, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=52203 epoch 032: 825 / 1689 loss=3.957, nll_loss=2.315, ppl=4.97, wps=402141, ups=0.92, wpb=435985, bsz=15989.8, num_updates=53100, lr=0.000274462, gnorm=0.224, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=52203 epoch 032: 825 / 1689 loss=3.957, nll_loss=2.315, ppl=4.97, wps=402141, ups=0.92, wpb=435985, bsz=15989.8, num_updates=53100, lr=0.000274462, gnorm=0.224, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=52203 epoch 032: 825 / 1689 loss=3.957, nll_loss=2.315, ppl=4.97, wps=402141, ups=0.92, wpb=435985, bsz=15989.8, num_updates=53100, lr=0.000274462, gnorm=0.224, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=52203 epoch 032: 825 / 1689 loss=3.957, nll_loss=2.315, ppl=4.97, wps=402141, ups=0.92, wpb=435985, bsz=15989.8, num_updates=53100, lr=0.000274462, gnorm=0.224, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=52203 epoch 032: 825 / 1689 loss=3.957, nll_loss=2.315, ppl=4.97, wps=402141, ups=0.92, wpb=435985, bsz=15989.8, num_updates=53100, lr=0.000274462, gnorm=0.224, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=52203 epoch 032: 825 / 1689 loss=3.957, nll_loss=2.315, ppl=4.97, wps=402141, ups=0.92, wpb=435985, bsz=15989.8, num_updates=53100, lr=0.000274462, gnorm=0.224, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=52203 epoch 032: 825 / 1689 loss=3.957, nll_loss=2.315, ppl=4.97, wps=402141, ups=0.92, wpb=435985, bsz=15989.8, num_updates=53100, lr=0.000274462, gnorm=0.224, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=52203 epoch 032: 825 / 1689 loss=3.957, nll_loss=2.315, ppl=4.97, wps=402141, ups=0.92, wpb=435985, bsz=15989.8, num_updates=53100, lr=0.000274462, gnorm=0.224, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=52203 epoch 032: 825 / 1689 loss=3.957, nll_loss=2.315, ppl=4.97, wps=402141, ups=0.92, wpb=435985, bsz=15989.8, num_updates=53100, lr=0.000274462, gnorm=0.224, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=52203 epoch 032: 825 / 1689 loss=3.957, nll_loss=2.315, ppl=4.97, wps=402141, ups=0.92, wpb=435985, bsz=15989.8, num_updates=53100, lr=0.000274462, gnorm=0.224, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=52203 epoch 032: 825 / 1689 loss=3.957, nll_loss=2.315, ppl=4.97, wps=402141, ups=0.92, wpb=435985, bsz=15989.8, num_updates=53100, lr=0.000274462, gnorm=0.224, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=52203 epoch 032: 825 / 1689 loss=3.957, nll_loss=2.315, ppl=4.97, wps=402141, ups=0.92, wpb=435985, bsz=15989.8, num_updates=53100, lr=0.000274462, gnorm=0.224, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=52203 epoch 032: 825 / 1689 loss=3.957, nll_loss=2.315, ppl=4.97, wps=402141, ups=0.92, wpb=435985, bsz=15989.8, num_updates=53100, lr=0.000274462, gnorm=0.224, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=52203 epoch 032: 825 / 1689 loss=3.957, nll_loss=2.315, ppl=4.97, wps=402141, ups=0.92, wpb=435985, bsz=15989.8, num_updates=53100, lr=0.000274462, gnorm=0.224, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=52203 epoch 032: 825 / 1689 loss=3.957, nll_loss=2.315, ppl=4.97, wps=402141, ups=0.92, wpb=435985, bsz=15989.8, num_updates=53100, lr=0.000274462, gnorm=0.224, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=52203 epoch 032: 825 / 1689 loss=3.957, nll_loss=2.315, ppl=4.97, wps=402141, ups=0.92, wpb=435985, bsz=15989.8, num_updates=53100, lr=0.000274462, gnorm=0.224, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=52203 epoch 032: 825 / 1689 loss=3.957, nll_loss=2.315, ppl=4.97, wps=402141, ups=0.92, wpb=435985, bsz=15989.8, num_updates=53100, lr=0.000274462, gnorm=0.224, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=52203 epoch 032: 825 / 1689 loss=3.957, nll_loss=2.315, ppl=4.97, wps=402141, ups=0.92, wpb=435985, bsz=15989.8, num_updates=53100, lr=0.000274462, gnorm=0.224, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=52203 epoch 032: 825 / 1689 loss=3.957, nll_loss=2.315, ppl=4.97, wps=402141, ups=0.92, wpb=435985, bsz=15989.8, num_updates=53100, lr=0.000274462, gnorm=0.224, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=52203 epoch 032: 825 / 1689 loss=3.957, nll_loss=2.315, ppl=4.97, wps=402141, ups=0.92, wpb=435985, bsz=15989.8, num_updates=53100, lr=0.000274462, gnorm=0.224, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=52203 epoch 032: 825 / 1689 loss=3.957, nll_loss=2.315, ppl=4.97, wps=402141, ups=0.92, wpb=435985, bsz=15989.8, num_updates=53100, lr=0.000274462, gnorm=0.224, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=52203 epoch 032: 825 / 1689 loss=3.957, nll_loss=2.315, ppl=4.97, wps=402141, ups=0.92, wpb=435985, bsz=15989.8, num_updates=53100, lr=0.000274462, gnorm=0.224, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=52203 epoch 032: 825 / 1689 loss=3.957, nll_loss=2.315, ppl=4.97, wps=402141, ups=0.92, wpb=435985, bsz=15989.8, num_updates=53100, lr=0.000274462, gnorm=0.224, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=52203 epoch 032: 825 / 1689 loss=3.957, nll_loss=2.315, ppl=4.97, wps=402141, ups=0.92, wpb=435985, bsz=15989.8, num_updates=53100, lr=0.000274462, gnorm=0.224, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=52203 epoch 032: 925 / 1689 loss=3.958, nll_loss=2.316, ppl=4.98, wps=463096, ups=1.07, wpb=433419, bsz=16550, num_updates=53200, lr=0.000274204, gnorm=0.208, clip=0, loss_scale=0.25, train_wall=92, gb_free=19, wall=52297 epoch 032: 925 / 1689 loss=3.958, nll_loss=2.316, ppl=4.98, wps=463096, ups=1.07, wpb=433419, bsz=16550, num_updates=53200, lr=0.000274204, gnorm=0.208, clip=0, loss_scale=0.25, train_wall=92, gb_free=19, wall=52297 epoch 032: 925 / 1689 loss=3.958, nll_loss=2.316, ppl=4.98, wps=463096, ups=1.07, wpb=433419, bsz=16550, num_updates=53200, lr=0.000274204, gnorm=0.208, clip=0, loss_scale=0.25, train_wall=92, gb_free=19, wall=52297 epoch 032: 925 / 1689 loss=3.958, nll_loss=2.316, ppl=4.98, wps=463096, ups=1.07, wpb=433419, bsz=16550, num_updates=53200, lr=0.000274204, gnorm=0.208, clip=0, loss_scale=0.25, train_wall=92, gb_free=19, wall=52297 epoch 032: 925 / 1689 loss=3.958, nll_loss=2.316, ppl=4.98, wps=463096, ups=1.07, wpb=433419, bsz=16550, num_updates=53200, lr=0.000274204, gnorm=0.208, clip=0, loss_scale=0.25, train_wall=92, gb_free=19, wall=52297 epoch 032: 925 / 1689 loss=3.958, nll_loss=2.316, ppl=4.98, wps=463096, ups=1.07, wpb=433419, bsz=16550, num_updates=53200, lr=0.000274204, gnorm=0.208, clip=0, loss_scale=0.25, train_wall=92, gb_free=19, wall=52297 epoch 032: 925 / 1689 loss=3.958, nll_loss=2.316, ppl=4.98, wps=463096, ups=1.07, wpb=433419, bsz=16550, num_updates=53200, lr=0.000274204, gnorm=0.208, clip=0, loss_scale=0.25, train_wall=92, gb_free=19, wall=52297 epoch 032: 925 / 1689 loss=3.958, nll_loss=2.316, ppl=4.98, wps=463096, ups=1.07, wpb=433419, bsz=16550, num_updates=53200, lr=0.000274204, gnorm=0.208, clip=0, loss_scale=0.25, train_wall=92, gb_free=19, wall=52297 epoch 032: 925 / 1689 loss=3.958, nll_loss=2.316, ppl=4.98, wps=463096, ups=1.07, wpb=433419, bsz=16550, num_updates=53200, lr=0.000274204, gnorm=0.208, clip=0, loss_scale=0.25, train_wall=92, gb_free=19, wall=52297 epoch 032: 925 / 1689 loss=3.958, nll_loss=2.316, ppl=4.98, wps=463096, ups=1.07, wpb=433419, bsz=16550, num_updates=53200, lr=0.000274204, gnorm=0.208, clip=0, loss_scale=0.25, train_wall=92, gb_free=19, wall=52297 epoch 032: 925 / 1689 loss=3.958, nll_loss=2.316, ppl=4.98, wps=463096, ups=1.07, wpb=433419, bsz=16550, num_updates=53200, lr=0.000274204, gnorm=0.208, clip=0, loss_scale=0.25, train_wall=92, gb_free=19, wall=52297 epoch 032: 925 / 1689 loss=3.958, nll_loss=2.316, ppl=4.98, wps=463096, ups=1.07, wpb=433419, bsz=16550, num_updates=53200, lr=0.000274204, gnorm=0.208, clip=0, loss_scale=0.25, train_wall=92, gb_free=19, wall=52297 epoch 032: 925 / 1689 loss=3.958, nll_loss=2.316, ppl=4.98, wps=463096, ups=1.07, wpb=433419, bsz=16550, num_updates=53200, lr=0.000274204, gnorm=0.208, clip=0, loss_scale=0.25, train_wall=92, gb_free=19, wall=52297 epoch 032: 925 / 1689 loss=3.958, nll_loss=2.316, ppl=4.98, wps=463096, ups=1.07, wpb=433419, bsz=16550, num_updates=53200, lr=0.000274204, gnorm=0.208, clip=0, loss_scale=0.25, train_wall=92, gb_free=19, wall=52297 epoch 032: 925 / 1689 loss=3.958, nll_loss=2.316, ppl=4.98, wps=463096, ups=1.07, wpb=433419, bsz=16550, num_updates=53200, lr=0.000274204, gnorm=0.208, clip=0, loss_scale=0.25, train_wall=92, gb_free=19, wall=52297 epoch 032: 925 / 1689 loss=3.958, nll_loss=2.316, ppl=4.98, wps=463096, ups=1.07, wpb=433419, bsz=16550, num_updates=53200, lr=0.000274204, gnorm=0.208, clip=0, loss_scale=0.25, train_wall=92, gb_free=19, wall=52297 epoch 032: 925 / 1689 loss=3.958, nll_loss=2.316, ppl=4.98, wps=463096, ups=1.07, wpb=433419, bsz=16550, num_updates=53200, lr=0.000274204, gnorm=0.208, clip=0, loss_scale=0.25, train_wall=92, gb_free=19, wall=52297 epoch 032: 925 / 1689 loss=3.958, nll_loss=2.316, ppl=4.98, wps=463096, ups=1.07, wpb=433419, bsz=16550, num_updates=53200, lr=0.000274204, gnorm=0.208, clip=0, loss_scale=0.25, train_wall=92, gb_free=19, wall=52297 epoch 032: 925 / 1689 loss=3.958, nll_loss=2.316, ppl=4.98, wps=463096, ups=1.07, wpb=433419, bsz=16550, num_updates=53200, lr=0.000274204, gnorm=0.208, clip=0, loss_scale=0.25, train_wall=92, gb_free=19, wall=52297 epoch 032: 925 / 1689 loss=3.958, nll_loss=2.316, ppl=4.98, wps=463096, ups=1.07, wpb=433419, bsz=16550, num_updates=53200, lr=0.000274204, gnorm=0.208, clip=0, loss_scale=0.25, train_wall=92, gb_free=19, wall=52297 epoch 032: 925 / 1689 loss=3.958, nll_loss=2.316, ppl=4.98, wps=463096, ups=1.07, wpb=433419, bsz=16550, num_updates=53200, lr=0.000274204, gnorm=0.208, clip=0, loss_scale=0.25, train_wall=92, gb_free=19, wall=52297 epoch 032: 925 / 1689 loss=3.958, nll_loss=2.316, ppl=4.98, wps=463096, ups=1.07, wpb=433419, bsz=16550, num_updates=53200, lr=0.000274204, gnorm=0.208, clip=0, loss_scale=0.25, train_wall=92, gb_free=19, wall=52297 epoch 032: 925 / 1689 loss=3.958, nll_loss=2.316, ppl=4.98, wps=463096, ups=1.07, wpb=433419, bsz=16550, num_updates=53200, lr=0.000274204, gnorm=0.208, clip=0, loss_scale=0.25, train_wall=92, gb_free=19, wall=52297 epoch 032: 925 / 1689 loss=3.958, nll_loss=2.316, ppl=4.98, wps=463096, ups=1.07, wpb=433419, bsz=16550, num_updates=53200, lr=0.000274204, gnorm=0.208, clip=0, loss_scale=0.25, train_wall=92, gb_free=19, wall=52297 epoch 032: 925 / 1689 loss=3.958, nll_loss=2.316, ppl=4.98, wps=463096, ups=1.07, wpb=433419, bsz=16550, num_updates=53200, lr=0.000274204, gnorm=0.208, clip=0, loss_scale=0.25, train_wall=92, gb_free=19, wall=52297 epoch 032: 925 / 1689 loss=3.958, nll_loss=2.316, ppl=4.98, wps=463096, ups=1.07, wpb=433419, bsz=16550, num_updates=53200, lr=0.000274204, gnorm=0.208, clip=0, loss_scale=0.25, train_wall=92, gb_free=19, wall=52297 epoch 032: 925 / 1689 loss=3.958, nll_loss=2.316, ppl=4.98, wps=463096, ups=1.07, wpb=433419, bsz=16550, num_updates=53200, lr=0.000274204, gnorm=0.208, clip=0, loss_scale=0.25, train_wall=92, gb_free=19, wall=52297 epoch 032: 925 / 1689 loss=3.958, nll_loss=2.316, ppl=4.98, wps=463096, ups=1.07, wpb=433419, bsz=16550, num_updates=53200, lr=0.000274204, gnorm=0.208, clip=0, loss_scale=0.25, train_wall=92, gb_free=19, wall=52297 epoch 032: 925 / 1689 loss=3.958, nll_loss=2.316, ppl=4.98, wps=463096, ups=1.07, wpb=433419, bsz=16550, num_updates=53200, lr=0.000274204, gnorm=0.208, clip=0, loss_scale=0.25, train_wall=92, gb_free=19, wall=52297 epoch 032: 925 / 1689 loss=3.958, nll_loss=2.316, ppl=4.98, wps=463096, ups=1.07, wpb=433419, bsz=16550, num_updates=53200, lr=0.000274204, gnorm=0.208, clip=0, loss_scale=0.25, train_wall=92, gb_free=19, wall=52297 epoch 032: 925 / 1689 loss=3.958, nll_loss=2.316, ppl=4.98, wps=463096, ups=1.07, wpb=433419, bsz=16550, num_updates=53200, lr=0.000274204, gnorm=0.208, clip=0, loss_scale=0.25, train_wall=92, gb_free=19, wall=52297 epoch 032: 925 / 1689 loss=3.958, nll_loss=2.316, ppl=4.98, wps=463096, ups=1.07, wpb=433419, bsz=16550, num_updates=53200, lr=0.000274204, gnorm=0.208, clip=0, loss_scale=0.25, train_wall=92, gb_free=19, wall=52297 epoch 032: 1025 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=460724, ups=1.06, wpb=432860, bsz=16105.4, num_updates=53300, lr=0.000273947, gnorm=0.21, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.4, wall=52391 epoch 032: 1025 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=460724, ups=1.06, wpb=432860, bsz=16105.4, num_updates=53300, lr=0.000273947, gnorm=0.21, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.4, wall=52391 epoch 032: 1025 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=460724, ups=1.06, wpb=432860, bsz=16105.4, num_updates=53300, lr=0.000273947, gnorm=0.21, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.4, wall=52391 epoch 032: 1025 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=460724, ups=1.06, wpb=432860, bsz=16105.4, num_updates=53300, lr=0.000273947, gnorm=0.21, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.4, wall=52391 epoch 032: 1025 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=460724, ups=1.06, wpb=432860, bsz=16105.4, num_updates=53300, lr=0.000273947, gnorm=0.21, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.4, wall=52391 epoch 032: 1025 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=460724, ups=1.06, wpb=432860, bsz=16105.4, num_updates=53300, lr=0.000273947, gnorm=0.21, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.4, wall=52391 epoch 032: 1025 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=460724, ups=1.06, wpb=432860, bsz=16105.4, num_updates=53300, lr=0.000273947, gnorm=0.21, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.4, wall=52391 epoch 032: 1025 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=460724, ups=1.06, wpb=432860, bsz=16105.4, num_updates=53300, lr=0.000273947, gnorm=0.21, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.4, wall=52391 epoch 032: 1025 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=460724, ups=1.06, wpb=432860, bsz=16105.4, num_updates=53300, lr=0.000273947, gnorm=0.21, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.4, wall=52391 epoch 032: 1025 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=460724, ups=1.06, wpb=432860, bsz=16105.4, num_updates=53300, lr=0.000273947, gnorm=0.21, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.4, wall=52391 epoch 032: 1025 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=460724, ups=1.06, wpb=432860, bsz=16105.4, num_updates=53300, lr=0.000273947, gnorm=0.21, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.4, wall=52391 epoch 032: 1025 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=460724, ups=1.06, wpb=432860, bsz=16105.4, num_updates=53300, lr=0.000273947, gnorm=0.21, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.4, wall=52391 epoch 032: 1025 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=460724, ups=1.06, wpb=432860, bsz=16105.4, num_updates=53300, lr=0.000273947, gnorm=0.21, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.4, wall=52391 epoch 032: 1025 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=460724, ups=1.06, wpb=432860, bsz=16105.4, num_updates=53300, lr=0.000273947, gnorm=0.21, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.4, wall=52391 epoch 032: 1025 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=460724, ups=1.06, wpb=432860, bsz=16105.4, num_updates=53300, lr=0.000273947, gnorm=0.21, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.4, wall=52391 epoch 032: 1025 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=460724, ups=1.06, wpb=432860, bsz=16105.4, num_updates=53300, lr=0.000273947, gnorm=0.21, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.4, wall=52391 epoch 032: 1025 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=460724, ups=1.06, wpb=432860, bsz=16105.4, num_updates=53300, lr=0.000273947, gnorm=0.21, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.4, wall=52391 epoch 032: 1025 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=460724, ups=1.06, wpb=432860, bsz=16105.4, num_updates=53300, lr=0.000273947, gnorm=0.21, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.4, wall=52391 epoch 032: 1025 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=460724, ups=1.06, wpb=432860, bsz=16105.4, num_updates=53300, lr=0.000273947, gnorm=0.21, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.4, wall=52391 epoch 032: 1025 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=460724, ups=1.06, wpb=432860, bsz=16105.4, num_updates=53300, lr=0.000273947, gnorm=0.21, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.4, wall=52391 epoch 032: 1025 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=460724, ups=1.06, wpb=432860, bsz=16105.4, num_updates=53300, lr=0.000273947, gnorm=0.21, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.4, wall=52391 epoch 032: 1025 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=460724, ups=1.06, wpb=432860, bsz=16105.4, num_updates=53300, lr=0.000273947, gnorm=0.21, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.4, wall=52391 epoch 032: 1025 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=460724, ups=1.06, wpb=432860, bsz=16105.4, num_updates=53300, lr=0.000273947, gnorm=0.21, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.4, wall=52391 epoch 032: 1025 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=460724, ups=1.06, wpb=432860, bsz=16105.4, num_updates=53300, lr=0.000273947, gnorm=0.21, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.4, wall=52391 epoch 032: 1025 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=460724, ups=1.06, wpb=432860, bsz=16105.4, num_updates=53300, lr=0.000273947, gnorm=0.21, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.4, wall=52391 epoch 032: 1025 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=460724, ups=1.06, wpb=432860, bsz=16105.4, num_updates=53300, lr=0.000273947, gnorm=0.21, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.4, wall=52391 epoch 032: 1025 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=460724, ups=1.06, wpb=432860, bsz=16105.4, num_updates=53300, lr=0.000273947, gnorm=0.21, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.4, wall=52391 epoch 032: 1025 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=460724, ups=1.06, wpb=432860, bsz=16105.4, num_updates=53300, lr=0.000273947, gnorm=0.21, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.4, wall=52391 epoch 032: 1025 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=460724, ups=1.06, wpb=432860, bsz=16105.4, num_updates=53300, lr=0.000273947, gnorm=0.21, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.4, wall=52391 epoch 032: 1025 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=460724, ups=1.06, wpb=432860, bsz=16105.4, num_updates=53300, lr=0.000273947, gnorm=0.21, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.4, wall=52391 epoch 032: 1025 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=460724, ups=1.06, wpb=432860, bsz=16105.4, num_updates=53300, lr=0.000273947, gnorm=0.21, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.4, wall=52391 epoch 032: 1025 / 1689 loss=3.952, nll_loss=2.309, ppl=4.95, wps=460724, ups=1.06, wpb=432860, bsz=16105.4, num_updates=53300, lr=0.000273947, gnorm=0.21, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.4, wall=52391 epoch 032: 1125 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=461447, ups=1.06, wpb=435419, bsz=16459, num_updates=53400, lr=0.00027369, gnorm=0.215, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.8, wall=52485 epoch 032: 1125 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=461447, ups=1.06, wpb=435419, bsz=16459, num_updates=53400, lr=0.00027369, gnorm=0.215, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.8, wall=52485 epoch 032: 1125 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=461447, ups=1.06, wpb=435419, bsz=16459, num_updates=53400, lr=0.00027369, gnorm=0.215, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.8, wall=52485 epoch 032: 1125 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=461447, ups=1.06, wpb=435419, bsz=16459, num_updates=53400, lr=0.00027369, gnorm=0.215, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.8, wall=52485 epoch 032: 1125 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=461447, ups=1.06, wpb=435419, bsz=16459, num_updates=53400, lr=0.00027369, gnorm=0.215, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.8, wall=52485 epoch 032: 1125 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=461447, ups=1.06, wpb=435419, bsz=16459, num_updates=53400, lr=0.00027369, gnorm=0.215, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.8, wall=52485 epoch 032: 1125 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=461447, ups=1.06, wpb=435419, bsz=16459, num_updates=53400, lr=0.00027369, gnorm=0.215, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.8, wall=52485 epoch 032: 1125 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=461447, ups=1.06, wpb=435419, bsz=16459, num_updates=53400, lr=0.00027369, gnorm=0.215, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.8, wall=52485 epoch 032: 1125 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=461447, ups=1.06, wpb=435419, bsz=16459, num_updates=53400, lr=0.00027369, gnorm=0.215, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.8, wall=52485 epoch 032: 1125 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=461447, ups=1.06, wpb=435419, bsz=16459, num_updates=53400, lr=0.00027369, gnorm=0.215, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.8, wall=52485 epoch 032: 1125 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=461447, ups=1.06, wpb=435419, bsz=16459, num_updates=53400, lr=0.00027369, gnorm=0.215, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.8, wall=52485 epoch 032: 1125 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=461447, ups=1.06, wpb=435419, bsz=16459, num_updates=53400, lr=0.00027369, gnorm=0.215, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.8, wall=52485 epoch 032: 1125 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=461447, ups=1.06, wpb=435419, bsz=16459, num_updates=53400, lr=0.00027369, gnorm=0.215, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.8, wall=52485 epoch 032: 1125 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=461447, ups=1.06, wpb=435419, bsz=16459, num_updates=53400, lr=0.00027369, gnorm=0.215, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.8, wall=52485 epoch 032: 1125 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=461447, ups=1.06, wpb=435419, bsz=16459, num_updates=53400, lr=0.00027369, gnorm=0.215, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.8, wall=52485 epoch 032: 1125 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=461447, ups=1.06, wpb=435419, bsz=16459, num_updates=53400, lr=0.00027369, gnorm=0.215, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.8, wall=52485 epoch 032: 1125 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=461447, ups=1.06, wpb=435419, bsz=16459, num_updates=53400, lr=0.00027369, gnorm=0.215, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.8, wall=52485 epoch 032: 1125 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=461447, ups=1.06, wpb=435419, bsz=16459, num_updates=53400, lr=0.00027369, gnorm=0.215, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.8, wall=52485 epoch 032: 1125 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=461447, ups=1.06, wpb=435419, bsz=16459, num_updates=53400, lr=0.00027369, gnorm=0.215, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.8, wall=52485 epoch 032: 1125 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=461447, ups=1.06, wpb=435419, bsz=16459, num_updates=53400, lr=0.00027369, gnorm=0.215, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.8, wall=52485 epoch 032: 1125 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=461447, ups=1.06, wpb=435419, bsz=16459, num_updates=53400, lr=0.00027369, gnorm=0.215, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.8, wall=52485 epoch 032: 1125 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=461447, ups=1.06, wpb=435419, bsz=16459, num_updates=53400, lr=0.00027369, gnorm=0.215, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.8, wall=52485 epoch 032: 1125 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=461447, ups=1.06, wpb=435419, bsz=16459, num_updates=53400, lr=0.00027369, gnorm=0.215, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.8, wall=52485 epoch 032: 1125 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=461447, ups=1.06, wpb=435419, bsz=16459, num_updates=53400, lr=0.00027369, gnorm=0.215, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.8, wall=52485 epoch 032: 1125 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=461447, ups=1.06, wpb=435419, bsz=16459, num_updates=53400, lr=0.00027369, gnorm=0.215, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.8, wall=52485 epoch 032: 1125 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=461447, ups=1.06, wpb=435419, bsz=16459, num_updates=53400, lr=0.00027369, gnorm=0.215, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.8, wall=52485 epoch 032: 1125 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=461447, ups=1.06, wpb=435419, bsz=16459, num_updates=53400, lr=0.00027369, gnorm=0.215, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.8, wall=52485 epoch 032: 1125 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=461447, ups=1.06, wpb=435419, bsz=16459, num_updates=53400, lr=0.00027369, gnorm=0.215, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.8, wall=52485 epoch 032: 1125 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=461447, ups=1.06, wpb=435419, bsz=16459, num_updates=53400, lr=0.00027369, gnorm=0.215, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.8, wall=52485 epoch 032: 1125 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=461447, ups=1.06, wpb=435419, bsz=16459, num_updates=53400, lr=0.00027369, gnorm=0.215, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.8, wall=52485 epoch 032: 1125 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=461447, ups=1.06, wpb=435419, bsz=16459, num_updates=53400, lr=0.00027369, gnorm=0.215, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.8, wall=52485 epoch 032: 1125 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=461447, ups=1.06, wpb=435419, bsz=16459, num_updates=53400, lr=0.00027369, gnorm=0.215, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.8, wall=52485 epoch 032: 1225 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=457432, ups=1.05, wpb=433720, bsz=16273.3, num_updates=53500, lr=0.000273434, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.4, wall=52580 epoch 032: 1225 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=457432, ups=1.05, wpb=433720, bsz=16273.3, num_updates=53500, lr=0.000273434, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.4, wall=52580 epoch 032: 1225 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=457432, ups=1.05, wpb=433720, bsz=16273.3, num_updates=53500, lr=0.000273434, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.4, wall=52580 epoch 032: 1225 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=457432, ups=1.05, wpb=433720, bsz=16273.3, num_updates=53500, lr=0.000273434, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.4, wall=52580 epoch 032: 1225 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=457432, ups=1.05, wpb=433720, bsz=16273.3, num_updates=53500, lr=0.000273434, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.4, wall=52580 epoch 032: 1225 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=457432, ups=1.05, wpb=433720, bsz=16273.3, num_updates=53500, lr=0.000273434, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.4, wall=52580 epoch 032: 1225 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=457432, ups=1.05, wpb=433720, bsz=16273.3, num_updates=53500, lr=0.000273434, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.4, wall=52580 epoch 032: 1225 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=457432, ups=1.05, wpb=433720, bsz=16273.3, num_updates=53500, lr=0.000273434, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.4, wall=52580 epoch 032: 1225 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=457432, ups=1.05, wpb=433720, bsz=16273.3, num_updates=53500, lr=0.000273434, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.4, wall=52580 epoch 032: 1225 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=457432, ups=1.05, wpb=433720, bsz=16273.3, num_updates=53500, lr=0.000273434, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.4, wall=52580 epoch 032: 1225 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=457432, ups=1.05, wpb=433720, bsz=16273.3, num_updates=53500, lr=0.000273434, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.4, wall=52580 epoch 032: 1225 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=457432, ups=1.05, wpb=433720, bsz=16273.3, num_updates=53500, lr=0.000273434, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.4, wall=52580 epoch 032: 1225 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=457432, ups=1.05, wpb=433720, bsz=16273.3, num_updates=53500, lr=0.000273434, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.4, wall=52580 epoch 032: 1225 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=457432, ups=1.05, wpb=433720, bsz=16273.3, num_updates=53500, lr=0.000273434, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.4, wall=52580 epoch 032: 1225 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=457432, ups=1.05, wpb=433720, bsz=16273.3, num_updates=53500, lr=0.000273434, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.4, wall=52580 epoch 032: 1225 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=457432, ups=1.05, wpb=433720, bsz=16273.3, num_updates=53500, lr=0.000273434, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.4, wall=52580 epoch 032: 1225 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=457432, ups=1.05, wpb=433720, bsz=16273.3, num_updates=53500, lr=0.000273434, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.4, wall=52580 epoch 032: 1225 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=457432, ups=1.05, wpb=433720, bsz=16273.3, num_updates=53500, lr=0.000273434, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.4, wall=52580 epoch 032: 1225 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=457432, ups=1.05, wpb=433720, bsz=16273.3, num_updates=53500, lr=0.000273434, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.4, wall=52580 epoch 032: 1225 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=457432, ups=1.05, wpb=433720, bsz=16273.3, num_updates=53500, lr=0.000273434, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.4, wall=52580 epoch 032: 1225 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=457432, ups=1.05, wpb=433720, bsz=16273.3, num_updates=53500, lr=0.000273434, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.4, wall=52580 epoch 032: 1225 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=457432, ups=1.05, wpb=433720, bsz=16273.3, num_updates=53500, lr=0.000273434, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.4, wall=52580 epoch 032: 1225 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=457432, ups=1.05, wpb=433720, bsz=16273.3, num_updates=53500, lr=0.000273434, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.4, wall=52580 epoch 032: 1225 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=457432, ups=1.05, wpb=433720, bsz=16273.3, num_updates=53500, lr=0.000273434, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.4, wall=52580 epoch 032: 1225 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=457432, ups=1.05, wpb=433720, bsz=16273.3, num_updates=53500, lr=0.000273434, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.4, wall=52580 epoch 032: 1225 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=457432, ups=1.05, wpb=433720, bsz=16273.3, num_updates=53500, lr=0.000273434, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.4, wall=52580 epoch 032: 1225 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=457432, ups=1.05, wpb=433720, bsz=16273.3, num_updates=53500, lr=0.000273434, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.4, wall=52580 epoch 032: 1225 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=457432, ups=1.05, wpb=433720, bsz=16273.3, num_updates=53500, lr=0.000273434, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.4, wall=52580 epoch 032: 1225 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=457432, ups=1.05, wpb=433720, bsz=16273.3, num_updates=53500, lr=0.000273434, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.4, wall=52580 epoch 032: 1225 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=457432, ups=1.05, wpb=433720, bsz=16273.3, num_updates=53500, lr=0.000273434, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.4, wall=52580 epoch 032: 1225 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=457432, ups=1.05, wpb=433720, bsz=16273.3, num_updates=53500, lr=0.000273434, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.4, wall=52580 epoch 032: 1225 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=457432, ups=1.05, wpb=433720, bsz=16273.3, num_updates=53500, lr=0.000273434, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.4, wall=52580 epoch 032: 1325 / 1689 loss=3.952, nll_loss=2.31, ppl=4.96, wps=456856, ups=1.06, wpb=432464, bsz=16180.7, num_updates=53600, lr=0.000273179, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=52675 epoch 032: 1325 / 1689 loss=3.952, nll_loss=2.31, ppl=4.96, wps=456856, ups=1.06, wpb=432464, bsz=16180.7, num_updates=53600, lr=0.000273179, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=52675 epoch 032: 1325 / 1689 loss=3.952, nll_loss=2.31, ppl=4.96, wps=456856, ups=1.06, wpb=432464, bsz=16180.7, num_updates=53600, lr=0.000273179, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=52675 epoch 032: 1325 / 1689 loss=3.952, nll_loss=2.31, ppl=4.96, wps=456856, ups=1.06, wpb=432464, bsz=16180.7, num_updates=53600, lr=0.000273179, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=52675 epoch 032: 1325 / 1689 loss=3.952, nll_loss=2.31, ppl=4.96, wps=456856, ups=1.06, wpb=432464, bsz=16180.7, num_updates=53600, lr=0.000273179, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=52675 epoch 032: 1325 / 1689 loss=3.952, nll_loss=2.31, ppl=4.96, wps=456856, ups=1.06, wpb=432464, bsz=16180.7, num_updates=53600, lr=0.000273179, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=52675 epoch 032: 1325 / 1689 loss=3.952, nll_loss=2.31, ppl=4.96, wps=456856, ups=1.06, wpb=432464, bsz=16180.7, num_updates=53600, lr=0.000273179, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=52675 epoch 032: 1325 / 1689 loss=3.952, nll_loss=2.31, ppl=4.96, wps=456856, ups=1.06, wpb=432464, bsz=16180.7, num_updates=53600, lr=0.000273179, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=52675 epoch 032: 1325 / 1689 loss=3.952, nll_loss=2.31, ppl=4.96, wps=456856, ups=1.06, wpb=432464, bsz=16180.7, num_updates=53600, lr=0.000273179, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=52675 epoch 032: 1325 / 1689 loss=3.952, nll_loss=2.31, ppl=4.96, wps=456856, ups=1.06, wpb=432464, bsz=16180.7, num_updates=53600, lr=0.000273179, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=52675 epoch 032: 1325 / 1689 loss=3.952, nll_loss=2.31, ppl=4.96, wps=456856, ups=1.06, wpb=432464, bsz=16180.7, num_updates=53600, lr=0.000273179, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=52675 epoch 032: 1325 / 1689 loss=3.952, nll_loss=2.31, ppl=4.96, wps=456856, ups=1.06, wpb=432464, bsz=16180.7, num_updates=53600, lr=0.000273179, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=52675 epoch 032: 1325 / 1689 loss=3.952, nll_loss=2.31, ppl=4.96, wps=456856, ups=1.06, wpb=432464, bsz=16180.7, num_updates=53600, lr=0.000273179, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=52675 epoch 032: 1325 / 1689 loss=3.952, nll_loss=2.31, ppl=4.96, wps=456856, ups=1.06, wpb=432464, bsz=16180.7, num_updates=53600, lr=0.000273179, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=52675 epoch 032: 1325 / 1689 loss=3.952, nll_loss=2.31, ppl=4.96, wps=456856, ups=1.06, wpb=432464, bsz=16180.7, num_updates=53600, lr=0.000273179, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=52675 epoch 032: 1325 / 1689 loss=3.952, nll_loss=2.31, ppl=4.96, wps=456856, ups=1.06, wpb=432464, bsz=16180.7, num_updates=53600, lr=0.000273179, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=52675 epoch 032: 1325 / 1689 loss=3.952, nll_loss=2.31, ppl=4.96, wps=456856, ups=1.06, wpb=432464, bsz=16180.7, num_updates=53600, lr=0.000273179, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=52675 epoch 032: 1325 / 1689 loss=3.952, nll_loss=2.31, ppl=4.96, wps=456856, ups=1.06, wpb=432464, bsz=16180.7, num_updates=53600, lr=0.000273179, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=52675 epoch 032: 1325 / 1689 loss=3.952, nll_loss=2.31, ppl=4.96, wps=456856, ups=1.06, wpb=432464, bsz=16180.7, num_updates=53600, lr=0.000273179, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=52675 epoch 032: 1325 / 1689 loss=3.952, nll_loss=2.31, ppl=4.96, wps=456856, ups=1.06, wpb=432464, bsz=16180.7, num_updates=53600, lr=0.000273179, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=52675 epoch 032: 1325 / 1689 loss=3.952, nll_loss=2.31, ppl=4.96, wps=456856, ups=1.06, wpb=432464, bsz=16180.7, num_updates=53600, lr=0.000273179, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=52675 epoch 032: 1325 / 1689 loss=3.952, nll_loss=2.31, ppl=4.96, wps=456856, ups=1.06, wpb=432464, bsz=16180.7, num_updates=53600, lr=0.000273179, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=52675 epoch 032: 1325 / 1689 loss=3.952, nll_loss=2.31, ppl=4.96, wps=456856, ups=1.06, wpb=432464, bsz=16180.7, num_updates=53600, lr=0.000273179, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=52675 epoch 032: 1325 / 1689 loss=3.952, nll_loss=2.31, ppl=4.96, wps=456856, ups=1.06, wpb=432464, bsz=16180.7, num_updates=53600, lr=0.000273179, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=52675 epoch 032: 1325 / 1689 loss=3.952, nll_loss=2.31, ppl=4.96, wps=456856, ups=1.06, wpb=432464, bsz=16180.7, num_updates=53600, lr=0.000273179, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=52675 epoch 032: 1325 / 1689 loss=3.952, nll_loss=2.31, ppl=4.96, wps=456856, ups=1.06, wpb=432464, bsz=16180.7, num_updates=53600, lr=0.000273179, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=52675 epoch 032: 1325 / 1689 loss=3.952, nll_loss=2.31, ppl=4.96, wps=456856, ups=1.06, wpb=432464, bsz=16180.7, num_updates=53600, lr=0.000273179, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=52675 epoch 032: 1325 / 1689 loss=3.952, nll_loss=2.31, ppl=4.96, wps=456856, ups=1.06, wpb=432464, bsz=16180.7, num_updates=53600, lr=0.000273179, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=52675 epoch 032: 1325 / 1689 loss=3.952, nll_loss=2.31, ppl=4.96, wps=456856, ups=1.06, wpb=432464, bsz=16180.7, num_updates=53600, lr=0.000273179, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=52675 epoch 032: 1325 / 1689 loss=3.952, nll_loss=2.31, ppl=4.96, wps=456856, ups=1.06, wpb=432464, bsz=16180.7, num_updates=53600, lr=0.000273179, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=52675 epoch 032: 1325 / 1689 loss=3.952, nll_loss=2.31, ppl=4.96, wps=456856, ups=1.06, wpb=432464, bsz=16180.7, num_updates=53600, lr=0.000273179, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=52675 epoch 032: 1325 / 1689 loss=3.952, nll_loss=2.31, ppl=4.96, wps=456856, ups=1.06, wpb=432464, bsz=16180.7, num_updates=53600, lr=0.000273179, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=52675 epoch 032: 1425 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=458737, ups=1.06, wpb=433299, bsz=16831.4, num_updates=53700, lr=0.000272925, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=52769 epoch 032: 1425 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=458737, ups=1.06, wpb=433299, bsz=16831.4, num_updates=53700, lr=0.000272925, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=52769 epoch 032: 1425 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=458737, ups=1.06, wpb=433299, bsz=16831.4, num_updates=53700, lr=0.000272925, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=52769 epoch 032: 1425 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=458737, ups=1.06, wpb=433299, bsz=16831.4, num_updates=53700, lr=0.000272925, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=52769 epoch 032: 1425 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=458737, ups=1.06, wpb=433299, bsz=16831.4, num_updates=53700, lr=0.000272925, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=52769 epoch 032: 1425 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=458737, ups=1.06, wpb=433299, bsz=16831.4, num_updates=53700, lr=0.000272925, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=52769 epoch 032: 1425 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=458737, ups=1.06, wpb=433299, bsz=16831.4, num_updates=53700, lr=0.000272925, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=52769 epoch 032: 1425 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=458737, ups=1.06, wpb=433299, bsz=16831.4, num_updates=53700, lr=0.000272925, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=52769 epoch 032: 1425 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=458737, ups=1.06, wpb=433299, bsz=16831.4, num_updates=53700, lr=0.000272925, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=52769 epoch 032: 1425 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=458737, ups=1.06, wpb=433299, bsz=16831.4, num_updates=53700, lr=0.000272925, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=52769 epoch 032: 1425 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=458737, ups=1.06, wpb=433299, bsz=16831.4, num_updates=53700, lr=0.000272925, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=52769 epoch 032: 1425 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=458737, ups=1.06, wpb=433299, bsz=16831.4, num_updates=53700, lr=0.000272925, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=52769 epoch 032: 1425 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=458737, ups=1.06, wpb=433299, bsz=16831.4, num_updates=53700, lr=0.000272925, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=52769 epoch 032: 1425 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=458737, ups=1.06, wpb=433299, bsz=16831.4, num_updates=53700, lr=0.000272925, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=52769 epoch 032: 1425 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=458737, ups=1.06, wpb=433299, bsz=16831.4, num_updates=53700, lr=0.000272925, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=52769 epoch 032: 1425 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=458737, ups=1.06, wpb=433299, bsz=16831.4, num_updates=53700, lr=0.000272925, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=52769 epoch 032: 1425 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=458737, ups=1.06, wpb=433299, bsz=16831.4, num_updates=53700, lr=0.000272925, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=52769 epoch 032: 1425 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=458737, ups=1.06, wpb=433299, bsz=16831.4, num_updates=53700, lr=0.000272925, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=52769 epoch 032: 1425 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=458737, ups=1.06, wpb=433299, bsz=16831.4, num_updates=53700, lr=0.000272925, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=52769 epoch 032: 1425 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=458737, ups=1.06, wpb=433299, bsz=16831.4, num_updates=53700, lr=0.000272925, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=52769 epoch 032: 1425 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=458737, ups=1.06, wpb=433299, bsz=16831.4, num_updates=53700, lr=0.000272925, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=52769 epoch 032: 1425 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=458737, ups=1.06, wpb=433299, bsz=16831.4, num_updates=53700, lr=0.000272925, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=52769 epoch 032: 1425 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=458737, ups=1.06, wpb=433299, bsz=16831.4, num_updates=53700, lr=0.000272925, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=52769 epoch 032: 1425 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=458737, ups=1.06, wpb=433299, bsz=16831.4, num_updates=53700, lr=0.000272925, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=52769 epoch 032: 1425 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=458737, ups=1.06, wpb=433299, bsz=16831.4, num_updates=53700, lr=0.000272925, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=52769 epoch 032: 1425 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=458737, ups=1.06, wpb=433299, bsz=16831.4, num_updates=53700, lr=0.000272925, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=52769 epoch 032: 1425 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=458737, ups=1.06, wpb=433299, bsz=16831.4, num_updates=53700, lr=0.000272925, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=52769 epoch 032: 1425 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=458737, ups=1.06, wpb=433299, bsz=16831.4, num_updates=53700, lr=0.000272925, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=52769 epoch 032: 1425 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=458737, ups=1.06, wpb=433299, bsz=16831.4, num_updates=53700, lr=0.000272925, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=52769 epoch 032: 1425 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=458737, ups=1.06, wpb=433299, bsz=16831.4, num_updates=53700, lr=0.000272925, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=52769 epoch 032: 1425 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=458737, ups=1.06, wpb=433299, bsz=16831.4, num_updates=53700, lr=0.000272925, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=52769 epoch 032: 1425 / 1689 loss=3.963, nll_loss=2.322, ppl=5, wps=458737, ups=1.06, wpb=433299, bsz=16831.4, num_updates=53700, lr=0.000272925, gnorm=0.212, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.1, wall=52769 epoch 032: 1525 / 1689 loss=3.964, nll_loss=2.323, ppl=5.01, wps=458275, ups=1.05, wpb=435532, bsz=16668.4, num_updates=53800, lr=0.000272671, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.3, wall=52864 epoch 032: 1525 / 1689 loss=3.964, nll_loss=2.323, ppl=5.01, wps=458275, ups=1.05, wpb=435532, bsz=16668.4, num_updates=53800, lr=0.000272671, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.3, wall=52864 epoch 032: 1525 / 1689 loss=3.964, nll_loss=2.323, ppl=5.01, wps=458275, ups=1.05, wpb=435532, bsz=16668.4, num_updates=53800, lr=0.000272671, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.3, wall=52864 epoch 032: 1525 / 1689 loss=3.964, nll_loss=2.323, ppl=5.01, wps=458275, ups=1.05, wpb=435532, bsz=16668.4, num_updates=53800, lr=0.000272671, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.3, wall=52864 epoch 032: 1525 / 1689 loss=3.964, nll_loss=2.323, ppl=5.01, wps=458275, ups=1.05, wpb=435532, bsz=16668.4, num_updates=53800, lr=0.000272671, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.3, wall=52864 epoch 032: 1525 / 1689 loss=3.964, nll_loss=2.323, ppl=5.01, wps=458275, ups=1.05, wpb=435532, bsz=16668.4, num_updates=53800, lr=0.000272671, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.3, wall=52864 epoch 032: 1525 / 1689 loss=3.964, nll_loss=2.323, ppl=5.01, wps=458275, ups=1.05, wpb=435532, bsz=16668.4, num_updates=53800, lr=0.000272671, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.3, wall=52864 epoch 032: 1525 / 1689 loss=3.964, nll_loss=2.323, ppl=5.01, wps=458275, ups=1.05, wpb=435532, bsz=16668.4, num_updates=53800, lr=0.000272671, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.3, wall=52864 epoch 032: 1525 / 1689 loss=3.964, nll_loss=2.323, ppl=5.01, wps=458275, ups=1.05, wpb=435532, bsz=16668.4, num_updates=53800, lr=0.000272671, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.3, wall=52864 epoch 032: 1525 / 1689 loss=3.964, nll_loss=2.323, ppl=5.01, wps=458275, ups=1.05, wpb=435532, bsz=16668.4, num_updates=53800, lr=0.000272671, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.3, wall=52864 epoch 032: 1525 / 1689 loss=3.964, nll_loss=2.323, ppl=5.01, wps=458275, ups=1.05, wpb=435532, bsz=16668.4, num_updates=53800, lr=0.000272671, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.3, wall=52864 epoch 032: 1525 / 1689 loss=3.964, nll_loss=2.323, ppl=5.01, wps=458275, ups=1.05, wpb=435532, bsz=16668.4, num_updates=53800, lr=0.000272671, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.3, wall=52864 epoch 032: 1525 / 1689 loss=3.964, nll_loss=2.323, ppl=5.01, wps=458275, ups=1.05, wpb=435532, bsz=16668.4, num_updates=53800, lr=0.000272671, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.3, wall=52864 epoch 032: 1525 / 1689 loss=3.964, nll_loss=2.323, ppl=5.01, wps=458275, ups=1.05, wpb=435532, bsz=16668.4, num_updates=53800, lr=0.000272671, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.3, wall=52864 epoch 032: 1525 / 1689 loss=3.964, nll_loss=2.323, ppl=5.01, wps=458275, ups=1.05, wpb=435532, bsz=16668.4, num_updates=53800, lr=0.000272671, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.3, wall=52864 epoch 032: 1525 / 1689 loss=3.964, nll_loss=2.323, ppl=5.01, wps=458275, ups=1.05, wpb=435532, bsz=16668.4, num_updates=53800, lr=0.000272671, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.3, wall=52864 epoch 032: 1525 / 1689 loss=3.964, nll_loss=2.323, ppl=5.01, wps=458275, ups=1.05, wpb=435532, bsz=16668.4, num_updates=53800, lr=0.000272671, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.3, wall=52864 epoch 032: 1525 / 1689 loss=3.964, nll_loss=2.323, ppl=5.01, wps=458275, ups=1.05, wpb=435532, bsz=16668.4, num_updates=53800, lr=0.000272671, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.3, wall=52864 epoch 032: 1525 / 1689 loss=3.964, nll_loss=2.323, ppl=5.01, wps=458275, ups=1.05, wpb=435532, bsz=16668.4, num_updates=53800, lr=0.000272671, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.3, wall=52864 epoch 032: 1525 / 1689 loss=3.964, nll_loss=2.323, ppl=5.01, wps=458275, ups=1.05, wpb=435532, bsz=16668.4, num_updates=53800, lr=0.000272671, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.3, wall=52864 epoch 032: 1525 / 1689 loss=3.964, nll_loss=2.323, ppl=5.01, wps=458275, ups=1.05, wpb=435532, bsz=16668.4, num_updates=53800, lr=0.000272671, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.3, wall=52864 epoch 032: 1525 / 1689 loss=3.964, nll_loss=2.323, ppl=5.01, wps=458275, ups=1.05, wpb=435532, bsz=16668.4, num_updates=53800, lr=0.000272671, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.3, wall=52864 epoch 032: 1525 / 1689 loss=3.964, nll_loss=2.323, ppl=5.01, wps=458275, ups=1.05, wpb=435532, bsz=16668.4, num_updates=53800, lr=0.000272671, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.3, wall=52864 epoch 032: 1525 / 1689 loss=3.964, nll_loss=2.323, ppl=5.01, wps=458275, ups=1.05, wpb=435532, bsz=16668.4, num_updates=53800, lr=0.000272671, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.3, wall=52864 epoch 032: 1525 / 1689 loss=3.964, nll_loss=2.323, ppl=5.01, wps=458275, ups=1.05, wpb=435532, bsz=16668.4, num_updates=53800, lr=0.000272671, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.3, wall=52864 epoch 032: 1525 / 1689 loss=3.964, nll_loss=2.323, ppl=5.01, wps=458275, ups=1.05, wpb=435532, bsz=16668.4, num_updates=53800, lr=0.000272671, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.3, wall=52864 epoch 032: 1525 / 1689 loss=3.964, nll_loss=2.323, ppl=5.01, wps=458275, ups=1.05, wpb=435532, bsz=16668.4, num_updates=53800, lr=0.000272671, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.3, wall=52864 epoch 032: 1525 / 1689 loss=3.964, nll_loss=2.323, ppl=5.01, wps=458275, ups=1.05, wpb=435532, bsz=16668.4, num_updates=53800, lr=0.000272671, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.3, wall=52864 epoch 032: 1525 / 1689 loss=3.964, nll_loss=2.323, ppl=5.01, wps=458275, ups=1.05, wpb=435532, bsz=16668.4, num_updates=53800, lr=0.000272671, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.3, wall=52864 epoch 032: 1525 / 1689 loss=3.964, nll_loss=2.323, ppl=5.01, wps=458275, ups=1.05, wpb=435532, bsz=16668.4, num_updates=53800, lr=0.000272671, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.3, wall=52864 epoch 032: 1525 / 1689 loss=3.964, nll_loss=2.323, ppl=5.01, wps=458275, ups=1.05, wpb=435532, bsz=16668.4, num_updates=53800, lr=0.000272671, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.3, wall=52864 epoch 032: 1525 / 1689 loss=3.964, nll_loss=2.323, ppl=5.01, wps=458275, ups=1.05, wpb=435532, bsz=16668.4, num_updates=53800, lr=0.000272671, gnorm=0.206, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.3, wall=52864 epoch 032: 1625 / 1689 loss=3.949, nll_loss=2.306, ppl=4.95, wps=455758, ups=1.05, wpb=432957, bsz=16568.6, num_updates=53900, lr=0.000272418, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=52959 epoch 032: 1625 / 1689 loss=3.949, nll_loss=2.306, ppl=4.95, wps=455758, ups=1.05, wpb=432957, bsz=16568.6, num_updates=53900, lr=0.000272418, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=52959 epoch 032: 1625 / 1689 loss=3.949, nll_loss=2.306, ppl=4.95, wps=455758, ups=1.05, wpb=432957, bsz=16568.6, num_updates=53900, lr=0.000272418, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=52959 epoch 032: 1625 / 1689 loss=3.949, nll_loss=2.306, ppl=4.95, wps=455758, ups=1.05, wpb=432957, bsz=16568.6, num_updates=53900, lr=0.000272418, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=52959 epoch 032: 1625 / 1689 loss=3.949, nll_loss=2.306, ppl=4.95, wps=455758, ups=1.05, wpb=432957, bsz=16568.6, num_updates=53900, lr=0.000272418, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=52959 epoch 032: 1625 / 1689 loss=3.949, nll_loss=2.306, ppl=4.95, wps=455758, ups=1.05, wpb=432957, bsz=16568.6, num_updates=53900, lr=0.000272418, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=52959 epoch 032: 1625 / 1689 loss=3.949, nll_loss=2.306, ppl=4.95, wps=455758, ups=1.05, wpb=432957, bsz=16568.6, num_updates=53900, lr=0.000272418, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=52959 epoch 032: 1625 / 1689 loss=3.949, nll_loss=2.306, ppl=4.95, wps=455758, ups=1.05, wpb=432957, bsz=16568.6, num_updates=53900, lr=0.000272418, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=52959 epoch 032: 1625 / 1689 loss=3.949, nll_loss=2.306, ppl=4.95, wps=455758, ups=1.05, wpb=432957, bsz=16568.6, num_updates=53900, lr=0.000272418, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=52959 epoch 032: 1625 / 1689 loss=3.949, nll_loss=2.306, ppl=4.95, wps=455758, ups=1.05, wpb=432957, bsz=16568.6, num_updates=53900, lr=0.000272418, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=52959 epoch 032: 1625 / 1689 loss=3.949, nll_loss=2.306, ppl=4.95, wps=455758, ups=1.05, wpb=432957, bsz=16568.6, num_updates=53900, lr=0.000272418, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=52959 epoch 032: 1625 / 1689 loss=3.949, nll_loss=2.306, ppl=4.95, wps=455758, ups=1.05, wpb=432957, bsz=16568.6, num_updates=53900, lr=0.000272418, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=52959 epoch 032: 1625 / 1689 loss=3.949, nll_loss=2.306, ppl=4.95, wps=455758, ups=1.05, wpb=432957, bsz=16568.6, num_updates=53900, lr=0.000272418, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=52959 epoch 032: 1625 / 1689 loss=3.949, nll_loss=2.306, ppl=4.95, wps=455758, ups=1.05, wpb=432957, bsz=16568.6, num_updates=53900, lr=0.000272418, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=52959 epoch 032: 1625 / 1689 loss=3.949, nll_loss=2.306, ppl=4.95, wps=455758, ups=1.05, wpb=432957, bsz=16568.6, num_updates=53900, lr=0.000272418, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=52959 epoch 032: 1625 / 1689 loss=3.949, nll_loss=2.306, ppl=4.95, wps=455758, ups=1.05, wpb=432957, bsz=16568.6, num_updates=53900, lr=0.000272418, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=52959 epoch 032: 1625 / 1689 loss=3.949, nll_loss=2.306, ppl=4.95, wps=455758, ups=1.05, wpb=432957, bsz=16568.6, num_updates=53900, lr=0.000272418, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=52959 epoch 032: 1625 / 1689 loss=3.949, nll_loss=2.306, ppl=4.95, wps=455758, ups=1.05, wpb=432957, bsz=16568.6, num_updates=53900, lr=0.000272418, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=52959 epoch 032: 1625 / 1689 loss=3.949, nll_loss=2.306, ppl=4.95, wps=455758, ups=1.05, wpb=432957, bsz=16568.6, num_updates=53900, lr=0.000272418, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=52959 epoch 032: 1625 / 1689 loss=3.949, nll_loss=2.306, ppl=4.95, wps=455758, ups=1.05, wpb=432957, bsz=16568.6, num_updates=53900, lr=0.000272418, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=52959 epoch 032: 1625 / 1689 loss=3.949, nll_loss=2.306, ppl=4.95, wps=455758, ups=1.05, wpb=432957, bsz=16568.6, num_updates=53900, lr=0.000272418, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=52959 epoch 032: 1625 / 1689 loss=3.949, nll_loss=2.306, ppl=4.95, wps=455758, ups=1.05, wpb=432957, bsz=16568.6, num_updates=53900, lr=0.000272418, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=52959 epoch 032: 1625 / 1689 loss=3.949, nll_loss=2.306, ppl=4.95, wps=455758, ups=1.05, wpb=432957, bsz=16568.6, num_updates=53900, lr=0.000272418, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=52959 epoch 032: 1625 / 1689 loss=3.949, nll_loss=2.306, ppl=4.95, wps=455758, ups=1.05, wpb=432957, bsz=16568.6, num_updates=53900, lr=0.000272418, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=52959 epoch 032: 1625 / 1689 loss=3.949, nll_loss=2.306, ppl=4.95, wps=455758, ups=1.05, wpb=432957, bsz=16568.6, num_updates=53900, lr=0.000272418, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=52959 epoch 032: 1625 / 1689 loss=3.949, nll_loss=2.306, ppl=4.95, wps=455758, ups=1.05, wpb=432957, bsz=16568.6, num_updates=53900, lr=0.000272418, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=52959 epoch 032: 1625 / 1689 loss=3.949, nll_loss=2.306, ppl=4.95, wps=455758, ups=1.05, wpb=432957, bsz=16568.6, num_updates=53900, lr=0.000272418, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=52959 epoch 032: 1625 / 1689 loss=3.949, nll_loss=2.306, ppl=4.95, wps=455758, ups=1.05, wpb=432957, bsz=16568.6, num_updates=53900, lr=0.000272418, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=52959 epoch 032: 1625 / 1689 loss=3.949, nll_loss=2.306, ppl=4.95, wps=455758, ups=1.05, wpb=432957, bsz=16568.6, num_updates=53900, lr=0.000272418, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=52959 epoch 032: 1625 / 1689 loss=3.949, nll_loss=2.306, ppl=4.95, wps=455758, ups=1.05, wpb=432957, bsz=16568.6, num_updates=53900, lr=0.000272418, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=52959 epoch 032: 1625 / 1689 loss=3.949, nll_loss=2.306, ppl=4.95, wps=455758, ups=1.05, wpb=432957, bsz=16568.6, num_updates=53900, lr=0.000272418, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=52959 epoch 032: 1625 / 1689 loss=3.949, nll_loss=2.306, ppl=4.95, wps=455758, ups=1.05, wpb=432957, bsz=16568.6, num_updates=53900, lr=0.000272418, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.5, wall=52959 end of epoch 32 (average epoch stats below) epoch 032 | loss 3.953 | nll_loss 2.31 | ppl 4.96 | wps 454655 | ups 1.05 | wpb 433528 | bsz 16506.6 | num_updates 53964 | lr 0.000272256 | gnorm 0.219 | clip 0.1 | loss_scale 0.5 | train_wall 1568 | gb_free 22.2 | wall 53019 epoch 032 | loss 3.953 | nll_loss 2.31 | ppl 4.96 | wps 454655 | ups 1.05 | wpb 433528 | bsz 16506.6 | num_updates 53964 | lr 0.000272256 | gnorm 0.219 | clip 0.1 | loss_scale 0.5 | train_wall 1568 | gb_free 22.2 | wall 53019 epoch 032 | loss 3.953 | nll_loss 2.31 | ppl 4.96 | wps 454655 | ups 1.05 | wpb 433528 | bsz 16506.6 | num_updates 53964 | lr 0.000272256 | gnorm 0.219 | clip 0.1 | loss_scale 0.5 | train_wall 1568 | gb_free 22.2 | wall 53019 epoch 032 | loss 3.953 | nll_loss 2.31 | ppl 4.96 | wps 454655 | ups 1.05 | wpb 433528 | bsz 16506.6 | num_updates 53964 | lr 0.000272256 | gnorm 0.219 | clip 0.1 | loss_scale 0.5 | train_wall 1568 | gb_free 22.2 | wall 53019 epoch 032 | loss 3.953 | nll_loss 2.31 | ppl 4.96 | wps 454655 | ups 1.05 | wpb 433528 | bsz 16506.6 | num_updates 53964 | lr 0.000272256 | gnorm 0.219 | clip 0.1 | loss_scale 0.5 | train_wall 1568 | gb_free 22.2 | wall 53019 epoch 032 | loss 3.953 | nll_loss 2.31 | ppl 4.96 | wps 454655 | ups 1.05 | wpb 433528 | bsz 16506.6 | num_updates 53964 | lr 0.000272256 | gnorm 0.219 | clip 0.1 | loss_scale 0.5 | train_wall 1568 | gb_free 22.2 | wall 53019 epoch 032 | loss 3.953 | nll_loss 2.31 | ppl 4.96 | wps 454655 | ups 1.05 | wpb 433528 | bsz 16506.6 | num_updates 53964 | lr 0.000272256 | gnorm 0.219 | clip 0.1 | loss_scale 0.5 | train_wall 1568 | gb_free 22.2 | wall 53019 epoch 032 | loss 3.953 | nll_loss 2.31 | ppl 4.96 | wps 454655 | ups 1.05 | wpb 433528 | bsz 16506.6 | num_updates 53964 | lr 0.000272256 | gnorm 0.219 | clip 0.1 | loss_scale 0.5 | train_wall 1568 | gb_free 22.2 | wall 53019 epoch 032 | loss 3.953 | nll_loss 2.31 | ppl 4.96 | wps 454655 | ups 1.05 | wpb 433528 | bsz 16506.6 | num_updates 53964 | lr 0.000272256 | gnorm 0.219 | clip 0.1 | loss_scale 0.5 | train_wall 1568 | gb_free 22.2 | wall 53019 epoch 032 | loss 3.953 | nll_loss 2.31 | ppl 4.96 | wps 454655 | ups 1.05 | wpb 433528 | bsz 16506.6 | num_updates 53964 | lr 0.000272256 | gnorm 0.219 | clip 0.1 | loss_scale 0.5 | train_wall 1568 | gb_free 22.2 | wall 53019 epoch 032 | loss 3.953 | nll_loss 2.31 | ppl 4.96 | wps 454655 | ups 1.05 | wpb 433528 | bsz 16506.6 | num_updates 53964 | lr 0.000272256 | gnorm 0.219 | clip 0.1 | loss_scale 0.5 | train_wall 1568 | gb_free 22.2 | wall 53019 epoch 032 | loss 3.953 | nll_loss 2.31 | ppl 4.96 | wps 454655 | ups 1.05 | wpb 433528 | bsz 16506.6 | num_updates 53964 | lr 0.000272256 | gnorm 0.219 | clip 0.1 | loss_scale 0.5 | train_wall 1568 | gb_free 22.2 | wall 53019 epoch 032 | loss 3.953 | nll_loss 2.31 | ppl 4.96 | wps 454655 | ups 1.05 | wpb 433528 | bsz 16506.6 | num_updates 53964 | lr 0.000272256 | gnorm 0.219 | clip 0.1 | loss_scale 0.5 | train_wall 1568 | gb_free 22.2 | wall 53019 epoch 032 | loss 3.953 | nll_loss 2.31 | ppl 4.96 | wps 454655 | ups 1.05 | wpb 433528 | bsz 16506.6 | num_updates 53964 | lr 0.000272256 | gnorm 0.219 | clip 0.1 | loss_scale 0.5 | train_wall 1568 | gb_free 22.2 | wall 53019 epoch 032 | loss 3.953 | nll_loss 2.31 | ppl 4.96 | wps 454655 | ups 1.05 | wpb 433528 | bsz 16506.6 | num_updates 53964 | lr 0.000272256 | gnorm 0.219 | clip 0.1 | loss_scale 0.5 | train_wall 1568 | gb_free 22.2 | wall 53019 epoch 032 | loss 3.953 | nll_loss 2.31 | ppl 4.96 | wps 454655 | ups 1.05 | wpb 433528 | bsz 16506.6 | num_updates 53964 | lr 0.000272256 | gnorm 0.219 | clip 0.1 | loss_scale 0.5 | train_wall 1568 | gb_free 22.2 | wall 53019 epoch 032 | loss 3.953 | nll_loss 2.31 | ppl 4.96 | wps 454655 | ups 1.05 | wpb 433528 | bsz 16506.6 | num_updates 53964 | lr 0.000272256 | gnorm 0.219 | clip 0.1 | loss_scale 0.5 | train_wall 1568 | gb_free 22.2 | wall 53019 epoch 032 | loss 3.953 | nll_loss 2.31 | ppl 4.96 | wps 454655 | ups 1.05 | wpb 433528 | bsz 16506.6 | num_updates 53964 | lr 0.000272256 | gnorm 0.219 | clip 0.1 | loss_scale 0.5 | train_wall 1568 | gb_free 22.2 | wall 53019 epoch 032 | loss 3.953 | nll_loss 2.31 | ppl 4.96 | wps 454655 | ups 1.05 | wpb 433528 | bsz 16506.6 | num_updates 53964 | lr 0.000272256 | gnorm 0.219 | clip 0.1 | loss_scale 0.5 | train_wall 1568 | gb_free 22.2 | wall 53019 epoch 032 | loss 3.953 | nll_loss 2.31 | ppl 4.96 | wps 454655 | ups 1.05 | wpb 433528 | bsz 16506.6 | num_updates 53964 | lr 0.000272256 | gnorm 0.219 | clip 0.1 | loss_scale 0.5 | train_wall 1568 | gb_free 22.2 | wall 53019 epoch 032 | loss 3.953 | nll_loss 2.31 | ppl 4.96 | wps 454655 | ups 1.05 | wpb 433528 | bsz 16506.6 | num_updates 53964 | lr 0.000272256 | gnorm 0.219 | clip 0.1 | loss_scale 0.5 | train_wall 1568 | gb_free 22.2 | wall 53019 epoch 032 | loss 3.953 | nll_loss 2.31 | ppl 4.96 | wps 454655 | ups 1.05 | wpb 433528 | bsz 16506.6 | num_updates 53964 | lr 0.000272256 | gnorm 0.219 | clip 0.1 | loss_scale 0.5 | train_wall 1568 | gb_free 22.2 | wall 53019 epoch 032 | loss 3.953 | nll_loss 2.31 | ppl 4.96 | wps 454655 | ups 1.05 | wpb 433528 | bsz 16506.6 | num_updates 53964 | lr 0.000272256 | gnorm 0.219 | clip 0.1 | loss_scale 0.5 | train_wall 1568 | gb_free 22.2 | wall 53019 epoch 032 | loss 3.953 | nll_loss 2.31 | ppl 4.96 | wps 454655 | ups 1.05 | wpb 433528 | bsz 16506.6 | num_updates 53964 | lr 0.000272256 | gnorm 0.219 | clip 0.1 | loss_scale 0.5 | train_wall 1568 | gb_free 22.2 | wall 53019 epoch 032 | loss 3.953 | nll_loss 2.31 | ppl 4.96 | wps 454655 | ups 1.05 | wpb 433528 | bsz 16506.6 | num_updates 53964 | lr 0.000272256 | gnorm 0.219 | clip 0.1 | loss_scale 0.5 | train_wall 1568 | gb_free 22.2 | wall 53019 epoch 032 | loss 3.953 | nll_loss 2.31 | ppl 4.96 | wps 454655 | ups 1.05 | wpb 433528 | bsz 16506.6 | num_updates 53964 | lr 0.000272256 | gnorm 0.219 | clip 0.1 | loss_scale 0.5 | train_wall 1568 | gb_free 22.2 | wall 53019 epoch 032 | loss 3.953 | nll_loss 2.31 | ppl 4.96 | wps 454655 | ups 1.05 | wpb 433528 | bsz 16506.6 | num_updates 53964 | lr 0.000272256 | gnorm 0.219 | clip 0.1 | loss_scale 0.5 | train_wall 1568 | gb_free 22.2 | wall 53019 epoch 032 | loss 3.953 | nll_loss 2.31 | ppl 4.96 | wps 454655 | ups 1.05 | wpb 433528 | bsz 16506.6 | num_updates 53964 | lr 0.000272256 | gnorm 0.219 | clip 0.1 | loss_scale 0.5 | train_wall 1568 | gb_free 22.2 | wall 53019 epoch 032 | loss 3.953 | nll_loss 2.31 | ppl 4.96 | wps 454655 | ups 1.05 | wpb 433528 | bsz 16506.6 | num_updates 53964 | lr 0.000272256 | gnorm 0.219 | clip 0.1 | loss_scale 0.5 | train_wall 1568 | gb_free 22.2 | wall 53019 epoch 032 | loss 3.953 | nll_loss 2.31 | ppl 4.96 | wps 454655 | ups 1.05 | wpb 433528 | bsz 16506.6 | num_updates 53964 | lr 0.000272256 | gnorm 0.219 | clip 0.1 | loss_scale 0.5 | train_wall 1568 | gb_free 22.2 | wall 53019 epoch 032 | loss 3.953 | nll_loss 2.31 | ppl 4.96 | wps 454655 | ups 1.05 | wpb 433528 | bsz 16506.6 | num_updates 53964 | lr 0.000272256 | gnorm 0.219 | clip 0.1 | loss_scale 0.5 | train_wall 1568 | gb_free 22.2 | wall 53019 epoch 032 | loss 3.953 | nll_loss 2.31 | ppl 4.96 | wps 454655 | ups 1.05 | wpb 433528 | bsz 16506.6 | num_updates 53964 | lr 0.000272256 | gnorm 0.219 | clip 0.1 | loss_scale 0.5 | train_wall 1568 | gb_free 22.2 | wall 53019 Start iterating over samples epoch 033: 36 / 1689 loss=3.966, nll_loss=2.325, ppl=5.01, wps=458611, ups=1.06, wpb=431242, bsz=16440, num_updates=54000, lr=0.000272166, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=53053 epoch 033: 36 / 1689 loss=3.966, nll_loss=2.325, ppl=5.01, wps=458611, ups=1.06, wpb=431242, bsz=16440, num_updates=54000, lr=0.000272166, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=53053 epoch 033: 36 / 1689 loss=3.966, nll_loss=2.325, ppl=5.01, wps=458611, ups=1.06, wpb=431242, bsz=16440, num_updates=54000, lr=0.000272166, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=53053 epoch 033: 36 / 1689 loss=3.966, nll_loss=2.325, ppl=5.01, wps=458611, ups=1.06, wpb=431242, bsz=16440, num_updates=54000, lr=0.000272166, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=53053 epoch 033: 36 / 1689 loss=3.966, nll_loss=2.325, ppl=5.01, wps=458611, ups=1.06, wpb=431242, bsz=16440, num_updates=54000, lr=0.000272166, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=53053 epoch 033: 36 / 1689 loss=3.966, nll_loss=2.325, ppl=5.01, wps=458611, ups=1.06, wpb=431242, bsz=16440, num_updates=54000, lr=0.000272166, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=53053 epoch 033: 36 / 1689 loss=3.966, nll_loss=2.325, ppl=5.01, wps=458611, ups=1.06, wpb=431242, bsz=16440, num_updates=54000, lr=0.000272166, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=53053 epoch 033: 36 / 1689 loss=3.966, nll_loss=2.325, ppl=5.01, wps=458611, ups=1.06, wpb=431242, bsz=16440, num_updates=54000, lr=0.000272166, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=53053 epoch 033: 36 / 1689 loss=3.966, nll_loss=2.325, ppl=5.01, wps=458611, ups=1.06, wpb=431242, bsz=16440, num_updates=54000, lr=0.000272166, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=53053 epoch 033: 36 / 1689 loss=3.966, nll_loss=2.325, ppl=5.01, wps=458611, ups=1.06, wpb=431242, bsz=16440, num_updates=54000, lr=0.000272166, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=53053 epoch 033: 36 / 1689 loss=3.966, nll_loss=2.325, ppl=5.01, wps=458611, ups=1.06, wpb=431242, bsz=16440, num_updates=54000, lr=0.000272166, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=53053 epoch 033: 36 / 1689 loss=3.966, nll_loss=2.325, ppl=5.01, wps=458611, ups=1.06, wpb=431242, bsz=16440, num_updates=54000, lr=0.000272166, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=53053 epoch 033: 36 / 1689 loss=3.966, nll_loss=2.325, ppl=5.01, wps=458611, ups=1.06, wpb=431242, bsz=16440, num_updates=54000, lr=0.000272166, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=53053 epoch 033: 36 / 1689 loss=3.966, nll_loss=2.325, ppl=5.01, wps=458611, ups=1.06, wpb=431242, bsz=16440, num_updates=54000, lr=0.000272166, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=53053 epoch 033: 36 / 1689 loss=3.966, nll_loss=2.325, ppl=5.01, wps=458611, ups=1.06, wpb=431242, bsz=16440, num_updates=54000, lr=0.000272166, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=53053 epoch 033: 36 / 1689 loss=3.966, nll_loss=2.325, ppl=5.01, wps=458611, ups=1.06, wpb=431242, bsz=16440, num_updates=54000, lr=0.000272166, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=53053 epoch 033: 36 / 1689 loss=3.966, nll_loss=2.325, ppl=5.01, wps=458611, ups=1.06, wpb=431242, bsz=16440, num_updates=54000, lr=0.000272166, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=53053 epoch 033: 36 / 1689 loss=3.966, nll_loss=2.325, ppl=5.01, wps=458611, ups=1.06, wpb=431242, bsz=16440, num_updates=54000, lr=0.000272166, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=53053 epoch 033: 36 / 1689 loss=3.966, nll_loss=2.325, ppl=5.01, wps=458611, ups=1.06, wpb=431242, bsz=16440, num_updates=54000, lr=0.000272166, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=53053 epoch 033: 36 / 1689 loss=3.966, nll_loss=2.325, ppl=5.01, wps=458611, ups=1.06, wpb=431242, bsz=16440, num_updates=54000, lr=0.000272166, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=53053 epoch 033: 36 / 1689 loss=3.966, nll_loss=2.325, ppl=5.01, wps=458611, ups=1.06, wpb=431242, bsz=16440, num_updates=54000, lr=0.000272166, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=53053 epoch 033: 36 / 1689 loss=3.966, nll_loss=2.325, ppl=5.01, wps=458611, ups=1.06, wpb=431242, bsz=16440, num_updates=54000, lr=0.000272166, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=53053 epoch 033: 36 / 1689 loss=3.966, nll_loss=2.325, ppl=5.01, wps=458611, ups=1.06, wpb=431242, bsz=16440, num_updates=54000, lr=0.000272166, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=53053 epoch 033: 36 / 1689 loss=3.966, nll_loss=2.325, ppl=5.01, wps=458611, ups=1.06, wpb=431242, bsz=16440, num_updates=54000, lr=0.000272166, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=53053 epoch 033: 36 / 1689 loss=3.966, nll_loss=2.325, ppl=5.01, wps=458611, ups=1.06, wpb=431242, bsz=16440, num_updates=54000, lr=0.000272166, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=53053 epoch 033: 36 / 1689 loss=3.966, nll_loss=2.325, ppl=5.01, wps=458611, ups=1.06, wpb=431242, bsz=16440, num_updates=54000, lr=0.000272166, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=53053 epoch 033: 36 / 1689 loss=3.966, nll_loss=2.325, ppl=5.01, wps=458611, ups=1.06, wpb=431242, bsz=16440, num_updates=54000, lr=0.000272166, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=53053 epoch 033: 36 / 1689 loss=3.966, nll_loss=2.325, ppl=5.01, wps=458611, ups=1.06, wpb=431242, bsz=16440, num_updates=54000, lr=0.000272166, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=53053 epoch 033: 36 / 1689 loss=3.966, nll_loss=2.325, ppl=5.01, wps=458611, ups=1.06, wpb=431242, bsz=16440, num_updates=54000, lr=0.000272166, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=53053 epoch 033: 36 / 1689 loss=3.966, nll_loss=2.325, ppl=5.01, wps=458611, ups=1.06, wpb=431242, bsz=16440, num_updates=54000, lr=0.000272166, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=53053 epoch 033: 36 / 1689 loss=3.966, nll_loss=2.325, ppl=5.01, wps=458611, ups=1.06, wpb=431242, bsz=16440, num_updates=54000, lr=0.000272166, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=53053 epoch 033: 36 / 1689 loss=3.966, nll_loss=2.325, ppl=5.01, wps=458611, ups=1.06, wpb=431242, bsz=16440, num_updates=54000, lr=0.000272166, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=53053 epoch 033: 36 / 1689 loss=3.966, nll_loss=2.325, ppl=5.01, wps=458611, ups=1.06, wpb=431242, bsz=16440, num_updates=54000, lr=0.000272166, gnorm=0.219, clip=0, loss_scale=1, train_wall=92, gb_free=19.9, wall=53053 begin validation on "valid" subset epoch 033 | valid on 'valid' subset | loss 4.239 | nll_loss 2.618 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.238 epoch 033 | valid on 'valid' subset | loss 4.239 | nll_loss 2.618 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.238 epoch 033 | valid on 'valid' subset | loss 4.239 | nll_loss 2.618 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.238 epoch 033 | valid on 'valid' subset | loss 4.239 | nll_loss 2.618 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.238 epoch 033 | valid on 'valid' subset | loss 4.239 | nll_loss 2.618 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.238 epoch 033 | valid on 'valid' subset | loss 4.239 | nll_loss 2.618 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.238 epoch 033 | valid on 'valid' subset | loss 4.239 | nll_loss 2.618 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.238 epoch 033 | valid on 'valid' subset | loss 4.239 | nll_loss 2.618 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.238 epoch 033 | valid on 'valid' subset | loss 4.239 | nll_loss 2.618 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.238 epoch 033 | valid on 'valid' subset | loss 4.239 | nll_loss 2.618 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.238 epoch 033 | valid on 'valid' subset | loss 4.239 | nll_loss 2.618 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.238 epoch 033 | valid on 'valid' subset | loss 4.239 | nll_loss 2.618 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.238 epoch 033 | valid on 'valid' subset | loss 4.239 | nll_loss 2.618 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.238 epoch 033 | valid on 'valid' subset | loss 4.239 | nll_loss 2.618 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.238 epoch 033 | valid on 'valid' subset | loss 4.239 | nll_loss 2.618 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.238 epoch 033 | valid on 'valid' subset | loss 4.239 | nll_loss 2.618 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.238 epoch 033 | valid on 'valid' subset | loss 4.239 | nll_loss 2.618 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.238 epoch 033 | valid on 'valid' subset | loss 4.239 | nll_loss 2.618 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.238 epoch 033 | valid on 'valid' subset | loss 4.239 | nll_loss 2.618 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.238 epoch 033 | valid on 'valid' subset | loss 4.239 | nll_loss 2.618 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.238 epoch 033 | valid on 'valid' subset | loss 4.239 | nll_loss 2.618 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.238 epoch 033 | valid on 'valid' subset | loss 4.239 | nll_loss 2.618 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.238 epoch 033 | valid on 'valid' subset | loss 4.239 | nll_loss 2.618 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.238 epoch 033 | valid on 'valid' subset | loss 4.239 | nll_loss 2.618 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.238 epoch 033 | valid on 'valid' subset | loss 4.239 | nll_loss 2.618 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.238 epoch 033 | valid on 'valid' subset | loss 4.239 | nll_loss 2.618 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.238 epoch 033 | valid on 'valid' subset | loss 4.239 | nll_loss 2.618 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.238 epoch 033 | valid on 'valid' subset | loss 4.239 | nll_loss 2.618 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.238 epoch 033 | valid on 'valid' subset | loss 4.239 | nll_loss 2.618 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.238 epoch 033 | valid on 'valid' subset | loss 4.239 | nll_loss 2.618 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.238 epoch 033 | valid on 'valid' subset | loss 4.239 | nll_loss 2.618 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.238 epoch 033 | valid on 'valid' subset | loss 4.239 | nll_loss 2.618 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.238 epoch 033 | valid on 'valid' subset | loss 4.239 | nll_loss 2.618 | ppl 6.14 | wps 0 | wpb 42662 | bsz 2032 | num_updates 54000 | best_loss 4.238 epoch 033: 137 / 1689 loss=3.923, nll_loss=2.276, ppl=4.84, wps=364873, ups=0.84, wpb=432589, bsz=16496.8, num_updates=54100, lr=0.000271914, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=100, gb_free=19.2, wall=53172 epoch 033: 137 / 1689 loss=3.923, nll_loss=2.276, ppl=4.84, wps=364873, ups=0.84, wpb=432589, bsz=16496.8, num_updates=54100, lr=0.000271914, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=100, gb_free=19.2, wall=53172 epoch 033: 137 / 1689 loss=3.923, nll_loss=2.276, ppl=4.84, wps=364873, ups=0.84, wpb=432589, bsz=16496.8, num_updates=54100, lr=0.000271914, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=100, gb_free=19.2, wall=53172 epoch 033: 137 / 1689 loss=3.923, nll_loss=2.276, ppl=4.84, wps=364873, ups=0.84, wpb=432589, bsz=16496.8, num_updates=54100, lr=0.000271914, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=100, gb_free=19.2, wall=53172 epoch 033: 137 / 1689 loss=3.923, nll_loss=2.276, ppl=4.84, wps=364873, ups=0.84, wpb=432589, bsz=16496.8, num_updates=54100, lr=0.000271914, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=100, gb_free=19.2, wall=53172 epoch 033: 137 / 1689 loss=3.923, nll_loss=2.276, ppl=4.84, wps=364873, ups=0.84, wpb=432589, bsz=16496.8, num_updates=54100, lr=0.000271914, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=100, gb_free=19.2, wall=53172 epoch 033: 137 / 1689 loss=3.923, nll_loss=2.276, ppl=4.84, wps=364873, ups=0.84, wpb=432589, bsz=16496.8, num_updates=54100, lr=0.000271914, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=100, gb_free=19.2, wall=53172 epoch 033: 137 / 1689 loss=3.923, nll_loss=2.276, ppl=4.84, wps=364873, ups=0.84, wpb=432589, bsz=16496.8, num_updates=54100, lr=0.000271914, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=100, gb_free=19.2, wall=53172 epoch 033: 137 / 1689 loss=3.923, nll_loss=2.276, ppl=4.84, wps=364873, ups=0.84, wpb=432589, bsz=16496.8, num_updates=54100, lr=0.000271914, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=100, gb_free=19.2, wall=53172 epoch 033: 137 / 1689 loss=3.923, nll_loss=2.276, ppl=4.84, wps=364873, ups=0.84, wpb=432589, bsz=16496.8, num_updates=54100, lr=0.000271914, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=100, gb_free=19.2, wall=53172 epoch 033: 137 / 1689 loss=3.923, nll_loss=2.276, ppl=4.84, wps=364873, ups=0.84, wpb=432589, bsz=16496.8, num_updates=54100, lr=0.000271914, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=100, gb_free=19.2, wall=53172 epoch 033: 137 / 1689 loss=3.923, nll_loss=2.276, ppl=4.84, wps=364873, ups=0.84, wpb=432589, bsz=16496.8, num_updates=54100, lr=0.000271914, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=100, gb_free=19.2, wall=53172 epoch 033: 137 / 1689 loss=3.923, nll_loss=2.276, ppl=4.84, wps=364873, ups=0.84, wpb=432589, bsz=16496.8, num_updates=54100, lr=0.000271914, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=100, gb_free=19.2, wall=53172 epoch 033: 137 / 1689 loss=3.923, nll_loss=2.276, ppl=4.84, wps=364873, ups=0.84, wpb=432589, bsz=16496.8, num_updates=54100, lr=0.000271914, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=100, gb_free=19.2, wall=53172 epoch 033: 137 / 1689 loss=3.923, nll_loss=2.276, ppl=4.84, wps=364873, ups=0.84, wpb=432589, bsz=16496.8, num_updates=54100, lr=0.000271914, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=100, gb_free=19.2, wall=53172 epoch 033: 137 / 1689 loss=3.923, nll_loss=2.276, ppl=4.84, wps=364873, ups=0.84, wpb=432589, bsz=16496.8, num_updates=54100, lr=0.000271914, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=100, gb_free=19.2, wall=53172 epoch 033: 137 / 1689 loss=3.923, nll_loss=2.276, ppl=4.84, wps=364873, ups=0.84, wpb=432589, bsz=16496.8, num_updates=54100, lr=0.000271914, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=100, gb_free=19.2, wall=53172 epoch 033: 137 / 1689 loss=3.923, nll_loss=2.276, ppl=4.84, wps=364873, ups=0.84, wpb=432589, bsz=16496.8, num_updates=54100, lr=0.000271914, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=100, gb_free=19.2, wall=53172 epoch 033: 137 / 1689 loss=3.923, nll_loss=2.276, ppl=4.84, wps=364873, ups=0.84, wpb=432589, bsz=16496.8, num_updates=54100, lr=0.000271914, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=100, gb_free=19.2, wall=53172 epoch 033: 137 / 1689 loss=3.923, nll_loss=2.276, ppl=4.84, wps=364873, ups=0.84, wpb=432589, bsz=16496.8, num_updates=54100, lr=0.000271914, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=100, gb_free=19.2, wall=53172 epoch 033: 137 / 1689 loss=3.923, nll_loss=2.276, ppl=4.84, wps=364873, ups=0.84, wpb=432589, bsz=16496.8, num_updates=54100, lr=0.000271914, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=100, gb_free=19.2, wall=53172 epoch 033: 137 / 1689 loss=3.923, nll_loss=2.276, ppl=4.84, wps=364873, ups=0.84, wpb=432589, bsz=16496.8, num_updates=54100, lr=0.000271914, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=100, gb_free=19.2, wall=53172 epoch 033: 137 / 1689 loss=3.923, nll_loss=2.276, ppl=4.84, wps=364873, ups=0.84, wpb=432589, bsz=16496.8, num_updates=54100, lr=0.000271914, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=100, gb_free=19.2, wall=53172 epoch 033: 137 / 1689 loss=3.923, nll_loss=2.276, ppl=4.84, wps=364873, ups=0.84, wpb=432589, bsz=16496.8, num_updates=54100, lr=0.000271914, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=100, gb_free=19.2, wall=53172 epoch 033: 137 / 1689 loss=3.923, nll_loss=2.276, ppl=4.84, wps=364873, ups=0.84, wpb=432589, bsz=16496.8, num_updates=54100, lr=0.000271914, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=100, gb_free=19.2, wall=53172 epoch 033: 137 / 1689 loss=3.923, nll_loss=2.276, ppl=4.84, wps=364873, ups=0.84, wpb=432589, bsz=16496.8, num_updates=54100, lr=0.000271914, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=100, gb_free=19.2, wall=53172 epoch 033: 137 / 1689 loss=3.923, nll_loss=2.276, ppl=4.84, wps=364873, ups=0.84, wpb=432589, bsz=16496.8, num_updates=54100, lr=0.000271914, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=100, gb_free=19.2, wall=53172 epoch 033: 137 / 1689 loss=3.923, nll_loss=2.276, ppl=4.84, wps=364873, ups=0.84, wpb=432589, bsz=16496.8, num_updates=54100, lr=0.000271914, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=100, gb_free=19.2, wall=53172 epoch 033: 137 / 1689 loss=3.923, nll_loss=2.276, ppl=4.84, wps=364873, ups=0.84, wpb=432589, bsz=16496.8, num_updates=54100, lr=0.000271914, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=100, gb_free=19.2, wall=53172 epoch 033: 137 / 1689 loss=3.923, nll_loss=2.276, ppl=4.84, wps=364873, ups=0.84, wpb=432589, bsz=16496.8, num_updates=54100, lr=0.000271914, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=100, gb_free=19.2, wall=53172 epoch 033: 137 / 1689 loss=3.923, nll_loss=2.276, ppl=4.84, wps=364873, ups=0.84, wpb=432589, bsz=16496.8, num_updates=54100, lr=0.000271914, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=100, gb_free=19.2, wall=53172 epoch 033: 137 / 1689 loss=3.923, nll_loss=2.276, ppl=4.84, wps=364873, ups=0.84, wpb=432589, bsz=16496.8, num_updates=54100, lr=0.000271914, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=100, gb_free=19.2, wall=53172 epoch 033: 137 / 1689 loss=3.923, nll_loss=2.276, ppl=4.84, wps=364873, ups=0.84, wpb=432589, bsz=16496.8, num_updates=54100, lr=0.000271914, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=100, gb_free=19.2, wall=53172 epoch 033: 237 / 1689 loss=3.939, nll_loss=2.294, ppl=4.9, wps=464920, ups=1.07, wpb=432881, bsz=16833, num_updates=54200, lr=0.000271663, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=53265 epoch 033: 237 / 1689 loss=3.939, nll_loss=2.294, ppl=4.9, wps=464920, ups=1.07, wpb=432881, bsz=16833, num_updates=54200, lr=0.000271663, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=53265 epoch 033: 237 / 1689 loss=3.939, nll_loss=2.294, ppl=4.9, wps=464920, ups=1.07, wpb=432881, bsz=16833, num_updates=54200, lr=0.000271663, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=53265 epoch 033: 237 / 1689 loss=3.939, nll_loss=2.294, ppl=4.9, wps=464920, ups=1.07, wpb=432881, bsz=16833, num_updates=54200, lr=0.000271663, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=53265 epoch 033: 237 / 1689 loss=3.939, nll_loss=2.294, ppl=4.9, wps=464920, ups=1.07, wpb=432881, bsz=16833, num_updates=54200, lr=0.000271663, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=53265 epoch 033: 237 / 1689 loss=3.939, nll_loss=2.294, ppl=4.9, wps=464920, ups=1.07, wpb=432881, bsz=16833, num_updates=54200, lr=0.000271663, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=53265 epoch 033: 237 / 1689 loss=3.939, nll_loss=2.294, ppl=4.9, wps=464920, ups=1.07, wpb=432881, bsz=16833, num_updates=54200, lr=0.000271663, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=53265 epoch 033: 237 / 1689 loss=3.939, nll_loss=2.294, ppl=4.9, wps=464920, ups=1.07, wpb=432881, bsz=16833, num_updates=54200, lr=0.000271663, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=53265 epoch 033: 237 / 1689 loss=3.939, nll_loss=2.294, ppl=4.9, wps=464920, ups=1.07, wpb=432881, bsz=16833, num_updates=54200, lr=0.000271663, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=53265 epoch 033: 237 / 1689 loss=3.939, nll_loss=2.294, ppl=4.9, wps=464920, ups=1.07, wpb=432881, bsz=16833, num_updates=54200, lr=0.000271663, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=53265 epoch 033: 237 / 1689 loss=3.939, nll_loss=2.294, ppl=4.9, wps=464920, ups=1.07, wpb=432881, bsz=16833, num_updates=54200, lr=0.000271663, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=53265 epoch 033: 237 / 1689 loss=3.939, nll_loss=2.294, ppl=4.9, wps=464920, ups=1.07, wpb=432881, bsz=16833, num_updates=54200, lr=0.000271663, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=53265 epoch 033: 237 / 1689 loss=3.939, nll_loss=2.294, ppl=4.9, wps=464920, ups=1.07, wpb=432881, bsz=16833, num_updates=54200, lr=0.000271663, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=53265 epoch 033: 237 / 1689 loss=3.939, nll_loss=2.294, ppl=4.9, wps=464920, ups=1.07, wpb=432881, bsz=16833, num_updates=54200, lr=0.000271663, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=53265 epoch 033: 237 / 1689 loss=3.939, nll_loss=2.294, ppl=4.9, wps=464920, ups=1.07, wpb=432881, bsz=16833, num_updates=54200, lr=0.000271663, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=53265 epoch 033: 237 / 1689 loss=3.939, nll_loss=2.294, ppl=4.9, wps=464920, ups=1.07, wpb=432881, bsz=16833, num_updates=54200, lr=0.000271663, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=53265 epoch 033: 237 / 1689 loss=3.939, nll_loss=2.294, ppl=4.9, wps=464920, ups=1.07, wpb=432881, bsz=16833, num_updates=54200, lr=0.000271663, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=53265 epoch 033: 237 / 1689 loss=3.939, nll_loss=2.294, ppl=4.9, wps=464920, ups=1.07, wpb=432881, bsz=16833, num_updates=54200, lr=0.000271663, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=53265 epoch 033: 237 / 1689 loss=3.939, nll_loss=2.294, ppl=4.9, wps=464920, ups=1.07, wpb=432881, bsz=16833, num_updates=54200, lr=0.000271663, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=53265 epoch 033: 237 / 1689 loss=3.939, nll_loss=2.294, ppl=4.9, wps=464920, ups=1.07, wpb=432881, bsz=16833, num_updates=54200, lr=0.000271663, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=53265 epoch 033: 237 / 1689 loss=3.939, nll_loss=2.294, ppl=4.9, wps=464920, ups=1.07, wpb=432881, bsz=16833, num_updates=54200, lr=0.000271663, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=53265 epoch 033: 237 / 1689 loss=3.939, nll_loss=2.294, ppl=4.9, wps=464920, ups=1.07, wpb=432881, bsz=16833, num_updates=54200, lr=0.000271663, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=53265 epoch 033: 237 / 1689 loss=3.939, nll_loss=2.294, ppl=4.9, wps=464920, ups=1.07, wpb=432881, bsz=16833, num_updates=54200, lr=0.000271663, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=53265 epoch 033: 237 / 1689 loss=3.939, nll_loss=2.294, ppl=4.9, wps=464920, ups=1.07, wpb=432881, bsz=16833, num_updates=54200, lr=0.000271663, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=53265 epoch 033: 237 / 1689 loss=3.939, nll_loss=2.294, ppl=4.9, wps=464920, ups=1.07, wpb=432881, bsz=16833, num_updates=54200, lr=0.000271663, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=53265 epoch 033: 237 / 1689 loss=3.939, nll_loss=2.294, ppl=4.9, wps=464920, ups=1.07, wpb=432881, bsz=16833, num_updates=54200, lr=0.000271663, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=53265 epoch 033: 237 / 1689 loss=3.939, nll_loss=2.294, ppl=4.9, wps=464920, ups=1.07, wpb=432881, bsz=16833, num_updates=54200, lr=0.000271663, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=53265 epoch 033: 237 / 1689 loss=3.939, nll_loss=2.294, ppl=4.9, wps=464920, ups=1.07, wpb=432881, bsz=16833, num_updates=54200, lr=0.000271663, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=53265 epoch 033: 237 / 1689 loss=3.939, nll_loss=2.294, ppl=4.9, wps=464920, ups=1.07, wpb=432881, bsz=16833, num_updates=54200, lr=0.000271663, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=53265 epoch 033: 237 / 1689 loss=3.939, nll_loss=2.294, ppl=4.9, wps=464920, ups=1.07, wpb=432881, bsz=16833, num_updates=54200, lr=0.000271663, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=53265 epoch 033: 237 / 1689 loss=3.939, nll_loss=2.294, ppl=4.9, wps=464920, ups=1.07, wpb=432881, bsz=16833, num_updates=54200, lr=0.000271663, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=53265 epoch 033: 237 / 1689 loss=3.939, nll_loss=2.294, ppl=4.9, wps=464920, ups=1.07, wpb=432881, bsz=16833, num_updates=54200, lr=0.000271663, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=53265 epoch 033: 237 / 1689 loss=3.939, nll_loss=2.294, ppl=4.9, wps=464920, ups=1.07, wpb=432881, bsz=16833, num_updates=54200, lr=0.000271663, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.7, wall=53265 epoch 033: 337 / 1689 loss=3.926, nll_loss=2.279, ppl=4.85, wps=461438, ups=1.06, wpb=434409, bsz=16213.8, num_updates=54300, lr=0.000271413, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=53359 epoch 033: 337 / 1689 loss=3.926, nll_loss=2.279, ppl=4.85, wps=461438, ups=1.06, wpb=434409, bsz=16213.8, num_updates=54300, lr=0.000271413, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=53359 epoch 033: 337 / 1689 loss=3.926, nll_loss=2.279, ppl=4.85, wps=461438, ups=1.06, wpb=434409, bsz=16213.8, num_updates=54300, lr=0.000271413, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=53359 epoch 033: 337 / 1689 loss=3.926, nll_loss=2.279, ppl=4.85, wps=461438, ups=1.06, wpb=434409, bsz=16213.8, num_updates=54300, lr=0.000271413, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=53359 epoch 033: 337 / 1689 loss=3.926, nll_loss=2.279, ppl=4.85, wps=461438, ups=1.06, wpb=434409, bsz=16213.8, num_updates=54300, lr=0.000271413, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=53359 epoch 033: 337 / 1689 loss=3.926, nll_loss=2.279, ppl=4.85, wps=461438, ups=1.06, wpb=434409, bsz=16213.8, num_updates=54300, lr=0.000271413, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=53359 epoch 033: 337 / 1689 loss=3.926, nll_loss=2.279, ppl=4.85, wps=461438, ups=1.06, wpb=434409, bsz=16213.8, num_updates=54300, lr=0.000271413, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=53359 epoch 033: 337 / 1689 loss=3.926, nll_loss=2.279, ppl=4.85, wps=461438, ups=1.06, wpb=434409, bsz=16213.8, num_updates=54300, lr=0.000271413, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=53359 epoch 033: 337 / 1689 loss=3.926, nll_loss=2.279, ppl=4.85, wps=461438, ups=1.06, wpb=434409, bsz=16213.8, num_updates=54300, lr=0.000271413, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=53359 epoch 033: 337 / 1689 loss=3.926, nll_loss=2.279, ppl=4.85, wps=461438, ups=1.06, wpb=434409, bsz=16213.8, num_updates=54300, lr=0.000271413, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=53359 epoch 033: 337 / 1689 loss=3.926, nll_loss=2.279, ppl=4.85, wps=461438, ups=1.06, wpb=434409, bsz=16213.8, num_updates=54300, lr=0.000271413, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=53359 epoch 033: 337 / 1689 loss=3.926, nll_loss=2.279, ppl=4.85, wps=461438, ups=1.06, wpb=434409, bsz=16213.8, num_updates=54300, lr=0.000271413, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=53359 epoch 033: 337 / 1689 loss=3.926, nll_loss=2.279, ppl=4.85, wps=461438, ups=1.06, wpb=434409, bsz=16213.8, num_updates=54300, lr=0.000271413, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=53359 epoch 033: 337 / 1689 loss=3.926, nll_loss=2.279, ppl=4.85, wps=461438, ups=1.06, wpb=434409, bsz=16213.8, num_updates=54300, lr=0.000271413, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=53359 epoch 033: 337 / 1689 loss=3.926, nll_loss=2.279, ppl=4.85, wps=461438, ups=1.06, wpb=434409, bsz=16213.8, num_updates=54300, lr=0.000271413, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=53359 epoch 033: 337 / 1689 loss=3.926, nll_loss=2.279, ppl=4.85, wps=461438, ups=1.06, wpb=434409, bsz=16213.8, num_updates=54300, lr=0.000271413, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=53359 epoch 033: 337 / 1689 loss=3.926, nll_loss=2.279, ppl=4.85, wps=461438, ups=1.06, wpb=434409, bsz=16213.8, num_updates=54300, lr=0.000271413, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=53359 epoch 033: 337 / 1689 loss=3.926, nll_loss=2.279, ppl=4.85, wps=461438, ups=1.06, wpb=434409, bsz=16213.8, num_updates=54300, lr=0.000271413, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=53359 epoch 033: 337 / 1689 loss=3.926, nll_loss=2.279, ppl=4.85, wps=461438, ups=1.06, wpb=434409, bsz=16213.8, num_updates=54300, lr=0.000271413, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=53359 epoch 033: 337 / 1689 loss=3.926, nll_loss=2.279, ppl=4.85, wps=461438, ups=1.06, wpb=434409, bsz=16213.8, num_updates=54300, lr=0.000271413, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=53359 epoch 033: 337 / 1689 loss=3.926, nll_loss=2.279, ppl=4.85, wps=461438, ups=1.06, wpb=434409, bsz=16213.8, num_updates=54300, lr=0.000271413, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=53359 epoch 033: 337 / 1689 loss=3.926, nll_loss=2.279, ppl=4.85, wps=461438, ups=1.06, wpb=434409, bsz=16213.8, num_updates=54300, lr=0.000271413, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=53359 epoch 033: 337 / 1689 loss=3.926, nll_loss=2.279, ppl=4.85, wps=461438, ups=1.06, wpb=434409, bsz=16213.8, num_updates=54300, lr=0.000271413, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=53359 epoch 033: 337 / 1689 loss=3.926, nll_loss=2.279, ppl=4.85, wps=461438, ups=1.06, wpb=434409, bsz=16213.8, num_updates=54300, lr=0.000271413, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=53359 epoch 033: 337 / 1689 loss=3.926, nll_loss=2.279, ppl=4.85, wps=461438, ups=1.06, wpb=434409, bsz=16213.8, num_updates=54300, lr=0.000271413, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=53359 epoch 033: 337 / 1689 loss=3.926, nll_loss=2.279, ppl=4.85, wps=461438, ups=1.06, wpb=434409, bsz=16213.8, num_updates=54300, lr=0.000271413, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=53359 epoch 033: 337 / 1689 loss=3.926, nll_loss=2.279, ppl=4.85, wps=461438, ups=1.06, wpb=434409, bsz=16213.8, num_updates=54300, lr=0.000271413, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=53359 epoch 033: 337 / 1689 loss=3.926, nll_loss=2.279, ppl=4.85, wps=461438, ups=1.06, wpb=434409, bsz=16213.8, num_updates=54300, lr=0.000271413, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=53359 epoch 033: 337 / 1689 loss=3.926, nll_loss=2.279, ppl=4.85, wps=461438, ups=1.06, wpb=434409, bsz=16213.8, num_updates=54300, lr=0.000271413, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=53359 epoch 033: 337 / 1689 loss=3.926, nll_loss=2.279, ppl=4.85, wps=461438, ups=1.06, wpb=434409, bsz=16213.8, num_updates=54300, lr=0.000271413, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=53359 epoch 033: 337 / 1689 loss=3.926, nll_loss=2.279, ppl=4.85, wps=461438, ups=1.06, wpb=434409, bsz=16213.8, num_updates=54300, lr=0.000271413, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=53359 epoch 033: 337 / 1689 loss=3.926, nll_loss=2.279, ppl=4.85, wps=461438, ups=1.06, wpb=434409, bsz=16213.8, num_updates=54300, lr=0.000271413, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=53359 epoch 033: 337 / 1689 loss=3.926, nll_loss=2.279, ppl=4.85, wps=461438, ups=1.06, wpb=434409, bsz=16213.8, num_updates=54300, lr=0.000271413, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=53359 epoch 033: 437 / 1689 loss=3.934, nll_loss=2.289, ppl=4.89, wps=459540, ups=1.06, wpb=432391, bsz=16371, num_updates=54400, lr=0.000271163, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53453 epoch 033: 437 / 1689 loss=3.934, nll_loss=2.289, ppl=4.89, wps=459540, ups=1.06, wpb=432391, bsz=16371, num_updates=54400, lr=0.000271163, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53453 epoch 033: 437 / 1689 loss=3.934, nll_loss=2.289, ppl=4.89, wps=459540, ups=1.06, wpb=432391, bsz=16371, num_updates=54400, lr=0.000271163, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53453 epoch 033: 437 / 1689 loss=3.934, nll_loss=2.289, ppl=4.89, wps=459540, ups=1.06, wpb=432391, bsz=16371, num_updates=54400, lr=0.000271163, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53453 epoch 033: 437 / 1689 loss=3.934, nll_loss=2.289, ppl=4.89, wps=459540, ups=1.06, wpb=432391, bsz=16371, num_updates=54400, lr=0.000271163, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53453 epoch 033: 437 / 1689 loss=3.934, nll_loss=2.289, ppl=4.89, wps=459540, ups=1.06, wpb=432391, bsz=16371, num_updates=54400, lr=0.000271163, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53453 epoch 033: 437 / 1689 loss=3.934, nll_loss=2.289, ppl=4.89, wps=459540, ups=1.06, wpb=432391, bsz=16371, num_updates=54400, lr=0.000271163, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53453 epoch 033: 437 / 1689 loss=3.934, nll_loss=2.289, ppl=4.89, wps=459540, ups=1.06, wpb=432391, bsz=16371, num_updates=54400, lr=0.000271163, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53453 epoch 033: 437 / 1689 loss=3.934, nll_loss=2.289, ppl=4.89, wps=459540, ups=1.06, wpb=432391, bsz=16371, num_updates=54400, lr=0.000271163, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53453 epoch 033: 437 / 1689 loss=3.934, nll_loss=2.289, ppl=4.89, wps=459540, ups=1.06, wpb=432391, bsz=16371, num_updates=54400, lr=0.000271163, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53453 epoch 033: 437 / 1689 loss=3.934, nll_loss=2.289, ppl=4.89, wps=459540, ups=1.06, wpb=432391, bsz=16371, num_updates=54400, lr=0.000271163, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53453 epoch 033: 437 / 1689 loss=3.934, nll_loss=2.289, ppl=4.89, wps=459540, ups=1.06, wpb=432391, bsz=16371, num_updates=54400, lr=0.000271163, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53453 epoch 033: 437 / 1689 loss=3.934, nll_loss=2.289, ppl=4.89, wps=459540, ups=1.06, wpb=432391, bsz=16371, num_updates=54400, lr=0.000271163, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53453 epoch 033: 437 / 1689 loss=3.934, nll_loss=2.289, ppl=4.89, wps=459540, ups=1.06, wpb=432391, bsz=16371, num_updates=54400, lr=0.000271163, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53453 epoch 033: 437 / 1689 loss=3.934, nll_loss=2.289, ppl=4.89, wps=459540, ups=1.06, wpb=432391, bsz=16371, num_updates=54400, lr=0.000271163, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53453 epoch 033: 437 / 1689 loss=3.934, nll_loss=2.289, ppl=4.89, wps=459540, ups=1.06, wpb=432391, bsz=16371, num_updates=54400, lr=0.000271163, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53453 epoch 033: 437 / 1689 loss=3.934, nll_loss=2.289, ppl=4.89, wps=459540, ups=1.06, wpb=432391, bsz=16371, num_updates=54400, lr=0.000271163, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53453 epoch 033: 437 / 1689 loss=3.934, nll_loss=2.289, ppl=4.89, wps=459540, ups=1.06, wpb=432391, bsz=16371, num_updates=54400, lr=0.000271163, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53453 epoch 033: 437 / 1689 loss=3.934, nll_loss=2.289, ppl=4.89, wps=459540, ups=1.06, wpb=432391, bsz=16371, num_updates=54400, lr=0.000271163, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53453 epoch 033: 437 / 1689 loss=3.934, nll_loss=2.289, ppl=4.89, wps=459540, ups=1.06, wpb=432391, bsz=16371, num_updates=54400, lr=0.000271163, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53453 epoch 033: 437 / 1689 loss=3.934, nll_loss=2.289, ppl=4.89, wps=459540, ups=1.06, wpb=432391, bsz=16371, num_updates=54400, lr=0.000271163, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53453 epoch 033: 437 / 1689 loss=3.934, nll_loss=2.289, ppl=4.89, wps=459540, ups=1.06, wpb=432391, bsz=16371, num_updates=54400, lr=0.000271163, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53453 epoch 033: 437 / 1689 loss=3.934, nll_loss=2.289, ppl=4.89, wps=459540, ups=1.06, wpb=432391, bsz=16371, num_updates=54400, lr=0.000271163, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53453 epoch 033: 437 / 1689 loss=3.934, nll_loss=2.289, ppl=4.89, wps=459540, ups=1.06, wpb=432391, bsz=16371, num_updates=54400, lr=0.000271163, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53453 epoch 033: 437 / 1689 loss=3.934, nll_loss=2.289, ppl=4.89, wps=459540, ups=1.06, wpb=432391, bsz=16371, num_updates=54400, lr=0.000271163, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53453 epoch 033: 437 / 1689 loss=3.934, nll_loss=2.289, ppl=4.89, wps=459540, ups=1.06, wpb=432391, bsz=16371, num_updates=54400, lr=0.000271163, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53453 epoch 033: 437 / 1689 loss=3.934, nll_loss=2.289, ppl=4.89, wps=459540, ups=1.06, wpb=432391, bsz=16371, num_updates=54400, lr=0.000271163, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53453 epoch 033: 437 / 1689 loss=3.934, nll_loss=2.289, ppl=4.89, wps=459540, ups=1.06, wpb=432391, bsz=16371, num_updates=54400, lr=0.000271163, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53453 epoch 033: 437 / 1689 loss=3.934, nll_loss=2.289, ppl=4.89, wps=459540, ups=1.06, wpb=432391, bsz=16371, num_updates=54400, lr=0.000271163, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53453 epoch 033: 437 / 1689 loss=3.934, nll_loss=2.289, ppl=4.89, wps=459540, ups=1.06, wpb=432391, bsz=16371, num_updates=54400, lr=0.000271163, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53453 epoch 033: 437 / 1689 loss=3.934, nll_loss=2.289, ppl=4.89, wps=459540, ups=1.06, wpb=432391, bsz=16371, num_updates=54400, lr=0.000271163, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53453 epoch 033: 437 / 1689 loss=3.934, nll_loss=2.289, ppl=4.89, wps=459540, ups=1.06, wpb=432391, bsz=16371, num_updates=54400, lr=0.000271163, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53453 epoch 033: 437 / 1689 loss=3.934, nll_loss=2.289, ppl=4.89, wps=459540, ups=1.06, wpb=432391, bsz=16371, num_updates=54400, lr=0.000271163, gnorm=0.197, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53453 epoch 033: 537 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=463155, ups=1.06, wpb=434915, bsz=16545, num_updates=54500, lr=0.000270914, gnorm=0.203, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53547 epoch 033: 537 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=463155, ups=1.06, wpb=434915, bsz=16545, num_updates=54500, lr=0.000270914, gnorm=0.203, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53547 epoch 033: 537 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=463155, ups=1.06, wpb=434915, bsz=16545, num_updates=54500, lr=0.000270914, gnorm=0.203, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53547 epoch 033: 537 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=463155, ups=1.06, wpb=434915, bsz=16545, num_updates=54500, lr=0.000270914, gnorm=0.203, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53547 epoch 033: 537 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=463155, ups=1.06, wpb=434915, bsz=16545, num_updates=54500, lr=0.000270914, gnorm=0.203, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53547 epoch 033: 537 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=463155, ups=1.06, wpb=434915, bsz=16545, num_updates=54500, lr=0.000270914, gnorm=0.203, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53547 epoch 033: 537 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=463155, ups=1.06, wpb=434915, bsz=16545, num_updates=54500, lr=0.000270914, gnorm=0.203, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53547 epoch 033: 537 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=463155, ups=1.06, wpb=434915, bsz=16545, num_updates=54500, lr=0.000270914, gnorm=0.203, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53547 epoch 033: 537 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=463155, ups=1.06, wpb=434915, bsz=16545, num_updates=54500, lr=0.000270914, gnorm=0.203, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53547 epoch 033: 537 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=463155, ups=1.06, wpb=434915, bsz=16545, num_updates=54500, lr=0.000270914, gnorm=0.203, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53547 epoch 033: 537 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=463155, ups=1.06, wpb=434915, bsz=16545, num_updates=54500, lr=0.000270914, gnorm=0.203, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53547 epoch 033: 537 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=463155, ups=1.06, wpb=434915, bsz=16545, num_updates=54500, lr=0.000270914, gnorm=0.203, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53547 epoch 033: 537 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=463155, ups=1.06, wpb=434915, bsz=16545, num_updates=54500, lr=0.000270914, gnorm=0.203, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53547 epoch 033: 537 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=463155, ups=1.06, wpb=434915, bsz=16545, num_updates=54500, lr=0.000270914, gnorm=0.203, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53547 epoch 033: 537 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=463155, ups=1.06, wpb=434915, bsz=16545, num_updates=54500, lr=0.000270914, gnorm=0.203, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53547 epoch 033: 537 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=463155, ups=1.06, wpb=434915, bsz=16545, num_updates=54500, lr=0.000270914, gnorm=0.203, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53547 epoch 033: 537 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=463155, ups=1.06, wpb=434915, bsz=16545, num_updates=54500, lr=0.000270914, gnorm=0.203, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53547 epoch 033: 537 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=463155, ups=1.06, wpb=434915, bsz=16545, num_updates=54500, lr=0.000270914, gnorm=0.203, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53547 epoch 033: 537 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=463155, ups=1.06, wpb=434915, bsz=16545, num_updates=54500, lr=0.000270914, gnorm=0.203, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53547 epoch 033: 537 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=463155, ups=1.06, wpb=434915, bsz=16545, num_updates=54500, lr=0.000270914, gnorm=0.203, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53547 epoch 033: 537 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=463155, ups=1.06, wpb=434915, bsz=16545, num_updates=54500, lr=0.000270914, gnorm=0.203, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53547 epoch 033: 537 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=463155, ups=1.06, wpb=434915, bsz=16545, num_updates=54500, lr=0.000270914, gnorm=0.203, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53547 epoch 033: 537 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=463155, ups=1.06, wpb=434915, bsz=16545, num_updates=54500, lr=0.000270914, gnorm=0.203, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53547 epoch 033: 537 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=463155, ups=1.06, wpb=434915, bsz=16545, num_updates=54500, lr=0.000270914, gnorm=0.203, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53547 epoch 033: 537 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=463155, ups=1.06, wpb=434915, bsz=16545, num_updates=54500, lr=0.000270914, gnorm=0.203, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53547 epoch 033: 537 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=463155, ups=1.06, wpb=434915, bsz=16545, num_updates=54500, lr=0.000270914, gnorm=0.203, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53547 epoch 033: 537 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=463155, ups=1.06, wpb=434915, bsz=16545, num_updates=54500, lr=0.000270914, gnorm=0.203, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53547 epoch 033: 537 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=463155, ups=1.06, wpb=434915, bsz=16545, num_updates=54500, lr=0.000270914, gnorm=0.203, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53547 epoch 033: 537 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=463155, ups=1.06, wpb=434915, bsz=16545, num_updates=54500, lr=0.000270914, gnorm=0.203, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53547 epoch 033: 537 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=463155, ups=1.06, wpb=434915, bsz=16545, num_updates=54500, lr=0.000270914, gnorm=0.203, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53547 epoch 033: 537 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=463155, ups=1.06, wpb=434915, bsz=16545, num_updates=54500, lr=0.000270914, gnorm=0.203, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53547 epoch 033: 537 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=463155, ups=1.06, wpb=434915, bsz=16545, num_updates=54500, lr=0.000270914, gnorm=0.203, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53547 epoch 033: 537 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=463155, ups=1.06, wpb=434915, bsz=16545, num_updates=54500, lr=0.000270914, gnorm=0.203, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=53547 epoch 033: 638 / 1689 loss=3.95, nll_loss=2.307, ppl=4.95, wps=451334, ups=1.04, wpb=433490, bsz=16814, num_updates=54600, lr=0.000270666, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=95, gb_free=18.7, wall=53643 epoch 033: 638 / 1689 loss=3.95, nll_loss=2.307, ppl=4.95, wps=451334, ups=1.04, wpb=433490, bsz=16814, num_updates=54600, lr=0.000270666, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=95, gb_free=18.7, wall=53643 epoch 033: 638 / 1689 loss=3.95, nll_loss=2.307, ppl=4.95, wps=451334, ups=1.04, wpb=433490, bsz=16814, num_updates=54600, lr=0.000270666, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=95, gb_free=18.7, wall=53643 epoch 033: 638 / 1689 loss=3.95, nll_loss=2.307, ppl=4.95, wps=451334, ups=1.04, wpb=433490, bsz=16814, num_updates=54600, lr=0.000270666, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=95, gb_free=18.7, wall=53643 epoch 033: 638 / 1689 loss=3.95, nll_loss=2.307, ppl=4.95, wps=451334, ups=1.04, wpb=433490, bsz=16814, num_updates=54600, lr=0.000270666, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=95, gb_free=18.7, wall=53643 epoch 033: 638 / 1689 loss=3.95, nll_loss=2.307, ppl=4.95, wps=451334, ups=1.04, wpb=433490, bsz=16814, num_updates=54600, lr=0.000270666, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=95, gb_free=18.7, wall=53643 epoch 033: 638 / 1689 loss=3.95, nll_loss=2.307, ppl=4.95, wps=451334, ups=1.04, wpb=433490, bsz=16814, num_updates=54600, lr=0.000270666, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=95, gb_free=18.7, wall=53643 epoch 033: 638 / 1689 loss=3.95, nll_loss=2.307, ppl=4.95, wps=451334, ups=1.04, wpb=433490, bsz=16814, num_updates=54600, lr=0.000270666, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=95, gb_free=18.7, wall=53643 epoch 033: 638 / 1689 loss=3.95, nll_loss=2.307, ppl=4.95, wps=451334, ups=1.04, wpb=433490, bsz=16814, num_updates=54600, lr=0.000270666, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=95, gb_free=18.7, wall=53643 epoch 033: 638 / 1689 loss=3.95, nll_loss=2.307, ppl=4.95, wps=451334, ups=1.04, wpb=433490, bsz=16814, num_updates=54600, lr=0.000270666, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=95, gb_free=18.7, wall=53643 epoch 033: 638 / 1689 loss=3.95, nll_loss=2.307, ppl=4.95, wps=451334, ups=1.04, wpb=433490, bsz=16814, num_updates=54600, lr=0.000270666, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=95, gb_free=18.7, wall=53643 epoch 033: 638 / 1689 loss=3.95, nll_loss=2.307, ppl=4.95, wps=451334, ups=1.04, wpb=433490, bsz=16814, num_updates=54600, lr=0.000270666, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=95, gb_free=18.7, wall=53643 epoch 033: 638 / 1689 loss=3.95, nll_loss=2.307, ppl=4.95, wps=451334, ups=1.04, wpb=433490, bsz=16814, num_updates=54600, lr=0.000270666, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=95, gb_free=18.7, wall=53643 epoch 033: 638 / 1689 loss=3.95, nll_loss=2.307, ppl=4.95, wps=451334, ups=1.04, wpb=433490, bsz=16814, num_updates=54600, lr=0.000270666, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=95, gb_free=18.7, wall=53643 epoch 033: 638 / 1689 loss=3.95, nll_loss=2.307, ppl=4.95, wps=451334, ups=1.04, wpb=433490, bsz=16814, num_updates=54600, lr=0.000270666, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=95, gb_free=18.7, wall=53643 epoch 033: 638 / 1689 loss=3.95, nll_loss=2.307, ppl=4.95, wps=451334, ups=1.04, wpb=433490, bsz=16814, num_updates=54600, lr=0.000270666, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=95, gb_free=18.7, wall=53643 epoch 033: 638 / 1689 loss=3.95, nll_loss=2.307, ppl=4.95, wps=451334, ups=1.04, wpb=433490, bsz=16814, num_updates=54600, lr=0.000270666, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=95, gb_free=18.7, wall=53643 epoch 033: 638 / 1689 loss=3.95, nll_loss=2.307, ppl=4.95, wps=451334, ups=1.04, wpb=433490, bsz=16814, num_updates=54600, lr=0.000270666, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=95, gb_free=18.7, wall=53643 epoch 033: 638 / 1689 loss=3.95, nll_loss=2.307, ppl=4.95, wps=451334, ups=1.04, wpb=433490, bsz=16814, num_updates=54600, lr=0.000270666, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=95, gb_free=18.7, wall=53643 epoch 033: 638 / 1689 loss=3.95, nll_loss=2.307, ppl=4.95, wps=451334, ups=1.04, wpb=433490, bsz=16814, num_updates=54600, lr=0.000270666, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=95, gb_free=18.7, wall=53643 epoch 033: 638 / 1689 loss=3.95, nll_loss=2.307, ppl=4.95, wps=451334, ups=1.04, wpb=433490, bsz=16814, num_updates=54600, lr=0.000270666, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=95, gb_free=18.7, wall=53643 epoch 033: 638 / 1689 loss=3.95, nll_loss=2.307, ppl=4.95, wps=451334, ups=1.04, wpb=433490, bsz=16814, num_updates=54600, lr=0.000270666, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=95, gb_free=18.7, wall=53643 epoch 033: 638 / 1689 loss=3.95, nll_loss=2.307, ppl=4.95, wps=451334, ups=1.04, wpb=433490, bsz=16814, num_updates=54600, lr=0.000270666, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=95, gb_free=18.7, wall=53643 epoch 033: 638 / 1689 loss=3.95, nll_loss=2.307, ppl=4.95, wps=451334, ups=1.04, wpb=433490, bsz=16814, num_updates=54600, lr=0.000270666, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=95, gb_free=18.7, wall=53643 epoch 033: 638 / 1689 loss=3.95, nll_loss=2.307, ppl=4.95, wps=451334, ups=1.04, wpb=433490, bsz=16814, num_updates=54600, lr=0.000270666, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=95, gb_free=18.7, wall=53643 epoch 033: 638 / 1689 loss=3.95, nll_loss=2.307, ppl=4.95, wps=451334, ups=1.04, wpb=433490, bsz=16814, num_updates=54600, lr=0.000270666, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=95, gb_free=18.7, wall=53643 epoch 033: 638 / 1689 loss=3.95, nll_loss=2.307, ppl=4.95, wps=451334, ups=1.04, wpb=433490, bsz=16814, num_updates=54600, lr=0.000270666, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=95, gb_free=18.7, wall=53643 epoch 033: 638 / 1689 loss=3.95, nll_loss=2.307, ppl=4.95, wps=451334, ups=1.04, wpb=433490, bsz=16814, num_updates=54600, lr=0.000270666, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=95, gb_free=18.7, wall=53643 epoch 033: 638 / 1689 loss=3.95, nll_loss=2.307, ppl=4.95, wps=451334, ups=1.04, wpb=433490, bsz=16814, num_updates=54600, lr=0.000270666, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=95, gb_free=18.7, wall=53643 epoch 033: 638 / 1689 loss=3.95, nll_loss=2.307, ppl=4.95, wps=451334, ups=1.04, wpb=433490, bsz=16814, num_updates=54600, lr=0.000270666, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=95, gb_free=18.7, wall=53643 epoch 033: 638 / 1689 loss=3.95, nll_loss=2.307, ppl=4.95, wps=451334, ups=1.04, wpb=433490, bsz=16814, num_updates=54600, lr=0.000270666, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=95, gb_free=18.7, wall=53643 epoch 033: 638 / 1689 loss=3.95, nll_loss=2.307, ppl=4.95, wps=451334, ups=1.04, wpb=433490, bsz=16814, num_updates=54600, lr=0.000270666, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=95, gb_free=18.7, wall=53643 epoch 033: 638 / 1689 loss=3.95, nll_loss=2.307, ppl=4.95, wps=451334, ups=1.04, wpb=433490, bsz=16814, num_updates=54600, lr=0.000270666, gnorm=0.21, clip=0, loss_scale=0.5, train_wall=95, gb_free=18.7, wall=53643 epoch 033: 738 / 1689 loss=3.935, nll_loss=2.29, ppl=4.89, wps=459383, ups=1.06, wpb=433003, bsz=16439, num_updates=54700, lr=0.000270418, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.4, wall=53737 epoch 033: 738 / 1689 loss=3.935, nll_loss=2.29, ppl=4.89, wps=459383, ups=1.06, wpb=433003, bsz=16439, num_updates=54700, lr=0.000270418, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.4, wall=53737 epoch 033: 738 / 1689 loss=3.935, nll_loss=2.29, ppl=4.89, wps=459383, ups=1.06, wpb=433003, bsz=16439, num_updates=54700, lr=0.000270418, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.4, wall=53737 epoch 033: 738 / 1689 loss=3.935, nll_loss=2.29, ppl=4.89, wps=459383, ups=1.06, wpb=433003, bsz=16439, num_updates=54700, lr=0.000270418, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.4, wall=53737 epoch 033: 738 / 1689 loss=3.935, nll_loss=2.29, ppl=4.89, wps=459383, ups=1.06, wpb=433003, bsz=16439, num_updates=54700, lr=0.000270418, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.4, wall=53737 epoch 033: 738 / 1689 loss=3.935, nll_loss=2.29, ppl=4.89, wps=459383, ups=1.06, wpb=433003, bsz=16439, num_updates=54700, lr=0.000270418, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.4, wall=53737 epoch 033: 738 / 1689 loss=3.935, nll_loss=2.29, ppl=4.89, wps=459383, ups=1.06, wpb=433003, bsz=16439, num_updates=54700, lr=0.000270418, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.4, wall=53737 epoch 033: 738 / 1689 loss=3.935, nll_loss=2.29, ppl=4.89, wps=459383, ups=1.06, wpb=433003, bsz=16439, num_updates=54700, lr=0.000270418, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.4, wall=53737 epoch 033: 738 / 1689 loss=3.935, nll_loss=2.29, ppl=4.89, wps=459383, ups=1.06, wpb=433003, bsz=16439, num_updates=54700, lr=0.000270418, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.4, wall=53737 epoch 033: 738 / 1689 loss=3.935, nll_loss=2.29, ppl=4.89, wps=459383, ups=1.06, wpb=433003, bsz=16439, num_updates=54700, lr=0.000270418, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.4, wall=53737 epoch 033: 738 / 1689 loss=3.935, nll_loss=2.29, ppl=4.89, wps=459383, ups=1.06, wpb=433003, bsz=16439, num_updates=54700, lr=0.000270418, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.4, wall=53737 epoch 033: 738 / 1689 loss=3.935, nll_loss=2.29, ppl=4.89, wps=459383, ups=1.06, wpb=433003, bsz=16439, num_updates=54700, lr=0.000270418, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.4, wall=53737 epoch 033: 738 / 1689 loss=3.935, nll_loss=2.29, ppl=4.89, wps=459383, ups=1.06, wpb=433003, bsz=16439, num_updates=54700, lr=0.000270418, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.4, wall=53737 epoch 033: 738 / 1689 loss=3.935, nll_loss=2.29, ppl=4.89, wps=459383, ups=1.06, wpb=433003, bsz=16439, num_updates=54700, lr=0.000270418, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.4, wall=53737 epoch 033: 738 / 1689 loss=3.935, nll_loss=2.29, ppl=4.89, wps=459383, ups=1.06, wpb=433003, bsz=16439, num_updates=54700, lr=0.000270418, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.4, wall=53737 epoch 033: 738 / 1689 loss=3.935, nll_loss=2.29, ppl=4.89, wps=459383, ups=1.06, wpb=433003, bsz=16439, num_updates=54700, lr=0.000270418, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.4, wall=53737 epoch 033: 738 / 1689 loss=3.935, nll_loss=2.29, ppl=4.89, wps=459383, ups=1.06, wpb=433003, bsz=16439, num_updates=54700, lr=0.000270418, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.4, wall=53737 epoch 033: 738 / 1689 loss=3.935, nll_loss=2.29, ppl=4.89, wps=459383, ups=1.06, wpb=433003, bsz=16439, num_updates=54700, lr=0.000270418, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.4, wall=53737 epoch 033: 738 / 1689 loss=3.935, nll_loss=2.29, ppl=4.89, wps=459383, ups=1.06, wpb=433003, bsz=16439, num_updates=54700, lr=0.000270418, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.4, wall=53737 epoch 033: 738 / 1689 loss=3.935, nll_loss=2.29, ppl=4.89, wps=459383, ups=1.06, wpb=433003, bsz=16439, num_updates=54700, lr=0.000270418, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.4, wall=53737 epoch 033: 738 / 1689 loss=3.935, nll_loss=2.29, ppl=4.89, wps=459383, ups=1.06, wpb=433003, bsz=16439, num_updates=54700, lr=0.000270418, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.4, wall=53737 epoch 033: 738 / 1689 loss=3.935, nll_loss=2.29, ppl=4.89, wps=459383, ups=1.06, wpb=433003, bsz=16439, num_updates=54700, lr=0.000270418, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.4, wall=53737 epoch 033: 738 / 1689 loss=3.935, nll_loss=2.29, ppl=4.89, wps=459383, ups=1.06, wpb=433003, bsz=16439, num_updates=54700, lr=0.000270418, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.4, wall=53737 epoch 033: 738 / 1689 loss=3.935, nll_loss=2.29, ppl=4.89, wps=459383, ups=1.06, wpb=433003, bsz=16439, num_updates=54700, lr=0.000270418, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.4, wall=53737 epoch 033: 738 / 1689 loss=3.935, nll_loss=2.29, ppl=4.89, wps=459383, ups=1.06, wpb=433003, bsz=16439, num_updates=54700, lr=0.000270418, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.4, wall=53737 epoch 033: 738 / 1689 loss=3.935, nll_loss=2.29, ppl=4.89, wps=459383, ups=1.06, wpb=433003, bsz=16439, num_updates=54700, lr=0.000270418, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.4, wall=53737 epoch 033: 738 / 1689 loss=3.935, nll_loss=2.29, ppl=4.89, wps=459383, ups=1.06, wpb=433003, bsz=16439, num_updates=54700, lr=0.000270418, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.4, wall=53737 epoch 033: 738 / 1689 loss=3.935, nll_loss=2.29, ppl=4.89, wps=459383, ups=1.06, wpb=433003, bsz=16439, num_updates=54700, lr=0.000270418, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.4, wall=53737 epoch 033: 738 / 1689 loss=3.935, nll_loss=2.29, ppl=4.89, wps=459383, ups=1.06, wpb=433003, bsz=16439, num_updates=54700, lr=0.000270418, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.4, wall=53737 epoch 033: 738 / 1689 loss=3.935, nll_loss=2.29, ppl=4.89, wps=459383, ups=1.06, wpb=433003, bsz=16439, num_updates=54700, lr=0.000270418, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.4, wall=53737 epoch 033: 738 / 1689 loss=3.935, nll_loss=2.29, ppl=4.89, wps=459383, ups=1.06, wpb=433003, bsz=16439, num_updates=54700, lr=0.000270418, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.4, wall=53737 epoch 033: 738 / 1689 loss=3.935, nll_loss=2.29, ppl=4.89, wps=459383, ups=1.06, wpb=433003, bsz=16439, num_updates=54700, lr=0.000270418, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.4, wall=53737 epoch 033: 738 / 1689 loss=3.935, nll_loss=2.29, ppl=4.89, wps=459383, ups=1.06, wpb=433003, bsz=16439, num_updates=54700, lr=0.000270418, gnorm=0.22, clip=0, loss_scale=0.5, train_wall=93, gb_free=20.4, wall=53737 epoch 033: 838 / 1689 loss=3.952, nll_loss=2.309, ppl=4.96, wps=460680, ups=1.06, wpb=434228, bsz=16709, num_updates=54800, lr=0.000270172, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=53832 epoch 033: 838 / 1689 loss=3.952, nll_loss=2.309, ppl=4.96, wps=460680, ups=1.06, wpb=434228, bsz=16709, num_updates=54800, lr=0.000270172, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=53832 epoch 033: 838 / 1689 loss=3.952, nll_loss=2.309, ppl=4.96, wps=460680, ups=1.06, wpb=434228, bsz=16709, num_updates=54800, lr=0.000270172, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=53832 epoch 033: 838 / 1689 loss=3.952, nll_loss=2.309, ppl=4.96, wps=460680, ups=1.06, wpb=434228, bsz=16709, num_updates=54800, lr=0.000270172, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=53832 epoch 033: 838 / 1689 loss=3.952, nll_loss=2.309, ppl=4.96, wps=460680, ups=1.06, wpb=434228, bsz=16709, num_updates=54800, lr=0.000270172, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=53832 epoch 033: 838 / 1689 loss=3.952, nll_loss=2.309, ppl=4.96, wps=460680, ups=1.06, wpb=434228, bsz=16709, num_updates=54800, lr=0.000270172, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=53832 epoch 033: 838 / 1689 loss=3.952, nll_loss=2.309, ppl=4.96, wps=460680, ups=1.06, wpb=434228, bsz=16709, num_updates=54800, lr=0.000270172, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=53832 epoch 033: 838 / 1689 loss=3.952, nll_loss=2.309, ppl=4.96, wps=460680, ups=1.06, wpb=434228, bsz=16709, num_updates=54800, lr=0.000270172, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=53832 epoch 033: 838 / 1689 loss=3.952, nll_loss=2.309, ppl=4.96, wps=460680, ups=1.06, wpb=434228, bsz=16709, num_updates=54800, lr=0.000270172, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=53832 epoch 033: 838 / 1689 loss=3.952, nll_loss=2.309, ppl=4.96, wps=460680, ups=1.06, wpb=434228, bsz=16709, num_updates=54800, lr=0.000270172, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=53832 epoch 033: 838 / 1689 loss=3.952, nll_loss=2.309, ppl=4.96, wps=460680, ups=1.06, wpb=434228, bsz=16709, num_updates=54800, lr=0.000270172, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=53832 epoch 033: 838 / 1689 loss=3.952, nll_loss=2.309, ppl=4.96, wps=460680, ups=1.06, wpb=434228, bsz=16709, num_updates=54800, lr=0.000270172, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=53832 epoch 033: 838 / 1689 loss=3.952, nll_loss=2.309, ppl=4.96, wps=460680, ups=1.06, wpb=434228, bsz=16709, num_updates=54800, lr=0.000270172, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=53832 epoch 033: 838 / 1689 loss=3.952, nll_loss=2.309, ppl=4.96, wps=460680, ups=1.06, wpb=434228, bsz=16709, num_updates=54800, lr=0.000270172, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=53832 epoch 033: 838 / 1689 loss=3.952, nll_loss=2.309, ppl=4.96, wps=460680, ups=1.06, wpb=434228, bsz=16709, num_updates=54800, lr=0.000270172, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=53832 epoch 033: 838 / 1689 loss=3.952, nll_loss=2.309, ppl=4.96, wps=460680, ups=1.06, wpb=434228, bsz=16709, num_updates=54800, lr=0.000270172, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=53832 epoch 033: 838 / 1689 loss=3.952, nll_loss=2.309, ppl=4.96, wps=460680, ups=1.06, wpb=434228, bsz=16709, num_updates=54800, lr=0.000270172, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=53832 epoch 033: 838 / 1689 loss=3.952, nll_loss=2.309, ppl=4.96, wps=460680, ups=1.06, wpb=434228, bsz=16709, num_updates=54800, lr=0.000270172, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=53832 epoch 033: 838 / 1689 loss=3.952, nll_loss=2.309, ppl=4.96, wps=460680, ups=1.06, wpb=434228, bsz=16709, num_updates=54800, lr=0.000270172, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=53832 epoch 033: 838 / 1689 loss=3.952, nll_loss=2.309, ppl=4.96, wps=460680, ups=1.06, wpb=434228, bsz=16709, num_updates=54800, lr=0.000270172, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=53832 epoch 033: 838 / 1689 loss=3.952, nll_loss=2.309, ppl=4.96, wps=460680, ups=1.06, wpb=434228, bsz=16709, num_updates=54800, lr=0.000270172, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=53832 epoch 033: 838 / 1689 loss=3.952, nll_loss=2.309, ppl=4.96, wps=460680, ups=1.06, wpb=434228, bsz=16709, num_updates=54800, lr=0.000270172, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=53832 epoch 033: 838 / 1689 loss=3.952, nll_loss=2.309, ppl=4.96, wps=460680, ups=1.06, wpb=434228, bsz=16709, num_updates=54800, lr=0.000270172, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=53832 epoch 033: 838 / 1689 loss=3.952, nll_loss=2.309, ppl=4.96, wps=460680, ups=1.06, wpb=434228, bsz=16709, num_updates=54800, lr=0.000270172, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=53832 epoch 033: 838 / 1689 loss=3.952, nll_loss=2.309, ppl=4.96, wps=460680, ups=1.06, wpb=434228, bsz=16709, num_updates=54800, lr=0.000270172, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=53832 epoch 033: 838 / 1689 loss=3.952, nll_loss=2.309, ppl=4.96, wps=460680, ups=1.06, wpb=434228, bsz=16709, num_updates=54800, lr=0.000270172, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=53832 epoch 033: 838 / 1689 loss=3.952, nll_loss=2.309, ppl=4.96, wps=460680, ups=1.06, wpb=434228, bsz=16709, num_updates=54800, lr=0.000270172, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=53832 epoch 033: 838 / 1689 loss=3.952, nll_loss=2.309, ppl=4.96, wps=460680, ups=1.06, wpb=434228, bsz=16709, num_updates=54800, lr=0.000270172, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=53832 epoch 033: 838 / 1689 loss=3.952, nll_loss=2.309, ppl=4.96, wps=460680, ups=1.06, wpb=434228, bsz=16709, num_updates=54800, lr=0.000270172, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=53832 epoch 033: 838 / 1689 loss=3.952, nll_loss=2.309, ppl=4.96, wps=460680, ups=1.06, wpb=434228, bsz=16709, num_updates=54800, lr=0.000270172, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=53832 epoch 033: 838 / 1689 loss=3.952, nll_loss=2.309, ppl=4.96, wps=460680, ups=1.06, wpb=434228, bsz=16709, num_updates=54800, lr=0.000270172, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=53832 epoch 033: 838 / 1689 loss=3.952, nll_loss=2.309, ppl=4.96, wps=460680, ups=1.06, wpb=434228, bsz=16709, num_updates=54800, lr=0.000270172, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=53832 epoch 033: 838 / 1689 loss=3.952, nll_loss=2.309, ppl=4.96, wps=460680, ups=1.06, wpb=434228, bsz=16709, num_updates=54800, lr=0.000270172, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.8, wall=53832 epoch 033: 938 / 1689 loss=3.943, nll_loss=2.299, ppl=4.92, wps=461232, ups=1.06, wpb=433585, bsz=16024.8, num_updates=54900, lr=0.000269925, gnorm=0.199, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=53926 epoch 033: 938 / 1689 loss=3.943, nll_loss=2.299, ppl=4.92, wps=461232, ups=1.06, wpb=433585, bsz=16024.8, num_updates=54900, lr=0.000269925, gnorm=0.199, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=53926 epoch 033: 938 / 1689 loss=3.943, nll_loss=2.299, ppl=4.92, wps=461232, ups=1.06, wpb=433585, bsz=16024.8, num_updates=54900, lr=0.000269925, gnorm=0.199, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=53926 epoch 033: 938 / 1689 loss=3.943, nll_loss=2.299, ppl=4.92, wps=461232, ups=1.06, wpb=433585, bsz=16024.8, num_updates=54900, lr=0.000269925, gnorm=0.199, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=53926 epoch 033: 938 / 1689 loss=3.943, nll_loss=2.299, ppl=4.92, wps=461232, ups=1.06, wpb=433585, bsz=16024.8, num_updates=54900, lr=0.000269925, gnorm=0.199, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=53926 epoch 033: 938 / 1689 loss=3.943, nll_loss=2.299, ppl=4.92, wps=461232, ups=1.06, wpb=433585, bsz=16024.8, num_updates=54900, lr=0.000269925, gnorm=0.199, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=53926 epoch 033: 938 / 1689 loss=3.943, nll_loss=2.299, ppl=4.92, wps=461232, ups=1.06, wpb=433585, bsz=16024.8, num_updates=54900, lr=0.000269925, gnorm=0.199, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=53926 epoch 033: 938 / 1689 loss=3.943, nll_loss=2.299, ppl=4.92, wps=461232, ups=1.06, wpb=433585, bsz=16024.8, num_updates=54900, lr=0.000269925, gnorm=0.199, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=53926 epoch 033: 938 / 1689 loss=3.943, nll_loss=2.299, ppl=4.92, wps=461232, ups=1.06, wpb=433585, bsz=16024.8, num_updates=54900, lr=0.000269925, gnorm=0.199, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=53926 epoch 033: 938 / 1689 loss=3.943, nll_loss=2.299, ppl=4.92, wps=461232, ups=1.06, wpb=433585, bsz=16024.8, num_updates=54900, lr=0.000269925, gnorm=0.199, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=53926 epoch 033: 938 / 1689 loss=3.943, nll_loss=2.299, ppl=4.92, wps=461232, ups=1.06, wpb=433585, bsz=16024.8, num_updates=54900, lr=0.000269925, gnorm=0.199, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=53926 epoch 033: 938 / 1689 loss=3.943, nll_loss=2.299, ppl=4.92, wps=461232, ups=1.06, wpb=433585, bsz=16024.8, num_updates=54900, lr=0.000269925, gnorm=0.199, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=53926 epoch 033: 938 / 1689 loss=3.943, nll_loss=2.299, ppl=4.92, wps=461232, ups=1.06, wpb=433585, bsz=16024.8, num_updates=54900, lr=0.000269925, gnorm=0.199, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=53926 epoch 033: 938 / 1689 loss=3.943, nll_loss=2.299, ppl=4.92, wps=461232, ups=1.06, wpb=433585, bsz=16024.8, num_updates=54900, lr=0.000269925, gnorm=0.199, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=53926 epoch 033: 938 / 1689 loss=3.943, nll_loss=2.299, ppl=4.92, wps=461232, ups=1.06, wpb=433585, bsz=16024.8, num_updates=54900, lr=0.000269925, gnorm=0.199, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=53926 epoch 033: 938 / 1689 loss=3.943, nll_loss=2.299, ppl=4.92, wps=461232, ups=1.06, wpb=433585, bsz=16024.8, num_updates=54900, lr=0.000269925, gnorm=0.199, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=53926 epoch 033: 938 / 1689 loss=3.943, nll_loss=2.299, ppl=4.92, wps=461232, ups=1.06, wpb=433585, bsz=16024.8, num_updates=54900, lr=0.000269925, gnorm=0.199, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=53926 epoch 033: 938 / 1689 loss=3.943, nll_loss=2.299, ppl=4.92, wps=461232, ups=1.06, wpb=433585, bsz=16024.8, num_updates=54900, lr=0.000269925, gnorm=0.199, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=53926 epoch 033: 938 / 1689 loss=3.943, nll_loss=2.299, ppl=4.92, wps=461232, ups=1.06, wpb=433585, bsz=16024.8, num_updates=54900, lr=0.000269925, gnorm=0.199, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=53926 epoch 033: 938 / 1689 loss=3.943, nll_loss=2.299, ppl=4.92, wps=461232, ups=1.06, wpb=433585, bsz=16024.8, num_updates=54900, lr=0.000269925, gnorm=0.199, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=53926 epoch 033: 938 / 1689 loss=3.943, nll_loss=2.299, ppl=4.92, wps=461232, ups=1.06, wpb=433585, bsz=16024.8, num_updates=54900, lr=0.000269925, gnorm=0.199, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=53926 epoch 033: 938 / 1689 loss=3.943, nll_loss=2.299, ppl=4.92, wps=461232, ups=1.06, wpb=433585, bsz=16024.8, num_updates=54900, lr=0.000269925, gnorm=0.199, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=53926 epoch 033: 938 / 1689 loss=3.943, nll_loss=2.299, ppl=4.92, wps=461232, ups=1.06, wpb=433585, bsz=16024.8, num_updates=54900, lr=0.000269925, gnorm=0.199, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=53926 epoch 033: 938 / 1689 loss=3.943, nll_loss=2.299, ppl=4.92, wps=461232, ups=1.06, wpb=433585, bsz=16024.8, num_updates=54900, lr=0.000269925, gnorm=0.199, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=53926 epoch 033: 938 / 1689 loss=3.943, nll_loss=2.299, ppl=4.92, wps=461232, ups=1.06, wpb=433585, bsz=16024.8, num_updates=54900, lr=0.000269925, gnorm=0.199, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=53926 epoch 033: 938 / 1689 loss=3.943, nll_loss=2.299, ppl=4.92, wps=461232, ups=1.06, wpb=433585, bsz=16024.8, num_updates=54900, lr=0.000269925, gnorm=0.199, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=53926 epoch 033: 938 / 1689 loss=3.943, nll_loss=2.299, ppl=4.92, wps=461232, ups=1.06, wpb=433585, bsz=16024.8, num_updates=54900, lr=0.000269925, gnorm=0.199, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=53926 epoch 033: 938 / 1689 loss=3.943, nll_loss=2.299, ppl=4.92, wps=461232, ups=1.06, wpb=433585, bsz=16024.8, num_updates=54900, lr=0.000269925, gnorm=0.199, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=53926 epoch 033: 938 / 1689 loss=3.943, nll_loss=2.299, ppl=4.92, wps=461232, ups=1.06, wpb=433585, bsz=16024.8, num_updates=54900, lr=0.000269925, gnorm=0.199, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=53926 epoch 033: 938 / 1689 loss=3.943, nll_loss=2.299, ppl=4.92, wps=461232, ups=1.06, wpb=433585, bsz=16024.8, num_updates=54900, lr=0.000269925, gnorm=0.199, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=53926 epoch 033: 938 / 1689 loss=3.943, nll_loss=2.299, ppl=4.92, wps=461232, ups=1.06, wpb=433585, bsz=16024.8, num_updates=54900, lr=0.000269925, gnorm=0.199, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=53926 epoch 033: 938 / 1689 loss=3.943, nll_loss=2.299, ppl=4.92, wps=461232, ups=1.06, wpb=433585, bsz=16024.8, num_updates=54900, lr=0.000269925, gnorm=0.199, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=53926 epoch 033: 938 / 1689 loss=3.943, nll_loss=2.299, ppl=4.92, wps=461232, ups=1.06, wpb=433585, bsz=16024.8, num_updates=54900, lr=0.000269925, gnorm=0.199, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.4, wall=53926 epoch 033: 1038 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=455569, ups=1.05, wpb=432873, bsz=16549, num_updates=55000, lr=0.00026968, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=54021 epoch 033: 1038 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=455569, ups=1.05, wpb=432873, bsz=16549, num_updates=55000, lr=0.00026968, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=54021 epoch 033: 1038 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=455569, ups=1.05, wpb=432873, bsz=16549, num_updates=55000, lr=0.00026968, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=54021 epoch 033: 1038 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=455569, ups=1.05, wpb=432873, bsz=16549, num_updates=55000, lr=0.00026968, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=54021 epoch 033: 1038 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=455569, ups=1.05, wpb=432873, bsz=16549, num_updates=55000, lr=0.00026968, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=54021 epoch 033: 1038 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=455569, ups=1.05, wpb=432873, bsz=16549, num_updates=55000, lr=0.00026968, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=54021 epoch 033: 1038 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=455569, ups=1.05, wpb=432873, bsz=16549, num_updates=55000, lr=0.00026968, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=54021 epoch 033: 1038 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=455569, ups=1.05, wpb=432873, bsz=16549, num_updates=55000, lr=0.00026968, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=54021 epoch 033: 1038 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=455569, ups=1.05, wpb=432873, bsz=16549, num_updates=55000, lr=0.00026968, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=54021 epoch 033: 1038 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=455569, ups=1.05, wpb=432873, bsz=16549, num_updates=55000, lr=0.00026968, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=54021 epoch 033: 1038 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=455569, ups=1.05, wpb=432873, bsz=16549, num_updates=55000, lr=0.00026968, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=54021 epoch 033: 1038 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=455569, ups=1.05, wpb=432873, bsz=16549, num_updates=55000, lr=0.00026968, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=54021 epoch 033: 1038 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=455569, ups=1.05, wpb=432873, bsz=16549, num_updates=55000, lr=0.00026968, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=54021 epoch 033: 1038 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=455569, ups=1.05, wpb=432873, bsz=16549, num_updates=55000, lr=0.00026968, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=54021 epoch 033: 1038 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=455569, ups=1.05, wpb=432873, bsz=16549, num_updates=55000, lr=0.00026968, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=54021 epoch 033: 1038 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=455569, ups=1.05, wpb=432873, bsz=16549, num_updates=55000, lr=0.00026968, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=54021 epoch 033: 1038 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=455569, ups=1.05, wpb=432873, bsz=16549, num_updates=55000, lr=0.00026968, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=54021 epoch 033: 1038 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=455569, ups=1.05, wpb=432873, bsz=16549, num_updates=55000, lr=0.00026968, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=54021 epoch 033: 1038 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=455569, ups=1.05, wpb=432873, bsz=16549, num_updates=55000, lr=0.00026968, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=54021 epoch 033: 1038 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=455569, ups=1.05, wpb=432873, bsz=16549, num_updates=55000, lr=0.00026968, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=54021 epoch 033: 1038 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=455569, ups=1.05, wpb=432873, bsz=16549, num_updates=55000, lr=0.00026968, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=54021 epoch 033: 1038 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=455569, ups=1.05, wpb=432873, bsz=16549, num_updates=55000, lr=0.00026968, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=54021 epoch 033: 1038 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=455569, ups=1.05, wpb=432873, bsz=16549, num_updates=55000, lr=0.00026968, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=54021 epoch 033: 1038 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=455569, ups=1.05, wpb=432873, bsz=16549, num_updates=55000, lr=0.00026968, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=54021 epoch 033: 1038 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=455569, ups=1.05, wpb=432873, bsz=16549, num_updates=55000, lr=0.00026968, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=54021 epoch 033: 1038 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=455569, ups=1.05, wpb=432873, bsz=16549, num_updates=55000, lr=0.00026968, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=54021 epoch 033: 1038 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=455569, ups=1.05, wpb=432873, bsz=16549, num_updates=55000, lr=0.00026968, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=54021 epoch 033: 1038 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=455569, ups=1.05, wpb=432873, bsz=16549, num_updates=55000, lr=0.00026968, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=54021 epoch 033: 1038 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=455569, ups=1.05, wpb=432873, bsz=16549, num_updates=55000, lr=0.00026968, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=54021 epoch 033: 1038 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=455569, ups=1.05, wpb=432873, bsz=16549, num_updates=55000, lr=0.00026968, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=54021 epoch 033: 1038 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=455569, ups=1.05, wpb=432873, bsz=16549, num_updates=55000, lr=0.00026968, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=54021 epoch 033: 1038 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=455569, ups=1.05, wpb=432873, bsz=16549, num_updates=55000, lr=0.00026968, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=54021 epoch 033: 1038 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=455569, ups=1.05, wpb=432873, bsz=16549, num_updates=55000, lr=0.00026968, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=94, gb_free=18.7, wall=54021 begin validation on "valid" subset epoch 033 | valid on 'valid' subset | loss 4.24 | nll_loss 2.624 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.238 epoch 033 | valid on 'valid' subset | loss 4.24 | nll_loss 2.624 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.238 epoch 033 | valid on 'valid' subset | loss 4.24 | nll_loss 2.624 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.238 epoch 033 | valid on 'valid' subset | loss 4.24 | nll_loss 2.624 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.238 epoch 033 | valid on 'valid' subset | loss 4.24 | nll_loss 2.624 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.238 epoch 033 | valid on 'valid' subset | loss 4.24 | nll_loss 2.624 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.238 epoch 033 | valid on 'valid' subset | loss 4.24 | nll_loss 2.624 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.238 epoch 033 | valid on 'valid' subset | loss 4.24 | nll_loss 2.624 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.238 epoch 033 | valid on 'valid' subset | loss 4.24 | nll_loss 2.624 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.238 epoch 033 | valid on 'valid' subset | loss 4.24 | nll_loss 2.624 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.238 epoch 033 | valid on 'valid' subset | loss 4.24 | nll_loss 2.624 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.238 epoch 033 | valid on 'valid' subset | loss 4.24 | nll_loss 2.624 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.238 epoch 033 | valid on 'valid' subset | loss 4.24 | nll_loss 2.624 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.238 epoch 033 | valid on 'valid' subset | loss 4.24 | nll_loss 2.624 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.238 epoch 033 | valid on 'valid' subset | loss 4.24 | nll_loss 2.624 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.238 epoch 033 | valid on 'valid' subset | loss 4.24 | nll_loss 2.624 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.238 epoch 033 | valid on 'valid' subset | loss 4.24 | nll_loss 2.624 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.238 epoch 033 | valid on 'valid' subset | loss 4.24 | nll_loss 2.624 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.238 epoch 033 | valid on 'valid' subset | loss 4.24 | nll_loss 2.624 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.238 epoch 033 | valid on 'valid' subset | loss 4.24 | nll_loss 2.624 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.238 epoch 033 | valid on 'valid' subset | loss 4.24 | nll_loss 2.624 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.238 epoch 033 | valid on 'valid' subset | loss 4.24 | nll_loss 2.624 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.238 epoch 033 | valid on 'valid' subset | loss 4.24 | nll_loss 2.624 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.238 epoch 033 | valid on 'valid' subset | loss 4.24 | nll_loss 2.624 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.238 epoch 033 | valid on 'valid' subset | loss 4.24 | nll_loss 2.624 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.238 epoch 033 | valid on 'valid' subset | loss 4.24 | nll_loss 2.624 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.238 epoch 033 | valid on 'valid' subset | loss 4.24 | nll_loss 2.624 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.238 epoch 033 | valid on 'valid' subset | loss 4.24 | nll_loss 2.624 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.238 epoch 033 | valid on 'valid' subset | loss 4.24 | nll_loss 2.624 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.238 epoch 033 | valid on 'valid' subset | loss 4.24 | nll_loss 2.624 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.238 epoch 033 | valid on 'valid' subset | loss 4.24 | nll_loss 2.624 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.238 epoch 033 | valid on 'valid' subset | loss 4.24 | nll_loss 2.624 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.238 epoch 033 | valid on 'valid' subset | loss 4.24 | nll_loss 2.624 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 55000 | best_loss 4.238 epoch 033: 1138 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=306087, ups=0.7, wpb=434225, bsz=16487.4, num_updates=55100, lr=0.000269435, gnorm=0.214, clip=0, loss_scale=1, train_wall=106, gb_free=18.9, wall=54163 epoch 033: 1138 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=306087, ups=0.7, wpb=434225, bsz=16487.4, num_updates=55100, lr=0.000269435, gnorm=0.214, clip=0, loss_scale=1, train_wall=106, gb_free=18.9, wall=54163 epoch 033: 1138 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=306087, ups=0.7, wpb=434225, bsz=16487.4, num_updates=55100, lr=0.000269435, gnorm=0.214, clip=0, loss_scale=1, train_wall=106, gb_free=18.9, wall=54163 epoch 033: 1138 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=306087, ups=0.7, wpb=434225, bsz=16487.4, num_updates=55100, lr=0.000269435, gnorm=0.214, clip=0, loss_scale=1, train_wall=106, gb_free=18.9, wall=54163 epoch 033: 1138 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=306087, ups=0.7, wpb=434225, bsz=16487.4, num_updates=55100, lr=0.000269435, gnorm=0.214, clip=0, loss_scale=1, train_wall=106, gb_free=18.9, wall=54163 epoch 033: 1138 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=306087, ups=0.7, wpb=434225, bsz=16487.4, num_updates=55100, lr=0.000269435, gnorm=0.214, clip=0, loss_scale=1, train_wall=106, gb_free=18.9, wall=54163 epoch 033: 1138 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=306087, ups=0.7, wpb=434225, bsz=16487.4, num_updates=55100, lr=0.000269435, gnorm=0.214, clip=0, loss_scale=1, train_wall=106, gb_free=18.9, wall=54163 epoch 033: 1138 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=306087, ups=0.7, wpb=434225, bsz=16487.4, num_updates=55100, lr=0.000269435, gnorm=0.214, clip=0, loss_scale=1, train_wall=106, gb_free=18.9, wall=54163 epoch 033: 1138 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=306087, ups=0.7, wpb=434225, bsz=16487.4, num_updates=55100, lr=0.000269435, gnorm=0.214, clip=0, loss_scale=1, train_wall=106, gb_free=18.9, wall=54163 epoch 033: 1138 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=306087, ups=0.7, wpb=434225, bsz=16487.4, num_updates=55100, lr=0.000269435, gnorm=0.214, clip=0, loss_scale=1, train_wall=106, gb_free=18.9, wall=54163 epoch 033: 1138 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=306087, ups=0.7, wpb=434225, bsz=16487.4, num_updates=55100, lr=0.000269435, gnorm=0.214, clip=0, loss_scale=1, train_wall=106, gb_free=18.9, wall=54163 epoch 033: 1138 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=306087, ups=0.7, wpb=434225, bsz=16487.4, num_updates=55100, lr=0.000269435, gnorm=0.214, clip=0, loss_scale=1, train_wall=106, gb_free=18.9, wall=54163 epoch 033: 1138 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=306087, ups=0.7, wpb=434225, bsz=16487.4, num_updates=55100, lr=0.000269435, gnorm=0.214, clip=0, loss_scale=1, train_wall=106, gb_free=18.9, wall=54163 epoch 033: 1138 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=306087, ups=0.7, wpb=434225, bsz=16487.4, num_updates=55100, lr=0.000269435, gnorm=0.214, clip=0, loss_scale=1, train_wall=106, gb_free=18.9, wall=54163 epoch 033: 1138 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=306087, ups=0.7, wpb=434225, bsz=16487.4, num_updates=55100, lr=0.000269435, gnorm=0.214, clip=0, loss_scale=1, train_wall=106, gb_free=18.9, wall=54163 epoch 033: 1138 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=306087, ups=0.7, wpb=434225, bsz=16487.4, num_updates=55100, lr=0.000269435, gnorm=0.214, clip=0, loss_scale=1, train_wall=106, gb_free=18.9, wall=54163 epoch 033: 1138 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=306087, ups=0.7, wpb=434225, bsz=16487.4, num_updates=55100, lr=0.000269435, gnorm=0.214, clip=0, loss_scale=1, train_wall=106, gb_free=18.9, wall=54163 epoch 033: 1138 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=306087, ups=0.7, wpb=434225, bsz=16487.4, num_updates=55100, lr=0.000269435, gnorm=0.214, clip=0, loss_scale=1, train_wall=106, gb_free=18.9, wall=54163 epoch 033: 1138 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=306087, ups=0.7, wpb=434225, bsz=16487.4, num_updates=55100, lr=0.000269435, gnorm=0.214, clip=0, loss_scale=1, train_wall=106, gb_free=18.9, wall=54163 epoch 033: 1138 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=306087, ups=0.7, wpb=434225, bsz=16487.4, num_updates=55100, lr=0.000269435, gnorm=0.214, clip=0, loss_scale=1, train_wall=106, gb_free=18.9, wall=54163 epoch 033: 1138 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=306087, ups=0.7, wpb=434225, bsz=16487.4, num_updates=55100, lr=0.000269435, gnorm=0.214, clip=0, loss_scale=1, train_wall=106, gb_free=18.9, wall=54163 epoch 033: 1138 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=306087, ups=0.7, wpb=434225, bsz=16487.4, num_updates=55100, lr=0.000269435, gnorm=0.214, clip=0, loss_scale=1, train_wall=106, gb_free=18.9, wall=54163 epoch 033: 1138 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=306087, ups=0.7, wpb=434225, bsz=16487.4, num_updates=55100, lr=0.000269435, gnorm=0.214, clip=0, loss_scale=1, train_wall=106, gb_free=18.9, wall=54163 epoch 033: 1138 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=306087, ups=0.7, wpb=434225, bsz=16487.4, num_updates=55100, lr=0.000269435, gnorm=0.214, clip=0, loss_scale=1, train_wall=106, gb_free=18.9, wall=54163 epoch 033: 1138 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=306087, ups=0.7, wpb=434225, bsz=16487.4, num_updates=55100, lr=0.000269435, gnorm=0.214, clip=0, loss_scale=1, train_wall=106, gb_free=18.9, wall=54163 epoch 033: 1138 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=306087, ups=0.7, wpb=434225, bsz=16487.4, num_updates=55100, lr=0.000269435, gnorm=0.214, clip=0, loss_scale=1, train_wall=106, gb_free=18.9, wall=54163 epoch 033: 1138 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=306087, ups=0.7, wpb=434225, bsz=16487.4, num_updates=55100, lr=0.000269435, gnorm=0.214, clip=0, loss_scale=1, train_wall=106, gb_free=18.9, wall=54163 epoch 033: 1138 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=306087, ups=0.7, wpb=434225, bsz=16487.4, num_updates=55100, lr=0.000269435, gnorm=0.214, clip=0, loss_scale=1, train_wall=106, gb_free=18.9, wall=54163 epoch 033: 1138 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=306087, ups=0.7, wpb=434225, bsz=16487.4, num_updates=55100, lr=0.000269435, gnorm=0.214, clip=0, loss_scale=1, train_wall=106, gb_free=18.9, wall=54163 epoch 033: 1138 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=306087, ups=0.7, wpb=434225, bsz=16487.4, num_updates=55100, lr=0.000269435, gnorm=0.214, clip=0, loss_scale=1, train_wall=106, gb_free=18.9, wall=54163 epoch 033: 1138 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=306087, ups=0.7, wpb=434225, bsz=16487.4, num_updates=55100, lr=0.000269435, gnorm=0.214, clip=0, loss_scale=1, train_wall=106, gb_free=18.9, wall=54163 epoch 033: 1138 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=306087, ups=0.7, wpb=434225, bsz=16487.4, num_updates=55100, lr=0.000269435, gnorm=0.214, clip=0, loss_scale=1, train_wall=106, gb_free=18.9, wall=54163 epoch 033: 1138 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=306087, ups=0.7, wpb=434225, bsz=16487.4, num_updates=55100, lr=0.000269435, gnorm=0.214, clip=0, loss_scale=1, train_wall=106, gb_free=18.9, wall=54163 epoch 033: 1239 / 1689 loss=3.957, nll_loss=2.316, ppl=4.98, wps=459738, ups=1.06, wpb=434186, bsz=17207, num_updates=55200, lr=0.000269191, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=54257 epoch 033: 1239 / 1689 loss=3.957, nll_loss=2.316, ppl=4.98, wps=459738, ups=1.06, wpb=434186, bsz=17207, num_updates=55200, lr=0.000269191, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=54257 epoch 033: 1239 / 1689 loss=3.957, nll_loss=2.316, ppl=4.98, wps=459738, ups=1.06, wpb=434186, bsz=17207, num_updates=55200, lr=0.000269191, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=54257 epoch 033: 1239 / 1689 loss=3.957, nll_loss=2.316, ppl=4.98, wps=459738, ups=1.06, wpb=434186, bsz=17207, num_updates=55200, lr=0.000269191, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=54257 epoch 033: 1239 / 1689 loss=3.957, nll_loss=2.316, ppl=4.98, wps=459738, ups=1.06, wpb=434186, bsz=17207, num_updates=55200, lr=0.000269191, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=54257 epoch 033: 1239 / 1689 loss=3.957, nll_loss=2.316, ppl=4.98, wps=459738, ups=1.06, wpb=434186, bsz=17207, num_updates=55200, lr=0.000269191, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=54257 epoch 033: 1239 / 1689 loss=3.957, nll_loss=2.316, ppl=4.98, wps=459738, ups=1.06, wpb=434186, bsz=17207, num_updates=55200, lr=0.000269191, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=54257 epoch 033: 1239 / 1689 loss=3.957, nll_loss=2.316, ppl=4.98, wps=459738, ups=1.06, wpb=434186, bsz=17207, num_updates=55200, lr=0.000269191, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=54257 epoch 033: 1239 / 1689 loss=3.957, nll_loss=2.316, ppl=4.98, wps=459738, ups=1.06, wpb=434186, bsz=17207, num_updates=55200, lr=0.000269191, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=54257 epoch 033: 1239 / 1689 loss=3.957, nll_loss=2.316, ppl=4.98, wps=459738, ups=1.06, wpb=434186, bsz=17207, num_updates=55200, lr=0.000269191, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=54257 epoch 033: 1239 / 1689 loss=3.957, nll_loss=2.316, ppl=4.98, wps=459738, ups=1.06, wpb=434186, bsz=17207, num_updates=55200, lr=0.000269191, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=54257 epoch 033: 1239 / 1689 loss=3.957, nll_loss=2.316, ppl=4.98, wps=459738, ups=1.06, wpb=434186, bsz=17207, num_updates=55200, lr=0.000269191, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=54257 epoch 033: 1239 / 1689 loss=3.957, nll_loss=2.316, ppl=4.98, wps=459738, ups=1.06, wpb=434186, bsz=17207, num_updates=55200, lr=0.000269191, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=54257 epoch 033: 1239 / 1689 loss=3.957, nll_loss=2.316, ppl=4.98, wps=459738, ups=1.06, wpb=434186, bsz=17207, num_updates=55200, lr=0.000269191, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=54257 epoch 033: 1239 / 1689 loss=3.957, nll_loss=2.316, ppl=4.98, wps=459738, ups=1.06, wpb=434186, bsz=17207, num_updates=55200, lr=0.000269191, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=54257 epoch 033: 1239 / 1689 loss=3.957, nll_loss=2.316, ppl=4.98, wps=459738, ups=1.06, wpb=434186, bsz=17207, num_updates=55200, lr=0.000269191, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=54257 epoch 033: 1239 / 1689 loss=3.957, nll_loss=2.316, ppl=4.98, wps=459738, ups=1.06, wpb=434186, bsz=17207, num_updates=55200, lr=0.000269191, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=54257 epoch 033: 1239 / 1689 loss=3.957, nll_loss=2.316, ppl=4.98, wps=459738, ups=1.06, wpb=434186, bsz=17207, num_updates=55200, lr=0.000269191, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=54257 epoch 033: 1239 / 1689 loss=3.957, nll_loss=2.316, ppl=4.98, wps=459738, ups=1.06, wpb=434186, bsz=17207, num_updates=55200, lr=0.000269191, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=54257 epoch 033: 1239 / 1689 loss=3.957, nll_loss=2.316, ppl=4.98, wps=459738, ups=1.06, wpb=434186, bsz=17207, num_updates=55200, lr=0.000269191, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=54257 epoch 033: 1239 / 1689 loss=3.957, nll_loss=2.316, ppl=4.98, wps=459738, ups=1.06, wpb=434186, bsz=17207, num_updates=55200, lr=0.000269191, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=54257 epoch 033: 1239 / 1689 loss=3.957, nll_loss=2.316, ppl=4.98, wps=459738, ups=1.06, wpb=434186, bsz=17207, num_updates=55200, lr=0.000269191, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=54257 epoch 033: 1239 / 1689 loss=3.957, nll_loss=2.316, ppl=4.98, wps=459738, ups=1.06, wpb=434186, bsz=17207, num_updates=55200, lr=0.000269191, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=54257 epoch 033: 1239 / 1689 loss=3.957, nll_loss=2.316, ppl=4.98, wps=459738, ups=1.06, wpb=434186, bsz=17207, num_updates=55200, lr=0.000269191, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=54257 epoch 033: 1239 / 1689 loss=3.957, nll_loss=2.316, ppl=4.98, wps=459738, ups=1.06, wpb=434186, bsz=17207, num_updates=55200, lr=0.000269191, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=54257 epoch 033: 1239 / 1689 loss=3.957, nll_loss=2.316, ppl=4.98, wps=459738, ups=1.06, wpb=434186, bsz=17207, num_updates=55200, lr=0.000269191, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=54257 epoch 033: 1239 / 1689 loss=3.957, nll_loss=2.316, ppl=4.98, wps=459738, ups=1.06, wpb=434186, bsz=17207, num_updates=55200, lr=0.000269191, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=54257 epoch 033: 1239 / 1689 loss=3.957, nll_loss=2.316, ppl=4.98, wps=459738, ups=1.06, wpb=434186, bsz=17207, num_updates=55200, lr=0.000269191, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=54257 epoch 033: 1239 / 1689 loss=3.957, nll_loss=2.316, ppl=4.98, wps=459738, ups=1.06, wpb=434186, bsz=17207, num_updates=55200, lr=0.000269191, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=54257 epoch 033: 1239 / 1689 loss=3.957, nll_loss=2.316, ppl=4.98, wps=459738, ups=1.06, wpb=434186, bsz=17207, num_updates=55200, lr=0.000269191, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=54257 epoch 033: 1239 / 1689 loss=3.957, nll_loss=2.316, ppl=4.98, wps=459738, ups=1.06, wpb=434186, bsz=17207, num_updates=55200, lr=0.000269191, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=54257 epoch 033: 1239 / 1689 loss=3.957, nll_loss=2.316, ppl=4.98, wps=459738, ups=1.06, wpb=434186, bsz=17207, num_updates=55200, lr=0.000269191, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=54257 epoch 033: 1239 / 1689 loss=3.957, nll_loss=2.316, ppl=4.98, wps=459738, ups=1.06, wpb=434186, bsz=17207, num_updates=55200, lr=0.000269191, gnorm=0.243, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.1, wall=54257 epoch 033: 1339 / 1689 loss=3.963, nll_loss=2.321, ppl=5, wps=464088, ups=1.07, wpb=432568, bsz=16342.6, num_updates=55300, lr=0.000268947, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=54350 epoch 033: 1339 / 1689 loss=3.963, nll_loss=2.321, ppl=5, wps=464088, ups=1.07, wpb=432568, bsz=16342.6, num_updates=55300, lr=0.000268947, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=54350 epoch 033: 1339 / 1689 loss=3.963, nll_loss=2.321, ppl=5, wps=464088, ups=1.07, wpb=432568, bsz=16342.6, num_updates=55300, lr=0.000268947, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=54350 epoch 033: 1339 / 1689 loss=3.963, nll_loss=2.321, ppl=5, wps=464088, ups=1.07, wpb=432568, bsz=16342.6, num_updates=55300, lr=0.000268947, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=54350 epoch 033: 1339 / 1689 loss=3.963, nll_loss=2.321, ppl=5, wps=464088, ups=1.07, wpb=432568, bsz=16342.6, num_updates=55300, lr=0.000268947, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=54350 epoch 033: 1339 / 1689 loss=3.963, nll_loss=2.321, ppl=5, wps=464088, ups=1.07, wpb=432568, bsz=16342.6, num_updates=55300, lr=0.000268947, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=54350 epoch 033: 1339 / 1689 loss=3.963, nll_loss=2.321, ppl=5, wps=464088, ups=1.07, wpb=432568, bsz=16342.6, num_updates=55300, lr=0.000268947, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=54350 epoch 033: 1339 / 1689 loss=3.963, nll_loss=2.321, ppl=5, wps=464088, ups=1.07, wpb=432568, bsz=16342.6, num_updates=55300, lr=0.000268947, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=54350 epoch 033: 1339 / 1689 loss=3.963, nll_loss=2.321, ppl=5, wps=464088, ups=1.07, wpb=432568, bsz=16342.6, num_updates=55300, lr=0.000268947, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=54350 epoch 033: 1339 / 1689 loss=3.963, nll_loss=2.321, ppl=5, wps=464088, ups=1.07, wpb=432568, bsz=16342.6, num_updates=55300, lr=0.000268947, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=54350 epoch 033: 1339 / 1689 loss=3.963, nll_loss=2.321, ppl=5, wps=464088, ups=1.07, wpb=432568, bsz=16342.6, num_updates=55300, lr=0.000268947, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=54350 epoch 033: 1339 / 1689 loss=3.963, nll_loss=2.321, ppl=5, wps=464088, ups=1.07, wpb=432568, bsz=16342.6, num_updates=55300, lr=0.000268947, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=54350 epoch 033: 1339 / 1689 loss=3.963, nll_loss=2.321, ppl=5, wps=464088, ups=1.07, wpb=432568, bsz=16342.6, num_updates=55300, lr=0.000268947, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=54350 epoch 033: 1339 / 1689 loss=3.963, nll_loss=2.321, ppl=5, wps=464088, ups=1.07, wpb=432568, bsz=16342.6, num_updates=55300, lr=0.000268947, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=54350 epoch 033: 1339 / 1689 loss=3.963, nll_loss=2.321, ppl=5, wps=464088, ups=1.07, wpb=432568, bsz=16342.6, num_updates=55300, lr=0.000268947, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=54350 epoch 033: 1339 / 1689 loss=3.963, nll_loss=2.321, ppl=5, wps=464088, ups=1.07, wpb=432568, bsz=16342.6, num_updates=55300, lr=0.000268947, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=54350 epoch 033: 1339 / 1689 loss=3.963, nll_loss=2.321, ppl=5, wps=464088, ups=1.07, wpb=432568, bsz=16342.6, num_updates=55300, lr=0.000268947, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=54350 epoch 033: 1339 / 1689 loss=3.963, nll_loss=2.321, ppl=5, wps=464088, ups=1.07, wpb=432568, bsz=16342.6, num_updates=55300, lr=0.000268947, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=54350 epoch 033: 1339 / 1689 loss=3.963, nll_loss=2.321, ppl=5, wps=464088, ups=1.07, wpb=432568, bsz=16342.6, num_updates=55300, lr=0.000268947, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=54350 epoch 033: 1339 / 1689 loss=3.963, nll_loss=2.321, ppl=5, wps=464088, ups=1.07, wpb=432568, bsz=16342.6, num_updates=55300, lr=0.000268947, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=54350 epoch 033: 1339 / 1689 loss=3.963, nll_loss=2.321, ppl=5, wps=464088, ups=1.07, wpb=432568, bsz=16342.6, num_updates=55300, lr=0.000268947, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=54350 epoch 033: 1339 / 1689 loss=3.963, nll_loss=2.321, ppl=5, wps=464088, ups=1.07, wpb=432568, bsz=16342.6, num_updates=55300, lr=0.000268947, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=54350 epoch 033: 1339 / 1689 loss=3.963, nll_loss=2.321, ppl=5, wps=464088, ups=1.07, wpb=432568, bsz=16342.6, num_updates=55300, lr=0.000268947, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=54350 epoch 033: 1339 / 1689 loss=3.963, nll_loss=2.321, ppl=5, wps=464088, ups=1.07, wpb=432568, bsz=16342.6, num_updates=55300, lr=0.000268947, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=54350 epoch 033: 1339 / 1689 loss=3.963, nll_loss=2.321, ppl=5, wps=464088, ups=1.07, wpb=432568, bsz=16342.6, num_updates=55300, lr=0.000268947, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=54350 epoch 033: 1339 / 1689 loss=3.963, nll_loss=2.321, ppl=5, wps=464088, ups=1.07, wpb=432568, bsz=16342.6, num_updates=55300, lr=0.000268947, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=54350 epoch 033: 1339 / 1689 loss=3.963, nll_loss=2.321, ppl=5, wps=464088, ups=1.07, wpb=432568, bsz=16342.6, num_updates=55300, lr=0.000268947, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=54350 epoch 033: 1339 / 1689 loss=3.963, nll_loss=2.321, ppl=5, wps=464088, ups=1.07, wpb=432568, bsz=16342.6, num_updates=55300, lr=0.000268947, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=54350 epoch 033: 1339 / 1689 loss=3.963, nll_loss=2.321, ppl=5, wps=464088, ups=1.07, wpb=432568, bsz=16342.6, num_updates=55300, lr=0.000268947, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=54350 epoch 033: 1339 / 1689 loss=3.963, nll_loss=2.321, ppl=5, wps=464088, ups=1.07, wpb=432568, bsz=16342.6, num_updates=55300, lr=0.000268947, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=54350 epoch 033: 1339 / 1689 loss=3.963, nll_loss=2.321, ppl=5, wps=464088, ups=1.07, wpb=432568, bsz=16342.6, num_updates=55300, lr=0.000268947, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=54350 epoch 033: 1339 / 1689 loss=3.963, nll_loss=2.321, ppl=5, wps=464088, ups=1.07, wpb=432568, bsz=16342.6, num_updates=55300, lr=0.000268947, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=54350 epoch 033: 1339 / 1689 loss=3.963, nll_loss=2.321, ppl=5, wps=464088, ups=1.07, wpb=432568, bsz=16342.6, num_updates=55300, lr=0.000268947, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=92, gb_free=18.8, wall=54350 epoch 033: 1439 / 1689 loss=3.964, nll_loss=2.323, ppl=5, wps=461743, ups=1.06, wpb=433564, bsz=16807, num_updates=55400, lr=0.000268705, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=54444 epoch 033: 1439 / 1689 loss=3.964, nll_loss=2.323, ppl=5, wps=461743, ups=1.06, wpb=433564, bsz=16807, num_updates=55400, lr=0.000268705, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=54444 epoch 033: 1439 / 1689 loss=3.964, nll_loss=2.323, ppl=5, wps=461743, ups=1.06, wpb=433564, bsz=16807, num_updates=55400, lr=0.000268705, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=54444 epoch 033: 1439 / 1689 loss=3.964, nll_loss=2.323, ppl=5, wps=461743, ups=1.06, wpb=433564, bsz=16807, num_updates=55400, lr=0.000268705, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=54444 epoch 033: 1439 / 1689 loss=3.964, nll_loss=2.323, ppl=5, wps=461743, ups=1.06, wpb=433564, bsz=16807, num_updates=55400, lr=0.000268705, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=54444 epoch 033: 1439 / 1689 loss=3.964, nll_loss=2.323, ppl=5, wps=461743, ups=1.06, wpb=433564, bsz=16807, num_updates=55400, lr=0.000268705, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=54444 epoch 033: 1439 / 1689 loss=3.964, nll_loss=2.323, ppl=5, wps=461743, ups=1.06, wpb=433564, bsz=16807, num_updates=55400, lr=0.000268705, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=54444 epoch 033: 1439 / 1689 loss=3.964, nll_loss=2.323, ppl=5, wps=461743, ups=1.06, wpb=433564, bsz=16807, num_updates=55400, lr=0.000268705, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=54444 epoch 033: 1439 / 1689 loss=3.964, nll_loss=2.323, ppl=5, wps=461743, ups=1.06, wpb=433564, bsz=16807, num_updates=55400, lr=0.000268705, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=54444 epoch 033: 1439 / 1689 loss=3.964, nll_loss=2.323, ppl=5, wps=461743, ups=1.06, wpb=433564, bsz=16807, num_updates=55400, lr=0.000268705, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=54444 epoch 033: 1439 / 1689 loss=3.964, nll_loss=2.323, ppl=5, wps=461743, ups=1.06, wpb=433564, bsz=16807, num_updates=55400, lr=0.000268705, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=54444 epoch 033: 1439 / 1689 loss=3.964, nll_loss=2.323, ppl=5, wps=461743, ups=1.06, wpb=433564, bsz=16807, num_updates=55400, lr=0.000268705, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=54444 epoch 033: 1439 / 1689 loss=3.964, nll_loss=2.323, ppl=5, wps=461743, ups=1.06, wpb=433564, bsz=16807, num_updates=55400, lr=0.000268705, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=54444 epoch 033: 1439 / 1689 loss=3.964, nll_loss=2.323, ppl=5, wps=461743, ups=1.06, wpb=433564, bsz=16807, num_updates=55400, lr=0.000268705, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=54444 epoch 033: 1439 / 1689 loss=3.964, nll_loss=2.323, ppl=5, wps=461743, ups=1.06, wpb=433564, bsz=16807, num_updates=55400, lr=0.000268705, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=54444 epoch 033: 1439 / 1689 loss=3.964, nll_loss=2.323, ppl=5, wps=461743, ups=1.06, wpb=433564, bsz=16807, num_updates=55400, lr=0.000268705, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=54444 epoch 033: 1439 / 1689 loss=3.964, nll_loss=2.323, ppl=5, wps=461743, ups=1.06, wpb=433564, bsz=16807, num_updates=55400, lr=0.000268705, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=54444 epoch 033: 1439 / 1689 loss=3.964, nll_loss=2.323, ppl=5, wps=461743, ups=1.06, wpb=433564, bsz=16807, num_updates=55400, lr=0.000268705, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=54444 epoch 033: 1439 / 1689 loss=3.964, nll_loss=2.323, ppl=5, wps=461743, ups=1.06, wpb=433564, bsz=16807, num_updates=55400, lr=0.000268705, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=54444 epoch 033: 1439 / 1689 loss=3.964, nll_loss=2.323, ppl=5, wps=461743, ups=1.06, wpb=433564, bsz=16807, num_updates=55400, lr=0.000268705, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=54444 epoch 033: 1439 / 1689 loss=3.964, nll_loss=2.323, ppl=5, wps=461743, ups=1.06, wpb=433564, bsz=16807, num_updates=55400, lr=0.000268705, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=54444 epoch 033: 1439 / 1689 loss=3.964, nll_loss=2.323, ppl=5, wps=461743, ups=1.06, wpb=433564, bsz=16807, num_updates=55400, lr=0.000268705, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=54444 epoch 033: 1439 / 1689 loss=3.964, nll_loss=2.323, ppl=5, wps=461743, ups=1.06, wpb=433564, bsz=16807, num_updates=55400, lr=0.000268705, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=54444 epoch 033: 1439 / 1689 loss=3.964, nll_loss=2.323, ppl=5, wps=461743, ups=1.06, wpb=433564, bsz=16807, num_updates=55400, lr=0.000268705, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=54444 epoch 033: 1439 / 1689 loss=3.964, nll_loss=2.323, ppl=5, wps=461743, ups=1.06, wpb=433564, bsz=16807, num_updates=55400, lr=0.000268705, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=54444 epoch 033: 1439 / 1689 loss=3.964, nll_loss=2.323, ppl=5, wps=461743, ups=1.06, wpb=433564, bsz=16807, num_updates=55400, lr=0.000268705, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=54444 epoch 033: 1439 / 1689 loss=3.964, nll_loss=2.323, ppl=5, wps=461743, ups=1.06, wpb=433564, bsz=16807, num_updates=55400, lr=0.000268705, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=54444 epoch 033: 1439 / 1689 loss=3.964, nll_loss=2.323, ppl=5, wps=461743, ups=1.06, wpb=433564, bsz=16807, num_updates=55400, lr=0.000268705, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=54444 epoch 033: 1439 / 1689 loss=3.964, nll_loss=2.323, ppl=5, wps=461743, ups=1.06, wpb=433564, bsz=16807, num_updates=55400, lr=0.000268705, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=54444 epoch 033: 1439 / 1689 loss=3.964, nll_loss=2.323, ppl=5, wps=461743, ups=1.06, wpb=433564, bsz=16807, num_updates=55400, lr=0.000268705, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=54444 epoch 033: 1439 / 1689 loss=3.964, nll_loss=2.323, ppl=5, wps=461743, ups=1.06, wpb=433564, bsz=16807, num_updates=55400, lr=0.000268705, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=54444 epoch 033: 1439 / 1689 loss=3.964, nll_loss=2.323, ppl=5, wps=461743, ups=1.06, wpb=433564, bsz=16807, num_updates=55400, lr=0.000268705, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=54444 epoch 033: 1439 / 1689 loss=3.964, nll_loss=2.323, ppl=5, wps=461743, ups=1.06, wpb=433564, bsz=16807, num_updates=55400, lr=0.000268705, gnorm=0.221, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.5, wall=54444 epoch 033: 1539 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=464029, ups=1.07, wpb=434586, bsz=16666.3, num_updates=55500, lr=0.000268462, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=54538 epoch 033: 1539 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=464029, ups=1.07, wpb=434586, bsz=16666.3, num_updates=55500, lr=0.000268462, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=54538 epoch 033: 1539 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=464029, ups=1.07, wpb=434586, bsz=16666.3, num_updates=55500, lr=0.000268462, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=54538 epoch 033: 1539 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=464029, ups=1.07, wpb=434586, bsz=16666.3, num_updates=55500, lr=0.000268462, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=54538 epoch 033: 1539 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=464029, ups=1.07, wpb=434586, bsz=16666.3, num_updates=55500, lr=0.000268462, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=54538 epoch 033: 1539 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=464029, ups=1.07, wpb=434586, bsz=16666.3, num_updates=55500, lr=0.000268462, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=54538 epoch 033: 1539 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=464029, ups=1.07, wpb=434586, bsz=16666.3, num_updates=55500, lr=0.000268462, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=54538 epoch 033: 1539 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=464029, ups=1.07, wpb=434586, bsz=16666.3, num_updates=55500, lr=0.000268462, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=54538 epoch 033: 1539 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=464029, ups=1.07, wpb=434586, bsz=16666.3, num_updates=55500, lr=0.000268462, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=54538 epoch 033: 1539 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=464029, ups=1.07, wpb=434586, bsz=16666.3, num_updates=55500, lr=0.000268462, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=54538 epoch 033: 1539 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=464029, ups=1.07, wpb=434586, bsz=16666.3, num_updates=55500, lr=0.000268462, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=54538 epoch 033: 1539 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=464029, ups=1.07, wpb=434586, bsz=16666.3, num_updates=55500, lr=0.000268462, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=54538 epoch 033: 1539 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=464029, ups=1.07, wpb=434586, bsz=16666.3, num_updates=55500, lr=0.000268462, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=54538 epoch 033: 1539 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=464029, ups=1.07, wpb=434586, bsz=16666.3, num_updates=55500, lr=0.000268462, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=54538 epoch 033: 1539 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=464029, ups=1.07, wpb=434586, bsz=16666.3, num_updates=55500, lr=0.000268462, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=54538 epoch 033: 1539 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=464029, ups=1.07, wpb=434586, bsz=16666.3, num_updates=55500, lr=0.000268462, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=54538 epoch 033: 1539 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=464029, ups=1.07, wpb=434586, bsz=16666.3, num_updates=55500, lr=0.000268462, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=54538 epoch 033: 1539 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=464029, ups=1.07, wpb=434586, bsz=16666.3, num_updates=55500, lr=0.000268462, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=54538 epoch 033: 1539 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=464029, ups=1.07, wpb=434586, bsz=16666.3, num_updates=55500, lr=0.000268462, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=54538 epoch 033: 1539 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=464029, ups=1.07, wpb=434586, bsz=16666.3, num_updates=55500, lr=0.000268462, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=54538 epoch 033: 1539 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=464029, ups=1.07, wpb=434586, bsz=16666.3, num_updates=55500, lr=0.000268462, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=54538 epoch 033: 1539 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=464029, ups=1.07, wpb=434586, bsz=16666.3, num_updates=55500, lr=0.000268462, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=54538 epoch 033: 1539 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=464029, ups=1.07, wpb=434586, bsz=16666.3, num_updates=55500, lr=0.000268462, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=54538 epoch 033: 1539 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=464029, ups=1.07, wpb=434586, bsz=16666.3, num_updates=55500, lr=0.000268462, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=54538 epoch 033: 1539 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=464029, ups=1.07, wpb=434586, bsz=16666.3, num_updates=55500, lr=0.000268462, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=54538 epoch 033: 1539 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=464029, ups=1.07, wpb=434586, bsz=16666.3, num_updates=55500, lr=0.000268462, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=54538 epoch 033: 1539 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=464029, ups=1.07, wpb=434586, bsz=16666.3, num_updates=55500, lr=0.000268462, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=54538 epoch 033: 1539 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=464029, ups=1.07, wpb=434586, bsz=16666.3, num_updates=55500, lr=0.000268462, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=54538 epoch 033: 1539 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=464029, ups=1.07, wpb=434586, bsz=16666.3, num_updates=55500, lr=0.000268462, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=54538 epoch 033: 1539 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=464029, ups=1.07, wpb=434586, bsz=16666.3, num_updates=55500, lr=0.000268462, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=54538 epoch 033: 1539 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=464029, ups=1.07, wpb=434586, bsz=16666.3, num_updates=55500, lr=0.000268462, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=54538 epoch 033: 1539 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=464029, ups=1.07, wpb=434586, bsz=16666.3, num_updates=55500, lr=0.000268462, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=54538 epoch 033: 1539 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=464029, ups=1.07, wpb=434586, bsz=16666.3, num_updates=55500, lr=0.000268462, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=54538 epoch 033: 1640 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=457354, ups=1.05, wpb=435543, bsz=16042.6, num_updates=55600, lr=0.000268221, gnorm=0.197, clip=0, loss_scale=0.25, train_wall=94, gb_free=19.2, wall=54633 epoch 033: 1640 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=457354, ups=1.05, wpb=435543, bsz=16042.6, num_updates=55600, lr=0.000268221, gnorm=0.197, clip=0, loss_scale=0.25, train_wall=94, gb_free=19.2, wall=54633 epoch 033: 1640 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=457354, ups=1.05, wpb=435543, bsz=16042.6, num_updates=55600, lr=0.000268221, gnorm=0.197, clip=0, loss_scale=0.25, train_wall=94, gb_free=19.2, wall=54633 epoch 033: 1640 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=457354, ups=1.05, wpb=435543, bsz=16042.6, num_updates=55600, lr=0.000268221, gnorm=0.197, clip=0, loss_scale=0.25, train_wall=94, gb_free=19.2, wall=54633 epoch 033: 1640 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=457354, ups=1.05, wpb=435543, bsz=16042.6, num_updates=55600, lr=0.000268221, gnorm=0.197, clip=0, loss_scale=0.25, train_wall=94, gb_free=19.2, wall=54633 epoch 033: 1640 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=457354, ups=1.05, wpb=435543, bsz=16042.6, num_updates=55600, lr=0.000268221, gnorm=0.197, clip=0, loss_scale=0.25, train_wall=94, gb_free=19.2, wall=54633 epoch 033: 1640 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=457354, ups=1.05, wpb=435543, bsz=16042.6, num_updates=55600, lr=0.000268221, gnorm=0.197, clip=0, loss_scale=0.25, train_wall=94, gb_free=19.2, wall=54633 epoch 033: 1640 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=457354, ups=1.05, wpb=435543, bsz=16042.6, num_updates=55600, lr=0.000268221, gnorm=0.197, clip=0, loss_scale=0.25, train_wall=94, gb_free=19.2, wall=54633 epoch 033: 1640 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=457354, ups=1.05, wpb=435543, bsz=16042.6, num_updates=55600, lr=0.000268221, gnorm=0.197, clip=0, loss_scale=0.25, train_wall=94, gb_free=19.2, wall=54633 epoch 033: 1640 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=457354, ups=1.05, wpb=435543, bsz=16042.6, num_updates=55600, lr=0.000268221, gnorm=0.197, clip=0, loss_scale=0.25, train_wall=94, gb_free=19.2, wall=54633 epoch 033: 1640 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=457354, ups=1.05, wpb=435543, bsz=16042.6, num_updates=55600, lr=0.000268221, gnorm=0.197, clip=0, loss_scale=0.25, train_wall=94, gb_free=19.2, wall=54633 epoch 033: 1640 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=457354, ups=1.05, wpb=435543, bsz=16042.6, num_updates=55600, lr=0.000268221, gnorm=0.197, clip=0, loss_scale=0.25, train_wall=94, gb_free=19.2, wall=54633 epoch 033: 1640 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=457354, ups=1.05, wpb=435543, bsz=16042.6, num_updates=55600, lr=0.000268221, gnorm=0.197, clip=0, loss_scale=0.25, train_wall=94, gb_free=19.2, wall=54633 epoch 033: 1640 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=457354, ups=1.05, wpb=435543, bsz=16042.6, num_updates=55600, lr=0.000268221, gnorm=0.197, clip=0, loss_scale=0.25, train_wall=94, gb_free=19.2, wall=54633 epoch 033: 1640 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=457354, ups=1.05, wpb=435543, bsz=16042.6, num_updates=55600, lr=0.000268221, gnorm=0.197, clip=0, loss_scale=0.25, train_wall=94, gb_free=19.2, wall=54633 epoch 033: 1640 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=457354, ups=1.05, wpb=435543, bsz=16042.6, num_updates=55600, lr=0.000268221, gnorm=0.197, clip=0, loss_scale=0.25, train_wall=94, gb_free=19.2, wall=54633 epoch 033: 1640 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=457354, ups=1.05, wpb=435543, bsz=16042.6, num_updates=55600, lr=0.000268221, gnorm=0.197, clip=0, loss_scale=0.25, train_wall=94, gb_free=19.2, wall=54633 epoch 033: 1640 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=457354, ups=1.05, wpb=435543, bsz=16042.6, num_updates=55600, lr=0.000268221, gnorm=0.197, clip=0, loss_scale=0.25, train_wall=94, gb_free=19.2, wall=54633 epoch 033: 1640 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=457354, ups=1.05, wpb=435543, bsz=16042.6, num_updates=55600, lr=0.000268221, gnorm=0.197, clip=0, loss_scale=0.25, train_wall=94, gb_free=19.2, wall=54633 epoch 033: 1640 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=457354, ups=1.05, wpb=435543, bsz=16042.6, num_updates=55600, lr=0.000268221, gnorm=0.197, clip=0, loss_scale=0.25, train_wall=94, gb_free=19.2, wall=54633 epoch 033: 1640 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=457354, ups=1.05, wpb=435543, bsz=16042.6, num_updates=55600, lr=0.000268221, gnorm=0.197, clip=0, loss_scale=0.25, train_wall=94, gb_free=19.2, wall=54633 epoch 033: 1640 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=457354, ups=1.05, wpb=435543, bsz=16042.6, num_updates=55600, lr=0.000268221, gnorm=0.197, clip=0, loss_scale=0.25, train_wall=94, gb_free=19.2, wall=54633 epoch 033: 1640 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=457354, ups=1.05, wpb=435543, bsz=16042.6, num_updates=55600, lr=0.000268221, gnorm=0.197, clip=0, loss_scale=0.25, train_wall=94, gb_free=19.2, wall=54633 epoch 033: 1640 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=457354, ups=1.05, wpb=435543, bsz=16042.6, num_updates=55600, lr=0.000268221, gnorm=0.197, clip=0, loss_scale=0.25, train_wall=94, gb_free=19.2, wall=54633 epoch 033: 1640 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=457354, ups=1.05, wpb=435543, bsz=16042.6, num_updates=55600, lr=0.000268221, gnorm=0.197, clip=0, loss_scale=0.25, train_wall=94, gb_free=19.2, wall=54633 epoch 033: 1640 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=457354, ups=1.05, wpb=435543, bsz=16042.6, num_updates=55600, lr=0.000268221, gnorm=0.197, clip=0, loss_scale=0.25, train_wall=94, gb_free=19.2, wall=54633 epoch 033: 1640 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=457354, ups=1.05, wpb=435543, bsz=16042.6, num_updates=55600, lr=0.000268221, gnorm=0.197, clip=0, loss_scale=0.25, train_wall=94, gb_free=19.2, wall=54633 epoch 033: 1640 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=457354, ups=1.05, wpb=435543, bsz=16042.6, num_updates=55600, lr=0.000268221, gnorm=0.197, clip=0, loss_scale=0.25, train_wall=94, gb_free=19.2, wall=54633 epoch 033: 1640 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=457354, ups=1.05, wpb=435543, bsz=16042.6, num_updates=55600, lr=0.000268221, gnorm=0.197, clip=0, loss_scale=0.25, train_wall=94, gb_free=19.2, wall=54633 epoch 033: 1640 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=457354, ups=1.05, wpb=435543, bsz=16042.6, num_updates=55600, lr=0.000268221, gnorm=0.197, clip=0, loss_scale=0.25, train_wall=94, gb_free=19.2, wall=54633 epoch 033: 1640 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=457354, ups=1.05, wpb=435543, bsz=16042.6, num_updates=55600, lr=0.000268221, gnorm=0.197, clip=0, loss_scale=0.25, train_wall=94, gb_free=19.2, wall=54633 epoch 033: 1640 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=457354, ups=1.05, wpb=435543, bsz=16042.6, num_updates=55600, lr=0.000268221, gnorm=0.197, clip=0, loss_scale=0.25, train_wall=94, gb_free=19.2, wall=54633 epoch 033: 1640 / 1689 loss=3.965, nll_loss=2.324, ppl=5.01, wps=457354, ups=1.05, wpb=435543, bsz=16042.6, num_updates=55600, lr=0.000268221, gnorm=0.197, clip=0, loss_scale=0.25, train_wall=94, gb_free=19.2, wall=54633 end of epoch 33 (average epoch stats below) epoch 033 | loss 3.948 | nll_loss 2.305 | ppl 4.94 | wps 440169 | ups 1.02 | wpb 433558 | bsz 16504.6 | num_updates 55649 | lr 0.000268103 | gnorm 0.216 | clip 0 | loss_scale 0.25 | train_wall 1588 | gb_free 21.6 | wall 54678 epoch 033 | loss 3.948 | nll_loss 2.305 | ppl 4.94 | wps 440169 | ups 1.02 | wpb 433558 | bsz 16504.6 | num_updates 55649 | lr 0.000268103 | gnorm 0.216 | clip 0 | loss_scale 0.25 | train_wall 1588 | gb_free 21.6 | wall 54678 epoch 033 | loss 3.948 | nll_loss 2.305 | ppl 4.94 | wps 440169 | ups 1.02 | wpb 433558 | bsz 16504.6 | num_updates 55649 | lr 0.000268103 | gnorm 0.216 | clip 0 | loss_scale 0.25 | train_wall 1588 | gb_free 21.6 | wall 54678 epoch 033 | loss 3.948 | nll_loss 2.305 | ppl 4.94 | wps 440169 | ups 1.02 | wpb 433558 | bsz 16504.6 | num_updates 55649 | lr 0.000268103 | gnorm 0.216 | clip 0 | loss_scale 0.25 | train_wall 1588 | gb_free 21.6 | wall 54678 epoch 033 | loss 3.948 | nll_loss 2.305 | ppl 4.94 | wps 440169 | ups 1.02 | wpb 433558 | bsz 16504.6 | num_updates 55649 | lr 0.000268103 | gnorm 0.216 | clip 0 | loss_scale 0.25 | train_wall 1588 | gb_free 21.6 | wall 54678 epoch 033 | loss 3.948 | nll_loss 2.305 | ppl 4.94 | wps 440169 | ups 1.02 | wpb 433558 | bsz 16504.6 | num_updates 55649 | lr 0.000268103 | gnorm 0.216 | clip 0 | loss_scale 0.25 | train_wall 1588 | gb_free 21.6 | wall 54678 epoch 033 | loss 3.948 | nll_loss 2.305 | ppl 4.94 | wps 440169 | ups 1.02 | wpb 433558 | bsz 16504.6 | num_updates 55649 | lr 0.000268103 | gnorm 0.216 | clip 0 | loss_scale 0.25 | train_wall 1588 | gb_free 21.6 | wall 54678 epoch 033 | loss 3.948 | nll_loss 2.305 | ppl 4.94 | wps 440169 | ups 1.02 | wpb 433558 | bsz 16504.6 | num_updates 55649 | lr 0.000268103 | gnorm 0.216 | clip 0 | loss_scale 0.25 | train_wall 1588 | gb_free 21.6 | wall 54678 epoch 033 | loss 3.948 | nll_loss 2.305 | ppl 4.94 | wps 440169 | ups 1.02 | wpb 433558 | bsz 16504.6 | num_updates 55649 | lr 0.000268103 | gnorm 0.216 | clip 0 | loss_scale 0.25 | train_wall 1588 | gb_free 21.6 | wall 54678 epoch 033 | loss 3.948 | nll_loss 2.305 | ppl 4.94 | wps 440169 | ups 1.02 | wpb 433558 | bsz 16504.6 | num_updates 55649 | lr 0.000268103 | gnorm 0.216 | clip 0 | loss_scale 0.25 | train_wall 1588 | gb_free 21.6 | wall 54678 epoch 033 | loss 3.948 | nll_loss 2.305 | ppl 4.94 | wps 440169 | ups 1.02 | wpb 433558 | bsz 16504.6 | num_updates 55649 | lr 0.000268103 | gnorm 0.216 | clip 0 | loss_scale 0.25 | train_wall 1588 | gb_free 21.6 | wall 54678 epoch 033 | loss 3.948 | nll_loss 2.305 | ppl 4.94 | wps 440169 | ups 1.02 | wpb 433558 | bsz 16504.6 | num_updates 55649 | lr 0.000268103 | gnorm 0.216 | clip 0 | loss_scale 0.25 | train_wall 1588 | gb_free 21.6 | wall 54678 epoch 033 | loss 3.948 | nll_loss 2.305 | ppl 4.94 | wps 440169 | ups 1.02 | wpb 433558 | bsz 16504.6 | num_updates 55649 | lr 0.000268103 | gnorm 0.216 | clip 0 | loss_scale 0.25 | train_wall 1588 | gb_free 21.6 | wall 54678 epoch 033 | loss 3.948 | nll_loss 2.305 | ppl 4.94 | wps 440169 | ups 1.02 | wpb 433558 | bsz 16504.6 | num_updates 55649 | lr 0.000268103 | gnorm 0.216 | clip 0 | loss_scale 0.25 | train_wall 1588 | gb_free 21.6 | wall 54678 epoch 033 | loss 3.948 | nll_loss 2.305 | ppl 4.94 | wps 440169 | ups 1.02 | wpb 433558 | bsz 16504.6 | num_updates 55649 | lr 0.000268103 | gnorm 0.216 | clip 0 | loss_scale 0.25 | train_wall 1588 | gb_free 21.6 | wall 54678 epoch 033 | loss 3.948 | nll_loss 2.305 | ppl 4.94 | wps 440169 | ups 1.02 | wpb 433558 | bsz 16504.6 | num_updates 55649 | lr 0.000268103 | gnorm 0.216 | clip 0 | loss_scale 0.25 | train_wall 1588 | gb_free 21.6 | wall 54678 epoch 033 | loss 3.948 | nll_loss 2.305 | ppl 4.94 | wps 440169 | ups 1.02 | wpb 433558 | bsz 16504.6 | num_updates 55649 | lr 0.000268103 | gnorm 0.216 | clip 0 | loss_scale 0.25 | train_wall 1588 | gb_free 21.6 | wall 54678 epoch 033 | loss 3.948 | nll_loss 2.305 | ppl 4.94 | wps 440169 | ups 1.02 | wpb 433558 | bsz 16504.6 | num_updates 55649 | lr 0.000268103 | gnorm 0.216 | clip 0 | loss_scale 0.25 | train_wall 1588 | gb_free 21.6 | wall 54678 epoch 033 | loss 3.948 | nll_loss 2.305 | ppl 4.94 | wps 440169 | ups 1.02 | wpb 433558 | bsz 16504.6 | num_updates 55649 | lr 0.000268103 | gnorm 0.216 | clip 0 | loss_scale 0.25 | train_wall 1588 | gb_free 21.6 | wall 54678 epoch 033 | loss 3.948 | nll_loss 2.305 | ppl 4.94 | wps 440169 | ups 1.02 | wpb 433558 | bsz 16504.6 | num_updates 55649 | lr 0.000268103 | gnorm 0.216 | clip 0 | loss_scale 0.25 | train_wall 1588 | gb_free 21.6 | wall 54678 epoch 033 | loss 3.948 | nll_loss 2.305 | ppl 4.94 | wps 440169 | ups 1.02 | wpb 433558 | bsz 16504.6 | num_updates 55649 | lr 0.000268103 | gnorm 0.216 | clip 0 | loss_scale 0.25 | train_wall 1588 | gb_free 21.6 | wall 54678 epoch 033 | loss 3.948 | nll_loss 2.305 | ppl 4.94 | wps 440169 | ups 1.02 | wpb 433558 | bsz 16504.6 | num_updates 55649 | lr 0.000268103 | gnorm 0.216 | clip 0 | loss_scale 0.25 | train_wall 1588 | gb_free 21.6 | wall 54678 epoch 033 | loss 3.948 | nll_loss 2.305 | ppl 4.94 | wps 440169 | ups 1.02 | wpb 433558 | bsz 16504.6 | num_updates 55649 | lr 0.000268103 | gnorm 0.216 | clip 0 | loss_scale 0.25 | train_wall 1588 | gb_free 21.6 | wall 54678 epoch 033 | loss 3.948 | nll_loss 2.305 | ppl 4.94 | wps 440169 | ups 1.02 | wpb 433558 | bsz 16504.6 | num_updates 55649 | lr 0.000268103 | gnorm 0.216 | clip 0 | loss_scale 0.25 | train_wall 1588 | gb_free 21.6 | wall 54678 epoch 033 | loss 3.948 | nll_loss 2.305 | ppl 4.94 | wps 440169 | ups 1.02 | wpb 433558 | bsz 16504.6 | num_updates 55649 | lr 0.000268103 | gnorm 0.216 | clip 0 | loss_scale 0.25 | train_wall 1588 | gb_free 21.6 | wall 54678 epoch 033 | loss 3.948 | nll_loss 2.305 | ppl 4.94 | wps 440169 | ups 1.02 | wpb 433558 | bsz 16504.6 | num_updates 55649 | lr 0.000268103 | gnorm 0.216 | clip 0 | loss_scale 0.25 | train_wall 1588 | gb_free 21.6 | wall 54678 epoch 033 | loss 3.948 | nll_loss 2.305 | ppl 4.94 | wps 440169 | ups 1.02 | wpb 433558 | bsz 16504.6 | num_updates 55649 | lr 0.000268103 | gnorm 0.216 | clip 0 | loss_scale 0.25 | train_wall 1588 | gb_free 21.6 | wall 54678 epoch 033 | loss 3.948 | nll_loss 2.305 | ppl 4.94 | wps 440169 | ups 1.02 | wpb 433558 | bsz 16504.6 | num_updates 55649 | lr 0.000268103 | gnorm 0.216 | clip 0 | loss_scale 0.25 | train_wall 1588 | gb_free 21.6 | wall 54678 epoch 033 | loss 3.948 | nll_loss 2.305 | ppl 4.94 | wps 440169 | ups 1.02 | wpb 433558 | bsz 16504.6 | num_updates 55649 | lr 0.000268103 | gnorm 0.216 | clip 0 | loss_scale 0.25 | train_wall 1588 | gb_free 21.6 | wall 54678 epoch 033 | loss 3.948 | nll_loss 2.305 | ppl 4.94 | wps 440169 | ups 1.02 | wpb 433558 | bsz 16504.6 | num_updates 55649 | lr 0.000268103 | gnorm 0.216 | clip 0 | loss_scale 0.25 | train_wall 1588 | gb_free 21.6 | wall 54678 epoch 033 | loss 3.948 | nll_loss 2.305 | ppl 4.94 | wps 440169 | ups 1.02 | wpb 433558 | bsz 16504.6 | num_updates 55649 | lr 0.000268103 | gnorm 0.216 | clip 0 | loss_scale 0.25 | train_wall 1588 | gb_free 21.6 | wall 54678 epoch 033 | loss 3.948 | nll_loss 2.305 | ppl 4.94 | wps 440169 | ups 1.02 | wpb 433558 | bsz 16504.6 | num_updates 55649 | lr 0.000268103 | gnorm 0.216 | clip 0 | loss_scale 0.25 | train_wall 1588 | gb_free 21.6 | wall 54678 epoch 033 | loss 3.948 | nll_loss 2.305 | ppl 4.94 | wps 440169 | ups 1.02 | wpb 433558 | bsz 16504.6 | num_updates 55649 | lr 0.000268103 | gnorm 0.216 | clip 0 | loss_scale 0.25 | train_wall 1588 | gb_free 21.6 | wall 54678 Start iterating over samples epoch 034: 51 / 1689 loss=3.941, nll_loss=2.296, ppl=4.91, wps=451536, ups=1.05, wpb=431523, bsz=16343.5, num_updates=55700, lr=0.00026798, gnorm=0.208, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.5, wall=54729 epoch 034: 51 / 1689 loss=3.941, nll_loss=2.296, ppl=4.91, wps=451536, ups=1.05, wpb=431523, bsz=16343.5, num_updates=55700, lr=0.00026798, gnorm=0.208, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.5, wall=54729 epoch 034: 51 / 1689 loss=3.941, nll_loss=2.296, ppl=4.91, wps=451536, ups=1.05, wpb=431523, bsz=16343.5, num_updates=55700, lr=0.00026798, gnorm=0.208, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.5, wall=54729 epoch 034: 51 / 1689 loss=3.941, nll_loss=2.296, ppl=4.91, wps=451536, ups=1.05, wpb=431523, bsz=16343.5, num_updates=55700, lr=0.00026798, gnorm=0.208, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.5, wall=54729 epoch 034: 51 / 1689 loss=3.941, nll_loss=2.296, ppl=4.91, wps=451536, ups=1.05, wpb=431523, bsz=16343.5, num_updates=55700, lr=0.00026798, gnorm=0.208, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.5, wall=54729 epoch 034: 51 / 1689 loss=3.941, nll_loss=2.296, ppl=4.91, wps=451536, ups=1.05, wpb=431523, bsz=16343.5, num_updates=55700, lr=0.00026798, gnorm=0.208, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.5, wall=54729 epoch 034: 51 / 1689 loss=3.941, nll_loss=2.296, ppl=4.91, wps=451536, ups=1.05, wpb=431523, bsz=16343.5, num_updates=55700, lr=0.00026798, gnorm=0.208, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.5, wall=54729 epoch 034: 51 / 1689 loss=3.941, nll_loss=2.296, ppl=4.91, wps=451536, ups=1.05, wpb=431523, bsz=16343.5, num_updates=55700, lr=0.00026798, gnorm=0.208, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.5, wall=54729 epoch 034: 51 / 1689 loss=3.941, nll_loss=2.296, ppl=4.91, wps=451536, ups=1.05, wpb=431523, bsz=16343.5, num_updates=55700, lr=0.00026798, gnorm=0.208, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.5, wall=54729 epoch 034: 51 / 1689 loss=3.941, nll_loss=2.296, ppl=4.91, wps=451536, ups=1.05, wpb=431523, bsz=16343.5, num_updates=55700, lr=0.00026798, gnorm=0.208, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.5, wall=54729 epoch 034: 51 / 1689 loss=3.941, nll_loss=2.296, ppl=4.91, wps=451536, ups=1.05, wpb=431523, bsz=16343.5, num_updates=55700, lr=0.00026798, gnorm=0.208, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.5, wall=54729 epoch 034: 51 / 1689 loss=3.941, nll_loss=2.296, ppl=4.91, wps=451536, ups=1.05, wpb=431523, bsz=16343.5, num_updates=55700, lr=0.00026798, gnorm=0.208, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.5, wall=54729 epoch 034: 51 / 1689 loss=3.941, nll_loss=2.296, ppl=4.91, wps=451536, ups=1.05, wpb=431523, bsz=16343.5, num_updates=55700, lr=0.00026798, gnorm=0.208, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.5, wall=54729 epoch 034: 51 / 1689 loss=3.941, nll_loss=2.296, ppl=4.91, wps=451536, ups=1.05, wpb=431523, bsz=16343.5, num_updates=55700, lr=0.00026798, gnorm=0.208, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.5, wall=54729 epoch 034: 51 / 1689 loss=3.941, nll_loss=2.296, ppl=4.91, wps=451536, ups=1.05, wpb=431523, bsz=16343.5, num_updates=55700, lr=0.00026798, gnorm=0.208, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.5, wall=54729 epoch 034: 51 / 1689 loss=3.941, nll_loss=2.296, ppl=4.91, wps=451536, ups=1.05, wpb=431523, bsz=16343.5, num_updates=55700, lr=0.00026798, gnorm=0.208, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.5, wall=54729 epoch 034: 51 / 1689 loss=3.941, nll_loss=2.296, ppl=4.91, wps=451536, ups=1.05, wpb=431523, bsz=16343.5, num_updates=55700, lr=0.00026798, gnorm=0.208, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.5, wall=54729 epoch 034: 51 / 1689 loss=3.941, nll_loss=2.296, ppl=4.91, wps=451536, ups=1.05, wpb=431523, bsz=16343.5, num_updates=55700, lr=0.00026798, gnorm=0.208, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.5, wall=54729 epoch 034: 51 / 1689 loss=3.941, nll_loss=2.296, ppl=4.91, wps=451536, ups=1.05, wpb=431523, bsz=16343.5, num_updates=55700, lr=0.00026798, gnorm=0.208, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.5, wall=54729 epoch 034: 51 / 1689 loss=3.941, nll_loss=2.296, ppl=4.91, wps=451536, ups=1.05, wpb=431523, bsz=16343.5, num_updates=55700, lr=0.00026798, gnorm=0.208, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.5, wall=54729 epoch 034: 51 / 1689 loss=3.941, nll_loss=2.296, ppl=4.91, wps=451536, ups=1.05, wpb=431523, bsz=16343.5, num_updates=55700, lr=0.00026798, gnorm=0.208, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.5, wall=54729 epoch 034: 51 / 1689 loss=3.941, nll_loss=2.296, ppl=4.91, wps=451536, ups=1.05, wpb=431523, bsz=16343.5, num_updates=55700, lr=0.00026798, gnorm=0.208, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.5, wall=54729 epoch 034: 51 / 1689 loss=3.941, nll_loss=2.296, ppl=4.91, wps=451536, ups=1.05, wpb=431523, bsz=16343.5, num_updates=55700, lr=0.00026798, gnorm=0.208, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.5, wall=54729 epoch 034: 51 / 1689 loss=3.941, nll_loss=2.296, ppl=4.91, wps=451536, ups=1.05, wpb=431523, bsz=16343.5, num_updates=55700, lr=0.00026798, gnorm=0.208, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.5, wall=54729 epoch 034: 51 / 1689 loss=3.941, nll_loss=2.296, ppl=4.91, wps=451536, ups=1.05, wpb=431523, bsz=16343.5, num_updates=55700, lr=0.00026798, gnorm=0.208, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.5, wall=54729 epoch 034: 51 / 1689 loss=3.941, nll_loss=2.296, ppl=4.91, wps=451536, ups=1.05, wpb=431523, bsz=16343.5, num_updates=55700, lr=0.00026798, gnorm=0.208, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.5, wall=54729 epoch 034: 51 / 1689 loss=3.941, nll_loss=2.296, ppl=4.91, wps=451536, ups=1.05, wpb=431523, bsz=16343.5, num_updates=55700, lr=0.00026798, gnorm=0.208, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.5, wall=54729 epoch 034: 51 / 1689 loss=3.941, nll_loss=2.296, ppl=4.91, wps=451536, ups=1.05, wpb=431523, bsz=16343.5, num_updates=55700, lr=0.00026798, gnorm=0.208, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.5, wall=54729 epoch 034: 51 / 1689 loss=3.941, nll_loss=2.296, ppl=4.91, wps=451536, ups=1.05, wpb=431523, bsz=16343.5, num_updates=55700, lr=0.00026798, gnorm=0.208, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.5, wall=54729 epoch 034: 51 / 1689 loss=3.941, nll_loss=2.296, ppl=4.91, wps=451536, ups=1.05, wpb=431523, bsz=16343.5, num_updates=55700, lr=0.00026798, gnorm=0.208, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.5, wall=54729 epoch 034: 51 / 1689 loss=3.941, nll_loss=2.296, ppl=4.91, wps=451536, ups=1.05, wpb=431523, bsz=16343.5, num_updates=55700, lr=0.00026798, gnorm=0.208, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.5, wall=54729 epoch 034: 51 / 1689 loss=3.941, nll_loss=2.296, ppl=4.91, wps=451536, ups=1.05, wpb=431523, bsz=16343.5, num_updates=55700, lr=0.00026798, gnorm=0.208, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.5, wall=54729 epoch 034: 51 / 1689 loss=3.941, nll_loss=2.296, ppl=4.91, wps=451536, ups=1.05, wpb=431523, bsz=16343.5, num_updates=55700, lr=0.00026798, gnorm=0.208, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.5, wall=54729 epoch 034: 51 / 1689 loss=3.941, nll_loss=2.296, ppl=4.91, wps=451536, ups=1.05, wpb=431523, bsz=16343.5, num_updates=55700, lr=0.00026798, gnorm=0.208, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.5, wall=54729 epoch 034: 151 / 1689 loss=3.929, nll_loss=2.282, ppl=4.86, wps=457670, ups=1.05, wpb=434336, bsz=16829.6, num_updates=55800, lr=0.00026774, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.8, wall=54823 epoch 034: 151 / 1689 loss=3.929, nll_loss=2.282, ppl=4.86, wps=457670, ups=1.05, wpb=434336, bsz=16829.6, num_updates=55800, lr=0.00026774, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.8, wall=54823 epoch 034: 151 / 1689 loss=3.929, nll_loss=2.282, ppl=4.86, wps=457670, ups=1.05, wpb=434336, bsz=16829.6, num_updates=55800, lr=0.00026774, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.8, wall=54823 epoch 034: 151 / 1689 loss=3.929, nll_loss=2.282, ppl=4.86, wps=457670, ups=1.05, wpb=434336, bsz=16829.6, num_updates=55800, lr=0.00026774, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.8, wall=54823 epoch 034: 151 / 1689 loss=3.929, nll_loss=2.282, ppl=4.86, wps=457670, ups=1.05, wpb=434336, bsz=16829.6, num_updates=55800, lr=0.00026774, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.8, wall=54823 epoch 034: 151 / 1689 loss=3.929, nll_loss=2.282, ppl=4.86, wps=457670, ups=1.05, wpb=434336, bsz=16829.6, num_updates=55800, lr=0.00026774, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.8, wall=54823 epoch 034: 151 / 1689 loss=3.929, nll_loss=2.282, ppl=4.86, wps=457670, ups=1.05, wpb=434336, bsz=16829.6, num_updates=55800, lr=0.00026774, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.8, wall=54823 epoch 034: 151 / 1689 loss=3.929, nll_loss=2.282, ppl=4.86, wps=457670, ups=1.05, wpb=434336, bsz=16829.6, num_updates=55800, lr=0.00026774, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.8, wall=54823 epoch 034: 151 / 1689 loss=3.929, nll_loss=2.282, ppl=4.86, wps=457670, ups=1.05, wpb=434336, bsz=16829.6, num_updates=55800, lr=0.00026774, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.8, wall=54823 epoch 034: 151 / 1689 loss=3.929, nll_loss=2.282, ppl=4.86, wps=457670, ups=1.05, wpb=434336, bsz=16829.6, num_updates=55800, lr=0.00026774, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.8, wall=54823 epoch 034: 151 / 1689 loss=3.929, nll_loss=2.282, ppl=4.86, wps=457670, ups=1.05, wpb=434336, bsz=16829.6, num_updates=55800, lr=0.00026774, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.8, wall=54823 epoch 034: 151 / 1689 loss=3.929, nll_loss=2.282, ppl=4.86, wps=457670, ups=1.05, wpb=434336, bsz=16829.6, num_updates=55800, lr=0.00026774, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.8, wall=54823 epoch 034: 151 / 1689 loss=3.929, nll_loss=2.282, ppl=4.86, wps=457670, ups=1.05, wpb=434336, bsz=16829.6, num_updates=55800, lr=0.00026774, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.8, wall=54823 epoch 034: 151 / 1689 loss=3.929, nll_loss=2.282, ppl=4.86, wps=457670, ups=1.05, wpb=434336, bsz=16829.6, num_updates=55800, lr=0.00026774, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.8, wall=54823 epoch 034: 151 / 1689 loss=3.929, nll_loss=2.282, ppl=4.86, wps=457670, ups=1.05, wpb=434336, bsz=16829.6, num_updates=55800, lr=0.00026774, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.8, wall=54823 epoch 034: 151 / 1689 loss=3.929, nll_loss=2.282, ppl=4.86, wps=457670, ups=1.05, wpb=434336, bsz=16829.6, num_updates=55800, lr=0.00026774, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.8, wall=54823 epoch 034: 151 / 1689 loss=3.929, nll_loss=2.282, ppl=4.86, wps=457670, ups=1.05, wpb=434336, bsz=16829.6, num_updates=55800, lr=0.00026774, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.8, wall=54823 epoch 034: 151 / 1689 loss=3.929, nll_loss=2.282, ppl=4.86, wps=457670, ups=1.05, wpb=434336, bsz=16829.6, num_updates=55800, lr=0.00026774, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.8, wall=54823 epoch 034: 151 / 1689 loss=3.929, nll_loss=2.282, ppl=4.86, wps=457670, ups=1.05, wpb=434336, bsz=16829.6, num_updates=55800, lr=0.00026774, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.8, wall=54823 epoch 034: 151 / 1689 loss=3.929, nll_loss=2.282, ppl=4.86, wps=457670, ups=1.05, wpb=434336, bsz=16829.6, num_updates=55800, lr=0.00026774, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.8, wall=54823 epoch 034: 151 / 1689 loss=3.929, nll_loss=2.282, ppl=4.86, wps=457670, ups=1.05, wpb=434336, bsz=16829.6, num_updates=55800, lr=0.00026774, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.8, wall=54823 epoch 034: 151 / 1689 loss=3.929, nll_loss=2.282, ppl=4.86, wps=457670, ups=1.05, wpb=434336, bsz=16829.6, num_updates=55800, lr=0.00026774, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.8, wall=54823 epoch 034: 151 / 1689 loss=3.929, nll_loss=2.282, ppl=4.86, wps=457670, ups=1.05, wpb=434336, bsz=16829.6, num_updates=55800, lr=0.00026774, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.8, wall=54823 epoch 034: 151 / 1689 loss=3.929, nll_loss=2.282, ppl=4.86, wps=457670, ups=1.05, wpb=434336, bsz=16829.6, num_updates=55800, lr=0.00026774, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.8, wall=54823 epoch 034: 151 / 1689 loss=3.929, nll_loss=2.282, ppl=4.86, wps=457670, ups=1.05, wpb=434336, bsz=16829.6, num_updates=55800, lr=0.00026774, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.8, wall=54823 epoch 034: 151 / 1689 loss=3.929, nll_loss=2.282, ppl=4.86, wps=457670, ups=1.05, wpb=434336, bsz=16829.6, num_updates=55800, lr=0.00026774, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.8, wall=54823 epoch 034: 151 / 1689 loss=3.929, nll_loss=2.282, ppl=4.86, wps=457670, ups=1.05, wpb=434336, bsz=16829.6, num_updates=55800, lr=0.00026774, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.8, wall=54823 epoch 034: 151 / 1689 loss=3.929, nll_loss=2.282, ppl=4.86, wps=457670, ups=1.05, wpb=434336, bsz=16829.6, num_updates=55800, lr=0.00026774, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.8, wall=54823 epoch 034: 151 / 1689 loss=3.929, nll_loss=2.282, ppl=4.86, wps=457670, ups=1.05, wpb=434336, bsz=16829.6, num_updates=55800, lr=0.00026774, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.8, wall=54823 epoch 034: 151 / 1689 loss=3.929, nll_loss=2.282, ppl=4.86, wps=457670, ups=1.05, wpb=434336, bsz=16829.6, num_updates=55800, lr=0.00026774, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.8, wall=54823 epoch 034: 151 / 1689 loss=3.929, nll_loss=2.282, ppl=4.86, wps=457670, ups=1.05, wpb=434336, bsz=16829.6, num_updates=55800, lr=0.00026774, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.8, wall=54823 epoch 034: 151 / 1689 loss=3.929, nll_loss=2.282, ppl=4.86, wps=457670, ups=1.05, wpb=434336, bsz=16829.6, num_updates=55800, lr=0.00026774, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.8, wall=54823 epoch 034: 151 / 1689 loss=3.929, nll_loss=2.282, ppl=4.86, wps=457670, ups=1.05, wpb=434336, bsz=16829.6, num_updates=55800, lr=0.00026774, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.8, wall=54823 epoch 034: 151 / 1689 loss=3.929, nll_loss=2.282, ppl=4.86, wps=457670, ups=1.05, wpb=434336, bsz=16829.6, num_updates=55800, lr=0.00026774, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.8, wall=54823 epoch 034: 251 / 1689 loss=3.924, nll_loss=2.276, ppl=4.84, wps=463444, ups=1.07, wpb=432372, bsz=16128.6, num_updates=55900, lr=0.0002675, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=92, gb_free=17.3, wall=54917 epoch 034: 251 / 1689 loss=3.924, nll_loss=2.276, ppl=4.84, wps=463444, ups=1.07, wpb=432372, bsz=16128.6, num_updates=55900, lr=0.0002675, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=92, gb_free=17.3, wall=54917 epoch 034: 251 / 1689 loss=3.924, nll_loss=2.276, ppl=4.84, wps=463444, ups=1.07, wpb=432372, bsz=16128.6, num_updates=55900, lr=0.0002675, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=92, gb_free=17.3, wall=54917 epoch 034: 251 / 1689 loss=3.924, nll_loss=2.276, ppl=4.84, wps=463444, ups=1.07, wpb=432372, bsz=16128.6, num_updates=55900, lr=0.0002675, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=92, gb_free=17.3, wall=54917 epoch 034: 251 / 1689 loss=3.924, nll_loss=2.276, ppl=4.84, wps=463444, ups=1.07, wpb=432372, bsz=16128.6, num_updates=55900, lr=0.0002675, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=92, gb_free=17.3, wall=54917 epoch 034: 251 / 1689 loss=3.924, nll_loss=2.276, ppl=4.84, wps=463444, ups=1.07, wpb=432372, bsz=16128.6, num_updates=55900, lr=0.0002675, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=92, gb_free=17.3, wall=54917 epoch 034: 251 / 1689 loss=3.924, nll_loss=2.276, ppl=4.84, wps=463444, ups=1.07, wpb=432372, bsz=16128.6, num_updates=55900, lr=0.0002675, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=92, gb_free=17.3, wall=54917 epoch 034: 251 / 1689 loss=3.924, nll_loss=2.276, ppl=4.84, wps=463444, ups=1.07, wpb=432372, bsz=16128.6, num_updates=55900, lr=0.0002675, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=92, gb_free=17.3, wall=54917 epoch 034: 251 / 1689 loss=3.924, nll_loss=2.276, ppl=4.84, wps=463444, ups=1.07, wpb=432372, bsz=16128.6, num_updates=55900, lr=0.0002675, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=92, gb_free=17.3, wall=54917 epoch 034: 251 / 1689 loss=3.924, nll_loss=2.276, ppl=4.84, wps=463444, ups=1.07, wpb=432372, bsz=16128.6, num_updates=55900, lr=0.0002675, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=92, gb_free=17.3, wall=54917 epoch 034: 251 / 1689 loss=3.924, nll_loss=2.276, ppl=4.84, wps=463444, ups=1.07, wpb=432372, bsz=16128.6, num_updates=55900, lr=0.0002675, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=92, gb_free=17.3, wall=54917 epoch 034: 251 / 1689 loss=3.924, nll_loss=2.276, ppl=4.84, wps=463444, ups=1.07, wpb=432372, bsz=16128.6, num_updates=55900, lr=0.0002675, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=92, gb_free=17.3, wall=54917 epoch 034: 251 / 1689 loss=3.924, nll_loss=2.276, ppl=4.84, wps=463444, ups=1.07, wpb=432372, bsz=16128.6, num_updates=55900, lr=0.0002675, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=92, gb_free=17.3, wall=54917 epoch 034: 251 / 1689 loss=3.924, nll_loss=2.276, ppl=4.84, wps=463444, ups=1.07, wpb=432372, bsz=16128.6, num_updates=55900, lr=0.0002675, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=92, gb_free=17.3, wall=54917 epoch 034: 251 / 1689 loss=3.924, nll_loss=2.276, ppl=4.84, wps=463444, ups=1.07, wpb=432372, bsz=16128.6, num_updates=55900, lr=0.0002675, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=92, gb_free=17.3, wall=54917 epoch 034: 251 / 1689 loss=3.924, nll_loss=2.276, ppl=4.84, wps=463444, ups=1.07, wpb=432372, bsz=16128.6, num_updates=55900, lr=0.0002675, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=92, gb_free=17.3, wall=54917 epoch 034: 251 / 1689 loss=3.924, nll_loss=2.276, ppl=4.84, wps=463444, ups=1.07, wpb=432372, bsz=16128.6, num_updates=55900, lr=0.0002675, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=92, gb_free=17.3, wall=54917 epoch 034: 251 / 1689 loss=3.924, nll_loss=2.276, ppl=4.84, wps=463444, ups=1.07, wpb=432372, bsz=16128.6, num_updates=55900, lr=0.0002675, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=92, gb_free=17.3, wall=54917 epoch 034: 251 / 1689 loss=3.924, nll_loss=2.276, ppl=4.84, wps=463444, ups=1.07, wpb=432372, bsz=16128.6, num_updates=55900, lr=0.0002675, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=92, gb_free=17.3, wall=54917 epoch 034: 251 / 1689 loss=3.924, nll_loss=2.276, ppl=4.84, wps=463444, ups=1.07, wpb=432372, bsz=16128.6, num_updates=55900, lr=0.0002675, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=92, gb_free=17.3, wall=54917 epoch 034: 251 / 1689 loss=3.924, nll_loss=2.276, ppl=4.84, wps=463444, ups=1.07, wpb=432372, bsz=16128.6, num_updates=55900, lr=0.0002675, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=92, gb_free=17.3, wall=54917 epoch 034: 251 / 1689 loss=3.924, nll_loss=2.276, ppl=4.84, wps=463444, ups=1.07, wpb=432372, bsz=16128.6, num_updates=55900, lr=0.0002675, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=92, gb_free=17.3, wall=54917 epoch 034: 251 / 1689 loss=3.924, nll_loss=2.276, ppl=4.84, wps=463444, ups=1.07, wpb=432372, bsz=16128.6, num_updates=55900, lr=0.0002675, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=92, gb_free=17.3, wall=54917 epoch 034: 251 / 1689 loss=3.924, nll_loss=2.276, ppl=4.84, wps=463444, ups=1.07, wpb=432372, bsz=16128.6, num_updates=55900, lr=0.0002675, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=92, gb_free=17.3, wall=54917 epoch 034: 251 / 1689 loss=3.924, nll_loss=2.276, ppl=4.84, wps=463444, ups=1.07, wpb=432372, bsz=16128.6, num_updates=55900, lr=0.0002675, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=92, gb_free=17.3, wall=54917 epoch 034: 251 / 1689 loss=3.924, nll_loss=2.276, ppl=4.84, wps=463444, ups=1.07, wpb=432372, bsz=16128.6, num_updates=55900, lr=0.0002675, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=92, gb_free=17.3, wall=54917 epoch 034: 251 / 1689 loss=3.924, nll_loss=2.276, ppl=4.84, wps=463444, ups=1.07, wpb=432372, bsz=16128.6, num_updates=55900, lr=0.0002675, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=92, gb_free=17.3, wall=54917 epoch 034: 251 / 1689 loss=3.924, nll_loss=2.276, ppl=4.84, wps=463444, ups=1.07, wpb=432372, bsz=16128.6, num_updates=55900, lr=0.0002675, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=92, gb_free=17.3, wall=54917 epoch 034: 251 / 1689 loss=3.924, nll_loss=2.276, ppl=4.84, wps=463444, ups=1.07, wpb=432372, bsz=16128.6, num_updates=55900, lr=0.0002675, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=92, gb_free=17.3, wall=54917 epoch 034: 251 / 1689 loss=3.924, nll_loss=2.276, ppl=4.84, wps=463444, ups=1.07, wpb=432372, bsz=16128.6, num_updates=55900, lr=0.0002675, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=92, gb_free=17.3, wall=54917 epoch 034: 251 / 1689 loss=3.924, nll_loss=2.276, ppl=4.84, wps=463444, ups=1.07, wpb=432372, bsz=16128.6, num_updates=55900, lr=0.0002675, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=92, gb_free=17.3, wall=54917 epoch 034: 251 / 1689 loss=3.924, nll_loss=2.276, ppl=4.84, wps=463444, ups=1.07, wpb=432372, bsz=16128.6, num_updates=55900, lr=0.0002675, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=92, gb_free=17.3, wall=54917 epoch 034: 251 / 1689 loss=3.924, nll_loss=2.276, ppl=4.84, wps=463444, ups=1.07, wpb=432372, bsz=16128.6, num_updates=55900, lr=0.0002675, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=92, gb_free=17.3, wall=54917 epoch 034: 251 / 1689 loss=3.924, nll_loss=2.276, ppl=4.84, wps=463444, ups=1.07, wpb=432372, bsz=16128.6, num_updates=55900, lr=0.0002675, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=92, gb_free=17.3, wall=54917 epoch 034: 351 / 1689 loss=3.929, nll_loss=2.283, ppl=4.87, wps=458894, ups=1.06, wpb=432915, bsz=16360.5, num_updates=56000, lr=0.000267261, gnorm=0.209, clip=0, loss_scale=0.25, train_wall=92, gb_free=20.6, wall=55011 epoch 034: 351 / 1689 loss=3.929, nll_loss=2.283, ppl=4.87, wps=458894, ups=1.06, wpb=432915, bsz=16360.5, num_updates=56000, lr=0.000267261, gnorm=0.209, clip=0, loss_scale=0.25, train_wall=92, gb_free=20.6, wall=55011 epoch 034: 351 / 1689 loss=3.929, nll_loss=2.283, ppl=4.87, wps=458894, ups=1.06, wpb=432915, bsz=16360.5, num_updates=56000, lr=0.000267261, gnorm=0.209, clip=0, loss_scale=0.25, train_wall=92, gb_free=20.6, wall=55011 epoch 034: 351 / 1689 loss=3.929, nll_loss=2.283, ppl=4.87, wps=458894, ups=1.06, wpb=432915, bsz=16360.5, num_updates=56000, lr=0.000267261, gnorm=0.209, clip=0, loss_scale=0.25, train_wall=92, gb_free=20.6, wall=55011 epoch 034: 351 / 1689 loss=3.929, nll_loss=2.283, ppl=4.87, wps=458894, ups=1.06, wpb=432915, bsz=16360.5, num_updates=56000, lr=0.000267261, gnorm=0.209, clip=0, loss_scale=0.25, train_wall=92, gb_free=20.6, wall=55011 epoch 034: 351 / 1689 loss=3.929, nll_loss=2.283, ppl=4.87, wps=458894, ups=1.06, wpb=432915, bsz=16360.5, num_updates=56000, lr=0.000267261, gnorm=0.209, clip=0, loss_scale=0.25, train_wall=92, gb_free=20.6, wall=55011 epoch 034: 351 / 1689 loss=3.929, nll_loss=2.283, ppl=4.87, wps=458894, ups=1.06, wpb=432915, bsz=16360.5, num_updates=56000, lr=0.000267261, gnorm=0.209, clip=0, loss_scale=0.25, train_wall=92, gb_free=20.6, wall=55011 epoch 034: 351 / 1689 loss=3.929, nll_loss=2.283, ppl=4.87, wps=458894, ups=1.06, wpb=432915, bsz=16360.5, num_updates=56000, lr=0.000267261, gnorm=0.209, clip=0, loss_scale=0.25, train_wall=92, gb_free=20.6, wall=55011 epoch 034: 351 / 1689 loss=3.929, nll_loss=2.283, ppl=4.87, wps=458894, ups=1.06, wpb=432915, bsz=16360.5, num_updates=56000, lr=0.000267261, gnorm=0.209, clip=0, loss_scale=0.25, train_wall=92, gb_free=20.6, wall=55011 epoch 034: 351 / 1689 loss=3.929, nll_loss=2.283, ppl=4.87, wps=458894, ups=1.06, wpb=432915, bsz=16360.5, num_updates=56000, lr=0.000267261, gnorm=0.209, clip=0, loss_scale=0.25, train_wall=92, gb_free=20.6, wall=55011 epoch 034: 351 / 1689 loss=3.929, nll_loss=2.283, ppl=4.87, wps=458894, ups=1.06, wpb=432915, bsz=16360.5, num_updates=56000, lr=0.000267261, gnorm=0.209, clip=0, loss_scale=0.25, train_wall=92, gb_free=20.6, wall=55011 epoch 034: 351 / 1689 loss=3.929, nll_loss=2.283, ppl=4.87, wps=458894, ups=1.06, wpb=432915, bsz=16360.5, num_updates=56000, lr=0.000267261, gnorm=0.209, clip=0, loss_scale=0.25, train_wall=92, gb_free=20.6, wall=55011 epoch 034: 351 / 1689 loss=3.929, nll_loss=2.283, ppl=4.87, wps=458894, ups=1.06, wpb=432915, bsz=16360.5, num_updates=56000, lr=0.000267261, gnorm=0.209, clip=0, loss_scale=0.25, train_wall=92, gb_free=20.6, wall=55011 epoch 034: 351 / 1689 loss=3.929, nll_loss=2.283, ppl=4.87, wps=458894, ups=1.06, wpb=432915, bsz=16360.5, num_updates=56000, lr=0.000267261, gnorm=0.209, clip=0, loss_scale=0.25, train_wall=92, gb_free=20.6, wall=55011 epoch 034: 351 / 1689 loss=3.929, nll_loss=2.283, ppl=4.87, wps=458894, ups=1.06, wpb=432915, bsz=16360.5, num_updates=56000, lr=0.000267261, gnorm=0.209, clip=0, loss_scale=0.25, train_wall=92, gb_free=20.6, wall=55011 epoch 034: 351 / 1689 loss=3.929, nll_loss=2.283, ppl=4.87, wps=458894, ups=1.06, wpb=432915, bsz=16360.5, num_updates=56000, lr=0.000267261, gnorm=0.209, clip=0, loss_scale=0.25, train_wall=92, gb_free=20.6, wall=55011 epoch 034: 351 / 1689 loss=3.929, nll_loss=2.283, ppl=4.87, wps=458894, ups=1.06, wpb=432915, bsz=16360.5, num_updates=56000, lr=0.000267261, gnorm=0.209, clip=0, loss_scale=0.25, train_wall=92, gb_free=20.6, wall=55011 epoch 034: 351 / 1689 loss=3.929, nll_loss=2.283, ppl=4.87, wps=458894, ups=1.06, wpb=432915, bsz=16360.5, num_updates=56000, lr=0.000267261, gnorm=0.209, clip=0, loss_scale=0.25, train_wall=92, gb_free=20.6, wall=55011 epoch 034: 351 / 1689 loss=3.929, nll_loss=2.283, ppl=4.87, wps=458894, ups=1.06, wpb=432915, bsz=16360.5, num_updates=56000, lr=0.000267261, gnorm=0.209, clip=0, loss_scale=0.25, train_wall=92, gb_free=20.6, wall=55011 epoch 034: 351 / 1689 loss=3.929, nll_loss=2.283, ppl=4.87, wps=458894, ups=1.06, wpb=432915, bsz=16360.5, num_updates=56000, lr=0.000267261, gnorm=0.209, clip=0, loss_scale=0.25, train_wall=92, gb_free=20.6, wall=55011 epoch 034: 351 / 1689 loss=3.929, nll_loss=2.283, ppl=4.87, wps=458894, ups=1.06, wpb=432915, bsz=16360.5, num_updates=56000, lr=0.000267261, gnorm=0.209, clip=0, loss_scale=0.25, train_wall=92, gb_free=20.6, wall=55011 epoch 034: 351 / 1689 loss=3.929, nll_loss=2.283, ppl=4.87, wps=458894, ups=1.06, wpb=432915, bsz=16360.5, num_updates=56000, lr=0.000267261, gnorm=0.209, clip=0, loss_scale=0.25, train_wall=92, gb_free=20.6, wall=55011 epoch 034: 351 / 1689 loss=3.929, nll_loss=2.283, ppl=4.87, wps=458894, ups=1.06, wpb=432915, bsz=16360.5, num_updates=56000, lr=0.000267261, gnorm=0.209, clip=0, loss_scale=0.25, train_wall=92, gb_free=20.6, wall=55011 epoch 034: 351 / 1689 loss=3.929, nll_loss=2.283, ppl=4.87, wps=458894, ups=1.06, wpb=432915, bsz=16360.5, num_updates=56000, lr=0.000267261, gnorm=0.209, clip=0, loss_scale=0.25, train_wall=92, gb_free=20.6, wall=55011 epoch 034: 351 / 1689 loss=3.929, nll_loss=2.283, ppl=4.87, wps=458894, ups=1.06, wpb=432915, bsz=16360.5, num_updates=56000, lr=0.000267261, gnorm=0.209, clip=0, loss_scale=0.25, train_wall=92, gb_free=20.6, wall=55011 epoch 034: 351 / 1689 loss=3.929, nll_loss=2.283, ppl=4.87, wps=458894, ups=1.06, wpb=432915, bsz=16360.5, num_updates=56000, lr=0.000267261, gnorm=0.209, clip=0, loss_scale=0.25, train_wall=92, gb_free=20.6, wall=55011 epoch 034: 351 / 1689 loss=3.929, nll_loss=2.283, ppl=4.87, wps=458894, ups=1.06, wpb=432915, bsz=16360.5, num_updates=56000, lr=0.000267261, gnorm=0.209, clip=0, loss_scale=0.25, train_wall=92, gb_free=20.6, wall=55011 epoch 034: 351 / 1689 loss=3.929, nll_loss=2.283, ppl=4.87, wps=458894, ups=1.06, wpb=432915, bsz=16360.5, num_updates=56000, lr=0.000267261, gnorm=0.209, clip=0, loss_scale=0.25, train_wall=92, gb_free=20.6, wall=55011 epoch 034: 351 / 1689 loss=3.929, nll_loss=2.283, ppl=4.87, wps=458894, ups=1.06, wpb=432915, bsz=16360.5, num_updates=56000, lr=0.000267261, gnorm=0.209, clip=0, loss_scale=0.25, train_wall=92, gb_free=20.6, wall=55011 epoch 034: 351 / 1689 loss=3.929, nll_loss=2.283, ppl=4.87, wps=458894, ups=1.06, wpb=432915, bsz=16360.5, num_updates=56000, lr=0.000267261, gnorm=0.209, clip=0, loss_scale=0.25, train_wall=92, gb_free=20.6, wall=55011 epoch 034: 351 / 1689 loss=3.929, nll_loss=2.283, ppl=4.87, wps=458894, ups=1.06, wpb=432915, bsz=16360.5, num_updates=56000, lr=0.000267261, gnorm=0.209, clip=0, loss_scale=0.25, train_wall=92, gb_free=20.6, wall=55011 epoch 034: 351 / 1689 loss=3.929, nll_loss=2.283, ppl=4.87, wps=458894, ups=1.06, wpb=432915, bsz=16360.5, num_updates=56000, lr=0.000267261, gnorm=0.209, clip=0, loss_scale=0.25, train_wall=92, gb_free=20.6, wall=55011 epoch 034: 351 / 1689 loss=3.929, nll_loss=2.283, ppl=4.87, wps=458894, ups=1.06, wpb=432915, bsz=16360.5, num_updates=56000, lr=0.000267261, gnorm=0.209, clip=0, loss_scale=0.25, train_wall=92, gb_free=20.6, wall=55011 epoch 034: 351 / 1689 loss=3.929, nll_loss=2.283, ppl=4.87, wps=458894, ups=1.06, wpb=432915, bsz=16360.5, num_updates=56000, lr=0.000267261, gnorm=0.209, clip=0, loss_scale=0.25, train_wall=92, gb_free=20.6, wall=55011 begin validation on "valid" subset epoch 034 | valid on 'valid' subset | loss 4.244 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.238 epoch 034 | valid on 'valid' subset | loss 4.244 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.238 epoch 034 | valid on 'valid' subset | loss 4.244 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.238 epoch 034 | valid on 'valid' subset | loss 4.244 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.238 epoch 034 | valid on 'valid' subset | loss 4.244 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.238 epoch 034 | valid on 'valid' subset | loss 4.244 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.238 epoch 034 | valid on 'valid' subset | loss 4.244 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.238 epoch 034 | valid on 'valid' subset | loss 4.244 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.238 epoch 034 | valid on 'valid' subset | loss 4.244 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.238 epoch 034 | valid on 'valid' subset | loss 4.244 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.238 epoch 034 | valid on 'valid' subset | loss 4.244 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.238 epoch 034 | valid on 'valid' subset | loss 4.244 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.238 epoch 034 | valid on 'valid' subset | loss 4.244 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.238 epoch 034 | valid on 'valid' subset | loss 4.244 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.238 epoch 034 | valid on 'valid' subset | loss 4.244 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.238 epoch 034 | valid on 'valid' subset | loss 4.244 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.238 epoch 034 | valid on 'valid' subset | loss 4.244 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.238 epoch 034 | valid on 'valid' subset | loss 4.244 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.238 epoch 034 | valid on 'valid' subset | loss 4.244 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.238 epoch 034 | valid on 'valid' subset | loss 4.244 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.238 epoch 034 | valid on 'valid' subset | loss 4.244 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.238 epoch 034 | valid on 'valid' subset | loss 4.244 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.238 epoch 034 | valid on 'valid' subset | loss 4.244 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.238 epoch 034 | valid on 'valid' subset | loss 4.244 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.238 epoch 034 | valid on 'valid' subset | loss 4.244 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.238 epoch 034 | valid on 'valid' subset | loss 4.244 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.238 epoch 034 | valid on 'valid' subset | loss 4.244 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.238 epoch 034 | valid on 'valid' subset | loss 4.244 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.238 epoch 034 | valid on 'valid' subset | loss 4.244 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.238 epoch 034 | valid on 'valid' subset | loss 4.244 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.238 epoch 034 | valid on 'valid' subset | loss 4.244 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.238 epoch 034 | valid on 'valid' subset | loss 4.244 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.238 epoch 034 | valid on 'valid' subset | loss 4.244 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.238 epoch 034 | valid on 'valid' subset | loss 4.244 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 56000 | best_loss 4.238 epoch 034: 451 / 1689 loss=3.939, nll_loss=2.294, ppl=4.9, wps=407773, ups=0.94, wpb=433740, bsz=16524.2, num_updates=56100, lr=0.000267023, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=55117 epoch 034: 451 / 1689 loss=3.939, nll_loss=2.294, ppl=4.9, wps=407773, ups=0.94, wpb=433740, bsz=16524.2, num_updates=56100, lr=0.000267023, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=55117 epoch 034: 451 / 1689 loss=3.939, nll_loss=2.294, ppl=4.9, wps=407773, ups=0.94, wpb=433740, bsz=16524.2, num_updates=56100, lr=0.000267023, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=55117 epoch 034: 451 / 1689 loss=3.939, nll_loss=2.294, ppl=4.9, wps=407773, ups=0.94, wpb=433740, bsz=16524.2, num_updates=56100, lr=0.000267023, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=55117 epoch 034: 451 / 1689 loss=3.939, nll_loss=2.294, ppl=4.9, wps=407773, ups=0.94, wpb=433740, bsz=16524.2, num_updates=56100, lr=0.000267023, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=55117 epoch 034: 451 / 1689 loss=3.939, nll_loss=2.294, ppl=4.9, wps=407773, ups=0.94, wpb=433740, bsz=16524.2, num_updates=56100, lr=0.000267023, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=55117 epoch 034: 451 / 1689 loss=3.939, nll_loss=2.294, ppl=4.9, wps=407773, ups=0.94, wpb=433740, bsz=16524.2, num_updates=56100, lr=0.000267023, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=55117 epoch 034: 451 / 1689 loss=3.939, nll_loss=2.294, ppl=4.9, wps=407773, ups=0.94, wpb=433740, bsz=16524.2, num_updates=56100, lr=0.000267023, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=55117 epoch 034: 451 / 1689 loss=3.939, nll_loss=2.294, ppl=4.9, wps=407773, ups=0.94, wpb=433740, bsz=16524.2, num_updates=56100, lr=0.000267023, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=55117 epoch 034: 451 / 1689 loss=3.939, nll_loss=2.294, ppl=4.9, wps=407773, ups=0.94, wpb=433740, bsz=16524.2, num_updates=56100, lr=0.000267023, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=55117 epoch 034: 451 / 1689 loss=3.939, nll_loss=2.294, ppl=4.9, wps=407773, ups=0.94, wpb=433740, bsz=16524.2, num_updates=56100, lr=0.000267023, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=55117 epoch 034: 451 / 1689 loss=3.939, nll_loss=2.294, ppl=4.9, wps=407773, ups=0.94, wpb=433740, bsz=16524.2, num_updates=56100, lr=0.000267023, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=55117 epoch 034: 451 / 1689 loss=3.939, nll_loss=2.294, ppl=4.9, wps=407773, ups=0.94, wpb=433740, bsz=16524.2, num_updates=56100, lr=0.000267023, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=55117 epoch 034: 451 / 1689 loss=3.939, nll_loss=2.294, ppl=4.9, wps=407773, ups=0.94, wpb=433740, bsz=16524.2, num_updates=56100, lr=0.000267023, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=55117 epoch 034: 451 / 1689 loss=3.939, nll_loss=2.294, ppl=4.9, wps=407773, ups=0.94, wpb=433740, bsz=16524.2, num_updates=56100, lr=0.000267023, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=55117 epoch 034: 451 / 1689 loss=3.939, nll_loss=2.294, ppl=4.9, wps=407773, ups=0.94, wpb=433740, bsz=16524.2, num_updates=56100, lr=0.000267023, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=55117 epoch 034: 451 / 1689 loss=3.939, nll_loss=2.294, ppl=4.9, wps=407773, ups=0.94, wpb=433740, bsz=16524.2, num_updates=56100, lr=0.000267023, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=55117 epoch 034: 451 / 1689 loss=3.939, nll_loss=2.294, ppl=4.9, wps=407773, ups=0.94, wpb=433740, bsz=16524.2, num_updates=56100, lr=0.000267023, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=55117 epoch 034: 451 / 1689 loss=3.939, nll_loss=2.294, ppl=4.9, wps=407773, ups=0.94, wpb=433740, bsz=16524.2, num_updates=56100, lr=0.000267023, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=55117 epoch 034: 451 / 1689 loss=3.939, nll_loss=2.294, ppl=4.9, wps=407773, ups=0.94, wpb=433740, bsz=16524.2, num_updates=56100, lr=0.000267023, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=55117 epoch 034: 451 / 1689 loss=3.939, nll_loss=2.294, ppl=4.9, wps=407773, ups=0.94, wpb=433740, bsz=16524.2, num_updates=56100, lr=0.000267023, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=55117 epoch 034: 451 / 1689 loss=3.939, nll_loss=2.294, ppl=4.9, wps=407773, ups=0.94, wpb=433740, bsz=16524.2, num_updates=56100, lr=0.000267023, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=55117 epoch 034: 451 / 1689 loss=3.939, nll_loss=2.294, ppl=4.9, wps=407773, ups=0.94, wpb=433740, bsz=16524.2, num_updates=56100, lr=0.000267023, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=55117 epoch 034: 451 / 1689 loss=3.939, nll_loss=2.294, ppl=4.9, wps=407773, ups=0.94, wpb=433740, bsz=16524.2, num_updates=56100, lr=0.000267023, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=55117 epoch 034: 451 / 1689 loss=3.939, nll_loss=2.294, ppl=4.9, wps=407773, ups=0.94, wpb=433740, bsz=16524.2, num_updates=56100, lr=0.000267023, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=55117 epoch 034: 451 / 1689 loss=3.939, nll_loss=2.294, ppl=4.9, wps=407773, ups=0.94, wpb=433740, bsz=16524.2, num_updates=56100, lr=0.000267023, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=55117 epoch 034: 451 / 1689 loss=3.939, nll_loss=2.294, ppl=4.9, wps=407773, ups=0.94, wpb=433740, bsz=16524.2, num_updates=56100, lr=0.000267023, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=55117 epoch 034: 451 / 1689 loss=3.939, nll_loss=2.294, ppl=4.9, wps=407773, ups=0.94, wpb=433740, bsz=16524.2, num_updates=56100, lr=0.000267023, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=55117 epoch 034: 451 / 1689 loss=3.939, nll_loss=2.294, ppl=4.9, wps=407773, ups=0.94, wpb=433740, bsz=16524.2, num_updates=56100, lr=0.000267023, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=55117 epoch 034: 451 / 1689 loss=3.939, nll_loss=2.294, ppl=4.9, wps=407773, ups=0.94, wpb=433740, bsz=16524.2, num_updates=56100, lr=0.000267023, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=55117 epoch 034: 451 / 1689 loss=3.939, nll_loss=2.294, ppl=4.9, wps=407773, ups=0.94, wpb=433740, bsz=16524.2, num_updates=56100, lr=0.000267023, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=55117 epoch 034: 451 / 1689 loss=3.939, nll_loss=2.294, ppl=4.9, wps=407773, ups=0.94, wpb=433740, bsz=16524.2, num_updates=56100, lr=0.000267023, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=55117 epoch 034: 451 / 1689 loss=3.939, nll_loss=2.294, ppl=4.9, wps=407773, ups=0.94, wpb=433740, bsz=16524.2, num_updates=56100, lr=0.000267023, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=55117 epoch 034: 451 / 1689 loss=3.939, nll_loss=2.294, ppl=4.9, wps=407773, ups=0.94, wpb=433740, bsz=16524.2, num_updates=56100, lr=0.000267023, gnorm=0.209, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=55117 epoch 034: 551 / 1689 loss=3.935, nll_loss=2.29, ppl=4.89, wps=458792, ups=1.06, wpb=433239, bsz=16571.3, num_updates=56200, lr=0.000266785, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=55212 epoch 034: 551 / 1689 loss=3.935, nll_loss=2.29, ppl=4.89, wps=458792, ups=1.06, wpb=433239, bsz=16571.3, num_updates=56200, lr=0.000266785, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=55212 epoch 034: 551 / 1689 loss=3.935, nll_loss=2.29, ppl=4.89, wps=458792, ups=1.06, wpb=433239, bsz=16571.3, num_updates=56200, lr=0.000266785, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=55212 epoch 034: 551 / 1689 loss=3.935, nll_loss=2.29, ppl=4.89, wps=458792, ups=1.06, wpb=433239, bsz=16571.3, num_updates=56200, lr=0.000266785, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=55212 epoch 034: 551 / 1689 loss=3.935, nll_loss=2.29, ppl=4.89, wps=458792, ups=1.06, wpb=433239, bsz=16571.3, num_updates=56200, lr=0.000266785, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=55212 epoch 034: 551 / 1689 loss=3.935, nll_loss=2.29, ppl=4.89, wps=458792, ups=1.06, wpb=433239, bsz=16571.3, num_updates=56200, lr=0.000266785, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=55212 epoch 034: 551 / 1689 loss=3.935, nll_loss=2.29, ppl=4.89, wps=458792, ups=1.06, wpb=433239, bsz=16571.3, num_updates=56200, lr=0.000266785, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=55212 epoch 034: 551 / 1689 loss=3.935, nll_loss=2.29, ppl=4.89, wps=458792, ups=1.06, wpb=433239, bsz=16571.3, num_updates=56200, lr=0.000266785, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=55212 epoch 034: 551 / 1689 loss=3.935, nll_loss=2.29, ppl=4.89, wps=458792, ups=1.06, wpb=433239, bsz=16571.3, num_updates=56200, lr=0.000266785, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=55212 epoch 034: 551 / 1689 loss=3.935, nll_loss=2.29, ppl=4.89, wps=458792, ups=1.06, wpb=433239, bsz=16571.3, num_updates=56200, lr=0.000266785, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=55212 epoch 034: 551 / 1689 loss=3.935, nll_loss=2.29, ppl=4.89, wps=458792, ups=1.06, wpb=433239, bsz=16571.3, num_updates=56200, lr=0.000266785, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=55212 epoch 034: 551 / 1689 loss=3.935, nll_loss=2.29, ppl=4.89, wps=458792, ups=1.06, wpb=433239, bsz=16571.3, num_updates=56200, lr=0.000266785, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=55212 epoch 034: 551 / 1689 loss=3.935, nll_loss=2.29, ppl=4.89, wps=458792, ups=1.06, wpb=433239, bsz=16571.3, num_updates=56200, lr=0.000266785, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=55212 epoch 034: 551 / 1689 loss=3.935, nll_loss=2.29, ppl=4.89, wps=458792, ups=1.06, wpb=433239, bsz=16571.3, num_updates=56200, lr=0.000266785, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=55212 epoch 034: 551 / 1689 loss=3.935, nll_loss=2.29, ppl=4.89, wps=458792, ups=1.06, wpb=433239, bsz=16571.3, num_updates=56200, lr=0.000266785, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=55212 epoch 034: 551 / 1689 loss=3.935, nll_loss=2.29, ppl=4.89, wps=458792, ups=1.06, wpb=433239, bsz=16571.3, num_updates=56200, lr=0.000266785, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=55212 epoch 034: 551 / 1689 loss=3.935, nll_loss=2.29, ppl=4.89, wps=458792, ups=1.06, wpb=433239, bsz=16571.3, num_updates=56200, lr=0.000266785, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=55212 epoch 034: 551 / 1689 loss=3.935, nll_loss=2.29, ppl=4.89, wps=458792, ups=1.06, wpb=433239, bsz=16571.3, num_updates=56200, lr=0.000266785, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=55212 epoch 034: 551 / 1689 loss=3.935, nll_loss=2.29, ppl=4.89, wps=458792, ups=1.06, wpb=433239, bsz=16571.3, num_updates=56200, lr=0.000266785, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=55212 epoch 034: 551 / 1689 loss=3.935, nll_loss=2.29, ppl=4.89, wps=458792, ups=1.06, wpb=433239, bsz=16571.3, num_updates=56200, lr=0.000266785, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=55212 epoch 034: 551 / 1689 loss=3.935, nll_loss=2.29, ppl=4.89, wps=458792, ups=1.06, wpb=433239, bsz=16571.3, num_updates=56200, lr=0.000266785, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=55212 epoch 034: 551 / 1689 loss=3.935, nll_loss=2.29, ppl=4.89, wps=458792, ups=1.06, wpb=433239, bsz=16571.3, num_updates=56200, lr=0.000266785, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=55212 epoch 034: 551 / 1689 loss=3.935, nll_loss=2.29, ppl=4.89, wps=458792, ups=1.06, wpb=433239, bsz=16571.3, num_updates=56200, lr=0.000266785, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=55212 epoch 034: 551 / 1689 loss=3.935, nll_loss=2.29, ppl=4.89, wps=458792, ups=1.06, wpb=433239, bsz=16571.3, num_updates=56200, lr=0.000266785, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=55212 epoch 034: 551 / 1689 loss=3.935, nll_loss=2.29, ppl=4.89, wps=458792, ups=1.06, wpb=433239, bsz=16571.3, num_updates=56200, lr=0.000266785, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=55212 epoch 034: 551 / 1689 loss=3.935, nll_loss=2.29, ppl=4.89, wps=458792, ups=1.06, wpb=433239, bsz=16571.3, num_updates=56200, lr=0.000266785, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=55212 epoch 034: 551 / 1689 loss=3.935, nll_loss=2.29, ppl=4.89, wps=458792, ups=1.06, wpb=433239, bsz=16571.3, num_updates=56200, lr=0.000266785, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=55212 epoch 034: 551 / 1689 loss=3.935, nll_loss=2.29, ppl=4.89, wps=458792, ups=1.06, wpb=433239, bsz=16571.3, num_updates=56200, lr=0.000266785, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=55212 epoch 034: 551 / 1689 loss=3.935, nll_loss=2.29, ppl=4.89, wps=458792, ups=1.06, wpb=433239, bsz=16571.3, num_updates=56200, lr=0.000266785, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=55212 epoch 034: 551 / 1689 loss=3.935, nll_loss=2.29, ppl=4.89, wps=458792, ups=1.06, wpb=433239, bsz=16571.3, num_updates=56200, lr=0.000266785, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=55212 epoch 034: 551 / 1689 loss=3.935, nll_loss=2.29, ppl=4.89, wps=458792, ups=1.06, wpb=433239, bsz=16571.3, num_updates=56200, lr=0.000266785, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=55212 epoch 034: 551 / 1689 loss=3.935, nll_loss=2.29, ppl=4.89, wps=458792, ups=1.06, wpb=433239, bsz=16571.3, num_updates=56200, lr=0.000266785, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=55212 epoch 034: 551 / 1689 loss=3.935, nll_loss=2.29, ppl=4.89, wps=458792, ups=1.06, wpb=433239, bsz=16571.3, num_updates=56200, lr=0.000266785, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=55212 epoch 034: 551 / 1689 loss=3.935, nll_loss=2.29, ppl=4.89, wps=458792, ups=1.06, wpb=433239, bsz=16571.3, num_updates=56200, lr=0.000266785, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=55212 epoch 034: 651 / 1689 loss=3.948, nll_loss=2.304, ppl=4.94, wps=461240, ups=1.06, wpb=433492, bsz=16686.2, num_updates=56300, lr=0.000266548, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=55306 epoch 034: 651 / 1689 loss=3.948, nll_loss=2.304, ppl=4.94, wps=461240, ups=1.06, wpb=433492, bsz=16686.2, num_updates=56300, lr=0.000266548, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=55306 epoch 034: 651 / 1689 loss=3.948, nll_loss=2.304, ppl=4.94, wps=461240, ups=1.06, wpb=433492, bsz=16686.2, num_updates=56300, lr=0.000266548, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=55306 epoch 034: 651 / 1689 loss=3.948, nll_loss=2.304, ppl=4.94, wps=461240, ups=1.06, wpb=433492, bsz=16686.2, num_updates=56300, lr=0.000266548, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=55306 epoch 034: 651 / 1689 loss=3.948, nll_loss=2.304, ppl=4.94, wps=461240, ups=1.06, wpb=433492, bsz=16686.2, num_updates=56300, lr=0.000266548, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=55306 epoch 034: 651 / 1689 loss=3.948, nll_loss=2.304, ppl=4.94, wps=461240, ups=1.06, wpb=433492, bsz=16686.2, num_updates=56300, lr=0.000266548, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=55306 epoch 034: 651 / 1689 loss=3.948, nll_loss=2.304, ppl=4.94, wps=461240, ups=1.06, wpb=433492, bsz=16686.2, num_updates=56300, lr=0.000266548, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=55306 epoch 034: 651 / 1689 loss=3.948, nll_loss=2.304, ppl=4.94, wps=461240, ups=1.06, wpb=433492, bsz=16686.2, num_updates=56300, lr=0.000266548, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=55306 epoch 034: 651 / 1689 loss=3.948, nll_loss=2.304, ppl=4.94, wps=461240, ups=1.06, wpb=433492, bsz=16686.2, num_updates=56300, lr=0.000266548, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=55306 epoch 034: 651 / 1689 loss=3.948, nll_loss=2.304, ppl=4.94, wps=461240, ups=1.06, wpb=433492, bsz=16686.2, num_updates=56300, lr=0.000266548, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=55306 epoch 034: 651 / 1689 loss=3.948, nll_loss=2.304, ppl=4.94, wps=461240, ups=1.06, wpb=433492, bsz=16686.2, num_updates=56300, lr=0.000266548, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=55306 epoch 034: 651 / 1689 loss=3.948, nll_loss=2.304, ppl=4.94, wps=461240, ups=1.06, wpb=433492, bsz=16686.2, num_updates=56300, lr=0.000266548, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=55306 epoch 034: 651 / 1689 loss=3.948, nll_loss=2.304, ppl=4.94, wps=461240, ups=1.06, wpb=433492, bsz=16686.2, num_updates=56300, lr=0.000266548, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=55306 epoch 034: 651 / 1689 loss=3.948, nll_loss=2.304, ppl=4.94, wps=461240, ups=1.06, wpb=433492, bsz=16686.2, num_updates=56300, lr=0.000266548, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=55306 epoch 034: 651 / 1689 loss=3.948, nll_loss=2.304, ppl=4.94, wps=461240, ups=1.06, wpb=433492, bsz=16686.2, num_updates=56300, lr=0.000266548, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=55306 epoch 034: 651 / 1689 loss=3.948, nll_loss=2.304, ppl=4.94, wps=461240, ups=1.06, wpb=433492, bsz=16686.2, num_updates=56300, lr=0.000266548, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=55306 epoch 034: 651 / 1689 loss=3.948, nll_loss=2.304, ppl=4.94, wps=461240, ups=1.06, wpb=433492, bsz=16686.2, num_updates=56300, lr=0.000266548, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=55306 epoch 034: 651 / 1689 loss=3.948, nll_loss=2.304, ppl=4.94, wps=461240, ups=1.06, wpb=433492, bsz=16686.2, num_updates=56300, lr=0.000266548, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=55306 epoch 034: 651 / 1689 loss=3.948, nll_loss=2.304, ppl=4.94, wps=461240, ups=1.06, wpb=433492, bsz=16686.2, num_updates=56300, lr=0.000266548, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=55306 epoch 034: 651 / 1689 loss=3.948, nll_loss=2.304, ppl=4.94, wps=461240, ups=1.06, wpb=433492, bsz=16686.2, num_updates=56300, lr=0.000266548, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=55306 epoch 034: 651 / 1689 loss=3.948, nll_loss=2.304, ppl=4.94, wps=461240, ups=1.06, wpb=433492, bsz=16686.2, num_updates=56300, lr=0.000266548, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=55306 epoch 034: 651 / 1689 loss=3.948, nll_loss=2.304, ppl=4.94, wps=461240, ups=1.06, wpb=433492, bsz=16686.2, num_updates=56300, lr=0.000266548, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=55306 epoch 034: 651 / 1689 loss=3.948, nll_loss=2.304, ppl=4.94, wps=461240, ups=1.06, wpb=433492, bsz=16686.2, num_updates=56300, lr=0.000266548, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=55306 epoch 034: 651 / 1689 loss=3.948, nll_loss=2.304, ppl=4.94, wps=461240, ups=1.06, wpb=433492, bsz=16686.2, num_updates=56300, lr=0.000266548, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=55306 epoch 034: 651 / 1689 loss=3.948, nll_loss=2.304, ppl=4.94, wps=461240, ups=1.06, wpb=433492, bsz=16686.2, num_updates=56300, lr=0.000266548, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=55306 epoch 034: 651 / 1689 loss=3.948, nll_loss=2.304, ppl=4.94, wps=461240, ups=1.06, wpb=433492, bsz=16686.2, num_updates=56300, lr=0.000266548, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=55306 epoch 034: 651 / 1689 loss=3.948, nll_loss=2.304, ppl=4.94, wps=461240, ups=1.06, wpb=433492, bsz=16686.2, num_updates=56300, lr=0.000266548, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=55306 epoch 034: 651 / 1689 loss=3.948, nll_loss=2.304, ppl=4.94, wps=461240, ups=1.06, wpb=433492, bsz=16686.2, num_updates=56300, lr=0.000266548, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=55306 epoch 034: 651 / 1689 loss=3.948, nll_loss=2.304, ppl=4.94, wps=461240, ups=1.06, wpb=433492, bsz=16686.2, num_updates=56300, lr=0.000266548, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=55306 epoch 034: 651 / 1689 loss=3.948, nll_loss=2.304, ppl=4.94, wps=461240, ups=1.06, wpb=433492, bsz=16686.2, num_updates=56300, lr=0.000266548, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=55306 epoch 034: 651 / 1689 loss=3.948, nll_loss=2.304, ppl=4.94, wps=461240, ups=1.06, wpb=433492, bsz=16686.2, num_updates=56300, lr=0.000266548, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=55306 epoch 034: 651 / 1689 loss=3.948, nll_loss=2.304, ppl=4.94, wps=461240, ups=1.06, wpb=433492, bsz=16686.2, num_updates=56300, lr=0.000266548, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=55306 epoch 034: 651 / 1689 loss=3.948, nll_loss=2.304, ppl=4.94, wps=461240, ups=1.06, wpb=433492, bsz=16686.2, num_updates=56300, lr=0.000266548, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=55306 epoch 034: 651 / 1689 loss=3.948, nll_loss=2.304, ppl=4.94, wps=461240, ups=1.06, wpb=433492, bsz=16686.2, num_updates=56300, lr=0.000266548, gnorm=0.217, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=55306 epoch 034: 751 / 1689 loss=3.937, nll_loss=2.292, ppl=4.9, wps=459566, ups=1.06, wpb=433280, bsz=16693.2, num_updates=56400, lr=0.000266312, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=55400 epoch 034: 751 / 1689 loss=3.937, nll_loss=2.292, ppl=4.9, wps=459566, ups=1.06, wpb=433280, bsz=16693.2, num_updates=56400, lr=0.000266312, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=55400 epoch 034: 751 / 1689 loss=3.937, nll_loss=2.292, ppl=4.9, wps=459566, ups=1.06, wpb=433280, bsz=16693.2, num_updates=56400, lr=0.000266312, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=55400 epoch 034: 751 / 1689 loss=3.937, nll_loss=2.292, ppl=4.9, wps=459566, ups=1.06, wpb=433280, bsz=16693.2, num_updates=56400, lr=0.000266312, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=55400 epoch 034: 751 / 1689 loss=3.937, nll_loss=2.292, ppl=4.9, wps=459566, ups=1.06, wpb=433280, bsz=16693.2, num_updates=56400, lr=0.000266312, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=55400 epoch 034: 751 / 1689 loss=3.937, nll_loss=2.292, ppl=4.9, wps=459566, ups=1.06, wpb=433280, bsz=16693.2, num_updates=56400, lr=0.000266312, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=55400 epoch 034: 751 / 1689 loss=3.937, nll_loss=2.292, ppl=4.9, wps=459566, ups=1.06, wpb=433280, bsz=16693.2, num_updates=56400, lr=0.000266312, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=55400 epoch 034: 751 / 1689 loss=3.937, nll_loss=2.292, ppl=4.9, wps=459566, ups=1.06, wpb=433280, bsz=16693.2, num_updates=56400, lr=0.000266312, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=55400 epoch 034: 751 / 1689 loss=3.937, nll_loss=2.292, ppl=4.9, wps=459566, ups=1.06, wpb=433280, bsz=16693.2, num_updates=56400, lr=0.000266312, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=55400 epoch 034: 751 / 1689 loss=3.937, nll_loss=2.292, ppl=4.9, wps=459566, ups=1.06, wpb=433280, bsz=16693.2, num_updates=56400, lr=0.000266312, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=55400 epoch 034: 751 / 1689 loss=3.937, nll_loss=2.292, ppl=4.9, wps=459566, ups=1.06, wpb=433280, bsz=16693.2, num_updates=56400, lr=0.000266312, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=55400 epoch 034: 751 / 1689 loss=3.937, nll_loss=2.292, ppl=4.9, wps=459566, ups=1.06, wpb=433280, bsz=16693.2, num_updates=56400, lr=0.000266312, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=55400 epoch 034: 751 / 1689 loss=3.937, nll_loss=2.292, ppl=4.9, wps=459566, ups=1.06, wpb=433280, bsz=16693.2, num_updates=56400, lr=0.000266312, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=55400 epoch 034: 751 / 1689 loss=3.937, nll_loss=2.292, ppl=4.9, wps=459566, ups=1.06, wpb=433280, bsz=16693.2, num_updates=56400, lr=0.000266312, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=55400 epoch 034: 751 / 1689 loss=3.937, nll_loss=2.292, ppl=4.9, wps=459566, ups=1.06, wpb=433280, bsz=16693.2, num_updates=56400, lr=0.000266312, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=55400 epoch 034: 751 / 1689 loss=3.937, nll_loss=2.292, ppl=4.9, wps=459566, ups=1.06, wpb=433280, bsz=16693.2, num_updates=56400, lr=0.000266312, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=55400 epoch 034: 751 / 1689 loss=3.937, nll_loss=2.292, ppl=4.9, wps=459566, ups=1.06, wpb=433280, bsz=16693.2, num_updates=56400, lr=0.000266312, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=55400 epoch 034: 751 / 1689 loss=3.937, nll_loss=2.292, ppl=4.9, wps=459566, ups=1.06, wpb=433280, bsz=16693.2, num_updates=56400, lr=0.000266312, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=55400 epoch 034: 751 / 1689 loss=3.937, nll_loss=2.292, ppl=4.9, wps=459566, ups=1.06, wpb=433280, bsz=16693.2, num_updates=56400, lr=0.000266312, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=55400 epoch 034: 751 / 1689 loss=3.937, nll_loss=2.292, ppl=4.9, wps=459566, ups=1.06, wpb=433280, bsz=16693.2, num_updates=56400, lr=0.000266312, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=55400 epoch 034: 751 / 1689 loss=3.937, nll_loss=2.292, ppl=4.9, wps=459566, ups=1.06, wpb=433280, bsz=16693.2, num_updates=56400, lr=0.000266312, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=55400 epoch 034: 751 / 1689 loss=3.937, nll_loss=2.292, ppl=4.9, wps=459566, ups=1.06, wpb=433280, bsz=16693.2, num_updates=56400, lr=0.000266312, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=55400 epoch 034: 751 / 1689 loss=3.937, nll_loss=2.292, ppl=4.9, wps=459566, ups=1.06, wpb=433280, bsz=16693.2, num_updates=56400, lr=0.000266312, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=55400 epoch 034: 751 / 1689 loss=3.937, nll_loss=2.292, ppl=4.9, wps=459566, ups=1.06, wpb=433280, bsz=16693.2, num_updates=56400, lr=0.000266312, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=55400 epoch 034: 751 / 1689 loss=3.937, nll_loss=2.292, ppl=4.9, wps=459566, ups=1.06, wpb=433280, bsz=16693.2, num_updates=56400, lr=0.000266312, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=55400 epoch 034: 751 / 1689 loss=3.937, nll_loss=2.292, ppl=4.9, wps=459566, ups=1.06, wpb=433280, bsz=16693.2, num_updates=56400, lr=0.000266312, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=55400 epoch 034: 751 / 1689 loss=3.937, nll_loss=2.292, ppl=4.9, wps=459566, ups=1.06, wpb=433280, bsz=16693.2, num_updates=56400, lr=0.000266312, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=55400 epoch 034: 751 / 1689 loss=3.937, nll_loss=2.292, ppl=4.9, wps=459566, ups=1.06, wpb=433280, bsz=16693.2, num_updates=56400, lr=0.000266312, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=55400 epoch 034: 751 / 1689 loss=3.937, nll_loss=2.292, ppl=4.9, wps=459566, ups=1.06, wpb=433280, bsz=16693.2, num_updates=56400, lr=0.000266312, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=55400 epoch 034: 751 / 1689 loss=3.937, nll_loss=2.292, ppl=4.9, wps=459566, ups=1.06, wpb=433280, bsz=16693.2, num_updates=56400, lr=0.000266312, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=55400 epoch 034: 751 / 1689 loss=3.937, nll_loss=2.292, ppl=4.9, wps=459566, ups=1.06, wpb=433280, bsz=16693.2, num_updates=56400, lr=0.000266312, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=55400 epoch 034: 751 / 1689 loss=3.937, nll_loss=2.292, ppl=4.9, wps=459566, ups=1.06, wpb=433280, bsz=16693.2, num_updates=56400, lr=0.000266312, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=55400 epoch 034: 751 / 1689 loss=3.937, nll_loss=2.292, ppl=4.9, wps=459566, ups=1.06, wpb=433280, bsz=16693.2, num_updates=56400, lr=0.000266312, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=55400 epoch 034: 751 / 1689 loss=3.937, nll_loss=2.292, ppl=4.9, wps=459566, ups=1.06, wpb=433280, bsz=16693.2, num_updates=56400, lr=0.000266312, gnorm=0.216, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.7, wall=55400 epoch 034: 851 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=459835, ups=1.06, wpb=433674, bsz=16439.5, num_updates=56500, lr=0.000266076, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=55495 epoch 034: 851 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=459835, ups=1.06, wpb=433674, bsz=16439.5, num_updates=56500, lr=0.000266076, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=55495 epoch 034: 851 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=459835, ups=1.06, wpb=433674, bsz=16439.5, num_updates=56500, lr=0.000266076, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=55495 epoch 034: 851 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=459835, ups=1.06, wpb=433674, bsz=16439.5, num_updates=56500, lr=0.000266076, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=55495 epoch 034: 851 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=459835, ups=1.06, wpb=433674, bsz=16439.5, num_updates=56500, lr=0.000266076, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=55495 epoch 034: 851 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=459835, ups=1.06, wpb=433674, bsz=16439.5, num_updates=56500, lr=0.000266076, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=55495 epoch 034: 851 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=459835, ups=1.06, wpb=433674, bsz=16439.5, num_updates=56500, lr=0.000266076, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=55495 epoch 034: 851 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=459835, ups=1.06, wpb=433674, bsz=16439.5, num_updates=56500, lr=0.000266076, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=55495 epoch 034: 851 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=459835, ups=1.06, wpb=433674, bsz=16439.5, num_updates=56500, lr=0.000266076, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=55495 epoch 034: 851 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=459835, ups=1.06, wpb=433674, bsz=16439.5, num_updates=56500, lr=0.000266076, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=55495 epoch 034: 851 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=459835, ups=1.06, wpb=433674, bsz=16439.5, num_updates=56500, lr=0.000266076, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=55495 epoch 034: 851 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=459835, ups=1.06, wpb=433674, bsz=16439.5, num_updates=56500, lr=0.000266076, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=55495 epoch 034: 851 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=459835, ups=1.06, wpb=433674, bsz=16439.5, num_updates=56500, lr=0.000266076, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=55495 epoch 034: 851 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=459835, ups=1.06, wpb=433674, bsz=16439.5, num_updates=56500, lr=0.000266076, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=55495 epoch 034: 851 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=459835, ups=1.06, wpb=433674, bsz=16439.5, num_updates=56500, lr=0.000266076, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=55495 epoch 034: 851 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=459835, ups=1.06, wpb=433674, bsz=16439.5, num_updates=56500, lr=0.000266076, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=55495 epoch 034: 851 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=459835, ups=1.06, wpb=433674, bsz=16439.5, num_updates=56500, lr=0.000266076, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=55495 epoch 034: 851 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=459835, ups=1.06, wpb=433674, bsz=16439.5, num_updates=56500, lr=0.000266076, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=55495 epoch 034: 851 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=459835, ups=1.06, wpb=433674, bsz=16439.5, num_updates=56500, lr=0.000266076, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=55495 epoch 034: 851 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=459835, ups=1.06, wpb=433674, bsz=16439.5, num_updates=56500, lr=0.000266076, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=55495 epoch 034: 851 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=459835, ups=1.06, wpb=433674, bsz=16439.5, num_updates=56500, lr=0.000266076, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=55495 epoch 034: 851 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=459835, ups=1.06, wpb=433674, bsz=16439.5, num_updates=56500, lr=0.000266076, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=55495 epoch 034: 851 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=459835, ups=1.06, wpb=433674, bsz=16439.5, num_updates=56500, lr=0.000266076, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=55495 epoch 034: 851 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=459835, ups=1.06, wpb=433674, bsz=16439.5, num_updates=56500, lr=0.000266076, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=55495 epoch 034: 851 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=459835, ups=1.06, wpb=433674, bsz=16439.5, num_updates=56500, lr=0.000266076, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=55495 epoch 034: 851 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=459835, ups=1.06, wpb=433674, bsz=16439.5, num_updates=56500, lr=0.000266076, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=55495 epoch 034: 851 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=459835, ups=1.06, wpb=433674, bsz=16439.5, num_updates=56500, lr=0.000266076, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=55495 epoch 034: 851 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=459835, ups=1.06, wpb=433674, bsz=16439.5, num_updates=56500, lr=0.000266076, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=55495 epoch 034: 851 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=459835, ups=1.06, wpb=433674, bsz=16439.5, num_updates=56500, lr=0.000266076, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=55495 epoch 034: 851 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=459835, ups=1.06, wpb=433674, bsz=16439.5, num_updates=56500, lr=0.000266076, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=55495 epoch 034: 851 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=459835, ups=1.06, wpb=433674, bsz=16439.5, num_updates=56500, lr=0.000266076, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=55495 epoch 034: 851 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=459835, ups=1.06, wpb=433674, bsz=16439.5, num_updates=56500, lr=0.000266076, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=55495 epoch 034: 851 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=459835, ups=1.06, wpb=433674, bsz=16439.5, num_updates=56500, lr=0.000266076, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=55495 epoch 034: 851 / 1689 loss=3.946, nll_loss=2.302, ppl=4.93, wps=459835, ups=1.06, wpb=433674, bsz=16439.5, num_updates=56500, lr=0.000266076, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=55495 epoch 034: 952 / 1689 loss=3.944, nll_loss=2.301, ppl=4.93, wps=456048, ups=1.05, wpb=433744, bsz=16472.2, num_updates=56600, lr=0.000265841, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=55590 epoch 034: 952 / 1689 loss=3.944, nll_loss=2.301, ppl=4.93, wps=456048, ups=1.05, wpb=433744, bsz=16472.2, num_updates=56600, lr=0.000265841, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=55590 epoch 034: 952 / 1689 loss=3.944, nll_loss=2.301, ppl=4.93, wps=456048, ups=1.05, wpb=433744, bsz=16472.2, num_updates=56600, lr=0.000265841, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=55590 epoch 034: 952 / 1689 loss=3.944, nll_loss=2.301, ppl=4.93, wps=456048, ups=1.05, wpb=433744, bsz=16472.2, num_updates=56600, lr=0.000265841, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=55590 epoch 034: 952 / 1689 loss=3.944, nll_loss=2.301, ppl=4.93, wps=456048, ups=1.05, wpb=433744, bsz=16472.2, num_updates=56600, lr=0.000265841, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=55590 epoch 034: 952 / 1689 loss=3.944, nll_loss=2.301, ppl=4.93, wps=456048, ups=1.05, wpb=433744, bsz=16472.2, num_updates=56600, lr=0.000265841, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=55590 epoch 034: 952 / 1689 loss=3.944, nll_loss=2.301, ppl=4.93, wps=456048, ups=1.05, wpb=433744, bsz=16472.2, num_updates=56600, lr=0.000265841, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=55590 epoch 034: 952 / 1689 loss=3.944, nll_loss=2.301, ppl=4.93, wps=456048, ups=1.05, wpb=433744, bsz=16472.2, num_updates=56600, lr=0.000265841, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=55590 epoch 034: 952 / 1689 loss=3.944, nll_loss=2.301, ppl=4.93, wps=456048, ups=1.05, wpb=433744, bsz=16472.2, num_updates=56600, lr=0.000265841, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=55590 epoch 034: 952 / 1689 loss=3.944, nll_loss=2.301, ppl=4.93, wps=456048, ups=1.05, wpb=433744, bsz=16472.2, num_updates=56600, lr=0.000265841, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=55590 epoch 034: 952 / 1689 loss=3.944, nll_loss=2.301, ppl=4.93, wps=456048, ups=1.05, wpb=433744, bsz=16472.2, num_updates=56600, lr=0.000265841, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=55590 epoch 034: 952 / 1689 loss=3.944, nll_loss=2.301, ppl=4.93, wps=456048, ups=1.05, wpb=433744, bsz=16472.2, num_updates=56600, lr=0.000265841, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=55590 epoch 034: 952 / 1689 loss=3.944, nll_loss=2.301, ppl=4.93, wps=456048, ups=1.05, wpb=433744, bsz=16472.2, num_updates=56600, lr=0.000265841, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=55590 epoch 034: 952 / 1689 loss=3.944, nll_loss=2.301, ppl=4.93, wps=456048, ups=1.05, wpb=433744, bsz=16472.2, num_updates=56600, lr=0.000265841, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=55590 epoch 034: 952 / 1689 loss=3.944, nll_loss=2.301, ppl=4.93, wps=456048, ups=1.05, wpb=433744, bsz=16472.2, num_updates=56600, lr=0.000265841, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=55590 epoch 034: 952 / 1689 loss=3.944, nll_loss=2.301, ppl=4.93, wps=456048, ups=1.05, wpb=433744, bsz=16472.2, num_updates=56600, lr=0.000265841, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=55590 epoch 034: 952 / 1689 loss=3.944, nll_loss=2.301, ppl=4.93, wps=456048, ups=1.05, wpb=433744, bsz=16472.2, num_updates=56600, lr=0.000265841, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=55590 epoch 034: 952 / 1689 loss=3.944, nll_loss=2.301, ppl=4.93, wps=456048, ups=1.05, wpb=433744, bsz=16472.2, num_updates=56600, lr=0.000265841, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=55590 epoch 034: 952 / 1689 loss=3.944, nll_loss=2.301, ppl=4.93, wps=456048, ups=1.05, wpb=433744, bsz=16472.2, num_updates=56600, lr=0.000265841, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=55590 epoch 034: 952 / 1689 loss=3.944, nll_loss=2.301, ppl=4.93, wps=456048, ups=1.05, wpb=433744, bsz=16472.2, num_updates=56600, lr=0.000265841, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=55590 epoch 034: 952 / 1689 loss=3.944, nll_loss=2.301, ppl=4.93, wps=456048, ups=1.05, wpb=433744, bsz=16472.2, num_updates=56600, lr=0.000265841, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=55590 epoch 034: 952 / 1689 loss=3.944, nll_loss=2.301, ppl=4.93, wps=456048, ups=1.05, wpb=433744, bsz=16472.2, num_updates=56600, lr=0.000265841, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=55590 epoch 034: 952 / 1689 loss=3.944, nll_loss=2.301, ppl=4.93, wps=456048, ups=1.05, wpb=433744, bsz=16472.2, num_updates=56600, lr=0.000265841, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=55590 epoch 034: 952 / 1689 loss=3.944, nll_loss=2.301, ppl=4.93, wps=456048, ups=1.05, wpb=433744, bsz=16472.2, num_updates=56600, lr=0.000265841, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=55590 epoch 034: 952 / 1689 loss=3.944, nll_loss=2.301, ppl=4.93, wps=456048, ups=1.05, wpb=433744, bsz=16472.2, num_updates=56600, lr=0.000265841, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=55590 epoch 034: 952 / 1689 loss=3.944, nll_loss=2.301, ppl=4.93, wps=456048, ups=1.05, wpb=433744, bsz=16472.2, num_updates=56600, lr=0.000265841, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=55590 epoch 034: 952 / 1689 loss=3.944, nll_loss=2.301, ppl=4.93, wps=456048, ups=1.05, wpb=433744, bsz=16472.2, num_updates=56600, lr=0.000265841, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=55590 epoch 034: 952 / 1689 loss=3.944, nll_loss=2.301, ppl=4.93, wps=456048, ups=1.05, wpb=433744, bsz=16472.2, num_updates=56600, lr=0.000265841, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=55590 epoch 034: 952 / 1689 loss=3.944, nll_loss=2.301, ppl=4.93, wps=456048, ups=1.05, wpb=433744, bsz=16472.2, num_updates=56600, lr=0.000265841, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=55590 epoch 034: 952 / 1689 loss=3.944, nll_loss=2.301, ppl=4.93, wps=456048, ups=1.05, wpb=433744, bsz=16472.2, num_updates=56600, lr=0.000265841, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=55590 epoch 034: 952 / 1689 loss=3.944, nll_loss=2.301, ppl=4.93, wps=456048, ups=1.05, wpb=433744, bsz=16472.2, num_updates=56600, lr=0.000265841, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=55590 epoch 034: 952 / 1689 loss=3.944, nll_loss=2.301, ppl=4.93, wps=456048, ups=1.05, wpb=433744, bsz=16472.2, num_updates=56600, lr=0.000265841, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=55590 epoch 034: 952 / 1689 loss=3.944, nll_loss=2.301, ppl=4.93, wps=456048, ups=1.05, wpb=433744, bsz=16472.2, num_updates=56600, lr=0.000265841, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=55590 epoch 034: 952 / 1689 loss=3.944, nll_loss=2.301, ppl=4.93, wps=456048, ups=1.05, wpb=433744, bsz=16472.2, num_updates=56600, lr=0.000265841, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.5, wall=55590 epoch 034: 1053 / 1689 loss=3.94, nll_loss=2.296, ppl=4.91, wps=452745, ups=1.05, wpb=432070, bsz=16217, num_updates=56700, lr=0.000265606, gnorm=0.218, clip=0, loss_scale=0.25, train_wall=94, gb_free=18.8, wall=55685 epoch 034: 1053 / 1689 loss=3.94, nll_loss=2.296, ppl=4.91, wps=452745, ups=1.05, wpb=432070, bsz=16217, num_updates=56700, lr=0.000265606, gnorm=0.218, clip=0, loss_scale=0.25, train_wall=94, gb_free=18.8, wall=55685 epoch 034: 1053 / 1689 loss=3.94, nll_loss=2.296, ppl=4.91, wps=452745, ups=1.05, wpb=432070, bsz=16217, num_updates=56700, lr=0.000265606, gnorm=0.218, clip=0, loss_scale=0.25, train_wall=94, gb_free=18.8, wall=55685 epoch 034: 1053 / 1689 loss=3.94, nll_loss=2.296, ppl=4.91, wps=452745, ups=1.05, wpb=432070, bsz=16217, num_updates=56700, lr=0.000265606, gnorm=0.218, clip=0, loss_scale=0.25, train_wall=94, gb_free=18.8, wall=55685 epoch 034: 1053 / 1689 loss=3.94, nll_loss=2.296, ppl=4.91, wps=452745, ups=1.05, wpb=432070, bsz=16217, num_updates=56700, lr=0.000265606, gnorm=0.218, clip=0, loss_scale=0.25, train_wall=94, gb_free=18.8, wall=55685 epoch 034: 1053 / 1689 loss=3.94, nll_loss=2.296, ppl=4.91, wps=452745, ups=1.05, wpb=432070, bsz=16217, num_updates=56700, lr=0.000265606, gnorm=0.218, clip=0, loss_scale=0.25, train_wall=94, gb_free=18.8, wall=55685 epoch 034: 1053 / 1689 loss=3.94, nll_loss=2.296, ppl=4.91, wps=452745, ups=1.05, wpb=432070, bsz=16217, num_updates=56700, lr=0.000265606, gnorm=0.218, clip=0, loss_scale=0.25, train_wall=94, gb_free=18.8, wall=55685 epoch 034: 1053 / 1689 loss=3.94, nll_loss=2.296, ppl=4.91, wps=452745, ups=1.05, wpb=432070, bsz=16217, num_updates=56700, lr=0.000265606, gnorm=0.218, clip=0, loss_scale=0.25, train_wall=94, gb_free=18.8, wall=55685 epoch 034: 1053 / 1689 loss=3.94, nll_loss=2.296, ppl=4.91, wps=452745, ups=1.05, wpb=432070, bsz=16217, num_updates=56700, lr=0.000265606, gnorm=0.218, clip=0, loss_scale=0.25, train_wall=94, gb_free=18.8, wall=55685 epoch 034: 1053 / 1689 loss=3.94, nll_loss=2.296, ppl=4.91, wps=452745, ups=1.05, wpb=432070, bsz=16217, num_updates=56700, lr=0.000265606, gnorm=0.218, clip=0, loss_scale=0.25, train_wall=94, gb_free=18.8, wall=55685 epoch 034: 1053 / 1689 loss=3.94, nll_loss=2.296, ppl=4.91, wps=452745, ups=1.05, wpb=432070, bsz=16217, num_updates=56700, lr=0.000265606, gnorm=0.218, clip=0, loss_scale=0.25, train_wall=94, gb_free=18.8, wall=55685 epoch 034: 1053 / 1689 loss=3.94, nll_loss=2.296, ppl=4.91, wps=452745, ups=1.05, wpb=432070, bsz=16217, num_updates=56700, lr=0.000265606, gnorm=0.218, clip=0, loss_scale=0.25, train_wall=94, gb_free=18.8, wall=55685 epoch 034: 1053 / 1689 loss=3.94, nll_loss=2.296, ppl=4.91, wps=452745, ups=1.05, wpb=432070, bsz=16217, num_updates=56700, lr=0.000265606, gnorm=0.218, clip=0, loss_scale=0.25, train_wall=94, gb_free=18.8, wall=55685 epoch 034: 1053 / 1689 loss=3.94, nll_loss=2.296, ppl=4.91, wps=452745, ups=1.05, wpb=432070, bsz=16217, num_updates=56700, lr=0.000265606, gnorm=0.218, clip=0, loss_scale=0.25, train_wall=94, gb_free=18.8, wall=55685 epoch 034: 1053 / 1689 loss=3.94, nll_loss=2.296, ppl=4.91, wps=452745, ups=1.05, wpb=432070, bsz=16217, num_updates=56700, lr=0.000265606, gnorm=0.218, clip=0, loss_scale=0.25, train_wall=94, gb_free=18.8, wall=55685 epoch 034: 1053 / 1689 loss=3.94, nll_loss=2.296, ppl=4.91, wps=452745, ups=1.05, wpb=432070, bsz=16217, num_updates=56700, lr=0.000265606, gnorm=0.218, clip=0, loss_scale=0.25, train_wall=94, gb_free=18.8, wall=55685 epoch 034: 1053 / 1689 loss=3.94, nll_loss=2.296, ppl=4.91, wps=452745, ups=1.05, wpb=432070, bsz=16217, num_updates=56700, lr=0.000265606, gnorm=0.218, clip=0, loss_scale=0.25, train_wall=94, gb_free=18.8, wall=55685 epoch 034: 1053 / 1689 loss=3.94, nll_loss=2.296, ppl=4.91, wps=452745, ups=1.05, wpb=432070, bsz=16217, num_updates=56700, lr=0.000265606, gnorm=0.218, clip=0, loss_scale=0.25, train_wall=94, gb_free=18.8, wall=55685 epoch 034: 1053 / 1689 loss=3.94, nll_loss=2.296, ppl=4.91, wps=452745, ups=1.05, wpb=432070, bsz=16217, num_updates=56700, lr=0.000265606, gnorm=0.218, clip=0, loss_scale=0.25, train_wall=94, gb_free=18.8, wall=55685 epoch 034: 1053 / 1689 loss=3.94, nll_loss=2.296, ppl=4.91, wps=452745, ups=1.05, wpb=432070, bsz=16217, num_updates=56700, lr=0.000265606, gnorm=0.218, clip=0, loss_scale=0.25, train_wall=94, gb_free=18.8, wall=55685 epoch 034: 1053 / 1689 loss=3.94, nll_loss=2.296, ppl=4.91, wps=452745, ups=1.05, wpb=432070, bsz=16217, num_updates=56700, lr=0.000265606, gnorm=0.218, clip=0, loss_scale=0.25, train_wall=94, gb_free=18.8, wall=55685 epoch 034: 1053 / 1689 loss=3.94, nll_loss=2.296, ppl=4.91, wps=452745, ups=1.05, wpb=432070, bsz=16217, num_updates=56700, lr=0.000265606, gnorm=0.218, clip=0, loss_scale=0.25, train_wall=94, gb_free=18.8, wall=55685 epoch 034: 1053 / 1689 loss=3.94, nll_loss=2.296, ppl=4.91, wps=452745, ups=1.05, wpb=432070, bsz=16217, num_updates=56700, lr=0.000265606, gnorm=0.218, clip=0, loss_scale=0.25, train_wall=94, gb_free=18.8, wall=55685 epoch 034: 1053 / 1689 loss=3.94, nll_loss=2.296, ppl=4.91, wps=452745, ups=1.05, wpb=432070, bsz=16217, num_updates=56700, lr=0.000265606, gnorm=0.218, clip=0, loss_scale=0.25, train_wall=94, gb_free=18.8, wall=55685 epoch 034: 1053 / 1689 loss=3.94, nll_loss=2.296, ppl=4.91, wps=452745, ups=1.05, wpb=432070, bsz=16217, num_updates=56700, lr=0.000265606, gnorm=0.218, clip=0, loss_scale=0.25, train_wall=94, gb_free=18.8, wall=55685 epoch 034: 1053 / 1689 loss=3.94, nll_loss=2.296, ppl=4.91, wps=452745, ups=1.05, wpb=432070, bsz=16217, num_updates=56700, lr=0.000265606, gnorm=0.218, clip=0, loss_scale=0.25, train_wall=94, gb_free=18.8, wall=55685 epoch 034: 1053 / 1689 loss=3.94, nll_loss=2.296, ppl=4.91, wps=452745, ups=1.05, wpb=432070, bsz=16217, num_updates=56700, lr=0.000265606, gnorm=0.218, clip=0, loss_scale=0.25, train_wall=94, gb_free=18.8, wall=55685 epoch 034: 1053 / 1689 loss=3.94, nll_loss=2.296, ppl=4.91, wps=452745, ups=1.05, wpb=432070, bsz=16217, num_updates=56700, lr=0.000265606, gnorm=0.218, clip=0, loss_scale=0.25, train_wall=94, gb_free=18.8, wall=55685 epoch 034: 1053 / 1689 loss=3.94, nll_loss=2.296, ppl=4.91, wps=452745, ups=1.05, wpb=432070, bsz=16217, num_updates=56700, lr=0.000265606, gnorm=0.218, clip=0, loss_scale=0.25, train_wall=94, gb_free=18.8, wall=55685 epoch 034: 1053 / 1689 loss=3.94, nll_loss=2.296, ppl=4.91, wps=452745, ups=1.05, wpb=432070, bsz=16217, num_updates=56700, lr=0.000265606, gnorm=0.218, clip=0, loss_scale=0.25, train_wall=94, gb_free=18.8, wall=55685 epoch 034: 1053 / 1689 loss=3.94, nll_loss=2.296, ppl=4.91, wps=452745, ups=1.05, wpb=432070, bsz=16217, num_updates=56700, lr=0.000265606, gnorm=0.218, clip=0, loss_scale=0.25, train_wall=94, gb_free=18.8, wall=55685 epoch 034: 1053 / 1689 loss=3.94, nll_loss=2.296, ppl=4.91, wps=452745, ups=1.05, wpb=432070, bsz=16217, num_updates=56700, lr=0.000265606, gnorm=0.218, clip=0, loss_scale=0.25, train_wall=94, gb_free=18.8, wall=55685 epoch 034: 1053 / 1689 loss=3.94, nll_loss=2.296, ppl=4.91, wps=452745, ups=1.05, wpb=432070, bsz=16217, num_updates=56700, lr=0.000265606, gnorm=0.218, clip=0, loss_scale=0.25, train_wall=94, gb_free=18.8, wall=55685 epoch 034: 1053 / 1689 loss=3.94, nll_loss=2.296, ppl=4.91, wps=452745, ups=1.05, wpb=432070, bsz=16217, num_updates=56700, lr=0.000265606, gnorm=0.218, clip=0, loss_scale=0.25, train_wall=94, gb_free=18.8, wall=55685 epoch 034: 1153 / 1689 loss=3.947, nll_loss=2.303, ppl=4.94, wps=454330, ups=1.05, wpb=431916, bsz=16647.8, num_updates=56800, lr=0.000265372, gnorm=0.225, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=55780 epoch 034: 1153 / 1689 loss=3.947, nll_loss=2.303, ppl=4.94, wps=454330, ups=1.05, wpb=431916, bsz=16647.8, num_updates=56800, lr=0.000265372, gnorm=0.225, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=55780 epoch 034: 1153 / 1689 loss=3.947, nll_loss=2.303, ppl=4.94, wps=454330, ups=1.05, wpb=431916, bsz=16647.8, num_updates=56800, lr=0.000265372, gnorm=0.225, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=55780 epoch 034: 1153 / 1689 loss=3.947, nll_loss=2.303, ppl=4.94, wps=454330, ups=1.05, wpb=431916, bsz=16647.8, num_updates=56800, lr=0.000265372, gnorm=0.225, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=55780 epoch 034: 1153 / 1689 loss=3.947, nll_loss=2.303, ppl=4.94, wps=454330, ups=1.05, wpb=431916, bsz=16647.8, num_updates=56800, lr=0.000265372, gnorm=0.225, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=55780 epoch 034: 1153 / 1689 loss=3.947, nll_loss=2.303, ppl=4.94, wps=454330, ups=1.05, wpb=431916, bsz=16647.8, num_updates=56800, lr=0.000265372, gnorm=0.225, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=55780 epoch 034: 1153 / 1689 loss=3.947, nll_loss=2.303, ppl=4.94, wps=454330, ups=1.05, wpb=431916, bsz=16647.8, num_updates=56800, lr=0.000265372, gnorm=0.225, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=55780 epoch 034: 1153 / 1689 loss=3.947, nll_loss=2.303, ppl=4.94, wps=454330, ups=1.05, wpb=431916, bsz=16647.8, num_updates=56800, lr=0.000265372, gnorm=0.225, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=55780 epoch 034: 1153 / 1689 loss=3.947, nll_loss=2.303, ppl=4.94, wps=454330, ups=1.05, wpb=431916, bsz=16647.8, num_updates=56800, lr=0.000265372, gnorm=0.225, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=55780 epoch 034: 1153 / 1689 loss=3.947, nll_loss=2.303, ppl=4.94, wps=454330, ups=1.05, wpb=431916, bsz=16647.8, num_updates=56800, lr=0.000265372, gnorm=0.225, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=55780 epoch 034: 1153 / 1689 loss=3.947, nll_loss=2.303, ppl=4.94, wps=454330, ups=1.05, wpb=431916, bsz=16647.8, num_updates=56800, lr=0.000265372, gnorm=0.225, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=55780 epoch 034: 1153 / 1689 loss=3.947, nll_loss=2.303, ppl=4.94, wps=454330, ups=1.05, wpb=431916, bsz=16647.8, num_updates=56800, lr=0.000265372, gnorm=0.225, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=55780 epoch 034: 1153 / 1689 loss=3.947, nll_loss=2.303, ppl=4.94, wps=454330, ups=1.05, wpb=431916, bsz=16647.8, num_updates=56800, lr=0.000265372, gnorm=0.225, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=55780 epoch 034: 1153 / 1689 loss=3.947, nll_loss=2.303, ppl=4.94, wps=454330, ups=1.05, wpb=431916, bsz=16647.8, num_updates=56800, lr=0.000265372, gnorm=0.225, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=55780 epoch 034: 1153 / 1689 loss=3.947, nll_loss=2.303, ppl=4.94, wps=454330, ups=1.05, wpb=431916, bsz=16647.8, num_updates=56800, lr=0.000265372, gnorm=0.225, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=55780 epoch 034: 1153 / 1689 loss=3.947, nll_loss=2.303, ppl=4.94, wps=454330, ups=1.05, wpb=431916, bsz=16647.8, num_updates=56800, lr=0.000265372, gnorm=0.225, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=55780 epoch 034: 1153 / 1689 loss=3.947, nll_loss=2.303, ppl=4.94, wps=454330, ups=1.05, wpb=431916, bsz=16647.8, num_updates=56800, lr=0.000265372, gnorm=0.225, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=55780 epoch 034: 1153 / 1689 loss=3.947, nll_loss=2.303, ppl=4.94, wps=454330, ups=1.05, wpb=431916, bsz=16647.8, num_updates=56800, lr=0.000265372, gnorm=0.225, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=55780 epoch 034: 1153 / 1689 loss=3.947, nll_loss=2.303, ppl=4.94, wps=454330, ups=1.05, wpb=431916, bsz=16647.8, num_updates=56800, lr=0.000265372, gnorm=0.225, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=55780 epoch 034: 1153 / 1689 loss=3.947, nll_loss=2.303, ppl=4.94, wps=454330, ups=1.05, wpb=431916, bsz=16647.8, num_updates=56800, lr=0.000265372, gnorm=0.225, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=55780 epoch 034: 1153 / 1689 loss=3.947, nll_loss=2.303, ppl=4.94, wps=454330, ups=1.05, wpb=431916, bsz=16647.8, num_updates=56800, lr=0.000265372, gnorm=0.225, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=55780 epoch 034: 1153 / 1689 loss=3.947, nll_loss=2.303, ppl=4.94, wps=454330, ups=1.05, wpb=431916, bsz=16647.8, num_updates=56800, lr=0.000265372, gnorm=0.225, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=55780 epoch 034: 1153 / 1689 loss=3.947, nll_loss=2.303, ppl=4.94, wps=454330, ups=1.05, wpb=431916, bsz=16647.8, num_updates=56800, lr=0.000265372, gnorm=0.225, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=55780 epoch 034: 1153 / 1689 loss=3.947, nll_loss=2.303, ppl=4.94, wps=454330, ups=1.05, wpb=431916, bsz=16647.8, num_updates=56800, lr=0.000265372, gnorm=0.225, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=55780 epoch 034: 1153 / 1689 loss=3.947, nll_loss=2.303, ppl=4.94, wps=454330, ups=1.05, wpb=431916, bsz=16647.8, num_updates=56800, lr=0.000265372, gnorm=0.225, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=55780 epoch 034: 1153 / 1689 loss=3.947, nll_loss=2.303, ppl=4.94, wps=454330, ups=1.05, wpb=431916, bsz=16647.8, num_updates=56800, lr=0.000265372, gnorm=0.225, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=55780 epoch 034: 1153 / 1689 loss=3.947, nll_loss=2.303, ppl=4.94, wps=454330, ups=1.05, wpb=431916, bsz=16647.8, num_updates=56800, lr=0.000265372, gnorm=0.225, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=55780 epoch 034: 1153 / 1689 loss=3.947, nll_loss=2.303, ppl=4.94, wps=454330, ups=1.05, wpb=431916, bsz=16647.8, num_updates=56800, lr=0.000265372, gnorm=0.225, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=55780 epoch 034: 1153 / 1689 loss=3.947, nll_loss=2.303, ppl=4.94, wps=454330, ups=1.05, wpb=431916, bsz=16647.8, num_updates=56800, lr=0.000265372, gnorm=0.225, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=55780 epoch 034: 1153 / 1689 loss=3.947, nll_loss=2.303, ppl=4.94, wps=454330, ups=1.05, wpb=431916, bsz=16647.8, num_updates=56800, lr=0.000265372, gnorm=0.225, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=55780 epoch 034: 1153 / 1689 loss=3.947, nll_loss=2.303, ppl=4.94, wps=454330, ups=1.05, wpb=431916, bsz=16647.8, num_updates=56800, lr=0.000265372, gnorm=0.225, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=55780 epoch 034: 1153 / 1689 loss=3.947, nll_loss=2.303, ppl=4.94, wps=454330, ups=1.05, wpb=431916, bsz=16647.8, num_updates=56800, lr=0.000265372, gnorm=0.225, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=55780 epoch 034: 1153 / 1689 loss=3.947, nll_loss=2.303, ppl=4.94, wps=454330, ups=1.05, wpb=431916, bsz=16647.8, num_updates=56800, lr=0.000265372, gnorm=0.225, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=55780 epoch 034: 1153 / 1689 loss=3.947, nll_loss=2.303, ppl=4.94, wps=454330, ups=1.05, wpb=431916, bsz=16647.8, num_updates=56800, lr=0.000265372, gnorm=0.225, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=55780 epoch 034: 1253 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=456324, ups=1.05, wpb=433177, bsz=16619.3, num_updates=56900, lr=0.000265139, gnorm=0.231, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.8, wall=55875 epoch 034: 1253 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=456324, ups=1.05, wpb=433177, bsz=16619.3, num_updates=56900, lr=0.000265139, gnorm=0.231, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.8, wall=55875 epoch 034: 1253 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=456324, ups=1.05, wpb=433177, bsz=16619.3, num_updates=56900, lr=0.000265139, gnorm=0.231, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.8, wall=55875 epoch 034: 1253 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=456324, ups=1.05, wpb=433177, bsz=16619.3, num_updates=56900, lr=0.000265139, gnorm=0.231, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.8, wall=55875 epoch 034: 1253 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=456324, ups=1.05, wpb=433177, bsz=16619.3, num_updates=56900, lr=0.000265139, gnorm=0.231, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.8, wall=55875 epoch 034: 1253 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=456324, ups=1.05, wpb=433177, bsz=16619.3, num_updates=56900, lr=0.000265139, gnorm=0.231, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.8, wall=55875 epoch 034: 1253 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=456324, ups=1.05, wpb=433177, bsz=16619.3, num_updates=56900, lr=0.000265139, gnorm=0.231, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.8, wall=55875 epoch 034: 1253 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=456324, ups=1.05, wpb=433177, bsz=16619.3, num_updates=56900, lr=0.000265139, gnorm=0.231, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.8, wall=55875 epoch 034: 1253 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=456324, ups=1.05, wpb=433177, bsz=16619.3, num_updates=56900, lr=0.000265139, gnorm=0.231, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.8, wall=55875 epoch 034: 1253 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=456324, ups=1.05, wpb=433177, bsz=16619.3, num_updates=56900, lr=0.000265139, gnorm=0.231, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.8, wall=55875 epoch 034: 1253 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=456324, ups=1.05, wpb=433177, bsz=16619.3, num_updates=56900, lr=0.000265139, gnorm=0.231, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.8, wall=55875 epoch 034: 1253 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=456324, ups=1.05, wpb=433177, bsz=16619.3, num_updates=56900, lr=0.000265139, gnorm=0.231, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.8, wall=55875 epoch 034: 1253 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=456324, ups=1.05, wpb=433177, bsz=16619.3, num_updates=56900, lr=0.000265139, gnorm=0.231, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.8, wall=55875 epoch 034: 1253 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=456324, ups=1.05, wpb=433177, bsz=16619.3, num_updates=56900, lr=0.000265139, gnorm=0.231, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.8, wall=55875 epoch 034: 1253 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=456324, ups=1.05, wpb=433177, bsz=16619.3, num_updates=56900, lr=0.000265139, gnorm=0.231, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.8, wall=55875 epoch 034: 1253 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=456324, ups=1.05, wpb=433177, bsz=16619.3, num_updates=56900, lr=0.000265139, gnorm=0.231, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.8, wall=55875 epoch 034: 1253 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=456324, ups=1.05, wpb=433177, bsz=16619.3, num_updates=56900, lr=0.000265139, gnorm=0.231, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.8, wall=55875 epoch 034: 1253 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=456324, ups=1.05, wpb=433177, bsz=16619.3, num_updates=56900, lr=0.000265139, gnorm=0.231, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.8, wall=55875 epoch 034: 1253 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=456324, ups=1.05, wpb=433177, bsz=16619.3, num_updates=56900, lr=0.000265139, gnorm=0.231, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.8, wall=55875 epoch 034: 1253 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=456324, ups=1.05, wpb=433177, bsz=16619.3, num_updates=56900, lr=0.000265139, gnorm=0.231, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.8, wall=55875 epoch 034: 1253 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=456324, ups=1.05, wpb=433177, bsz=16619.3, num_updates=56900, lr=0.000265139, gnorm=0.231, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.8, wall=55875 epoch 034: 1253 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=456324, ups=1.05, wpb=433177, bsz=16619.3, num_updates=56900, lr=0.000265139, gnorm=0.231, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.8, wall=55875 epoch 034: 1253 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=456324, ups=1.05, wpb=433177, bsz=16619.3, num_updates=56900, lr=0.000265139, gnorm=0.231, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.8, wall=55875 epoch 034: 1253 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=456324, ups=1.05, wpb=433177, bsz=16619.3, num_updates=56900, lr=0.000265139, gnorm=0.231, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.8, wall=55875 epoch 034: 1253 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=456324, ups=1.05, wpb=433177, bsz=16619.3, num_updates=56900, lr=0.000265139, gnorm=0.231, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.8, wall=55875 epoch 034: 1253 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=456324, ups=1.05, wpb=433177, bsz=16619.3, num_updates=56900, lr=0.000265139, gnorm=0.231, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.8, wall=55875 epoch 034: 1253 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=456324, ups=1.05, wpb=433177, bsz=16619.3, num_updates=56900, lr=0.000265139, gnorm=0.231, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.8, wall=55875 epoch 034: 1253 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=456324, ups=1.05, wpb=433177, bsz=16619.3, num_updates=56900, lr=0.000265139, gnorm=0.231, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.8, wall=55875 epoch 034: 1253 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=456324, ups=1.05, wpb=433177, bsz=16619.3, num_updates=56900, lr=0.000265139, gnorm=0.231, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.8, wall=55875 epoch 034: 1253 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=456324, ups=1.05, wpb=433177, bsz=16619.3, num_updates=56900, lr=0.000265139, gnorm=0.231, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.8, wall=55875 epoch 034: 1253 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=456324, ups=1.05, wpb=433177, bsz=16619.3, num_updates=56900, lr=0.000265139, gnorm=0.231, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.8, wall=55875 epoch 034: 1253 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=456324, ups=1.05, wpb=433177, bsz=16619.3, num_updates=56900, lr=0.000265139, gnorm=0.231, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.8, wall=55875 epoch 034: 1253 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=456324, ups=1.05, wpb=433177, bsz=16619.3, num_updates=56900, lr=0.000265139, gnorm=0.231, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.8, wall=55875 epoch 034: 1253 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=456324, ups=1.05, wpb=433177, bsz=16619.3, num_updates=56900, lr=0.000265139, gnorm=0.231, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.8, wall=55875 epoch 034: 1353 / 1689 loss=3.956, nll_loss=2.314, ppl=4.97, wps=458192, ups=1.05, wpb=434602, bsz=16734.7, num_updates=57000, lr=0.000264906, gnorm=0.207, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=55970 epoch 034: 1353 / 1689 loss=3.956, nll_loss=2.314, ppl=4.97, wps=458192, ups=1.05, wpb=434602, bsz=16734.7, num_updates=57000, lr=0.000264906, gnorm=0.207, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=55970 epoch 034: 1353 / 1689 loss=3.956, nll_loss=2.314, ppl=4.97, wps=458192, ups=1.05, wpb=434602, bsz=16734.7, num_updates=57000, lr=0.000264906, gnorm=0.207, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=55970 epoch 034: 1353 / 1689 loss=3.956, nll_loss=2.314, ppl=4.97, wps=458192, ups=1.05, wpb=434602, bsz=16734.7, num_updates=57000, lr=0.000264906, gnorm=0.207, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=55970 epoch 034: 1353 / 1689 loss=3.956, nll_loss=2.314, ppl=4.97, wps=458192, ups=1.05, wpb=434602, bsz=16734.7, num_updates=57000, lr=0.000264906, gnorm=0.207, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=55970 epoch 034: 1353 / 1689 loss=3.956, nll_loss=2.314, ppl=4.97, wps=458192, ups=1.05, wpb=434602, bsz=16734.7, num_updates=57000, lr=0.000264906, gnorm=0.207, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=55970 epoch 034: 1353 / 1689 loss=3.956, nll_loss=2.314, ppl=4.97, wps=458192, ups=1.05, wpb=434602, bsz=16734.7, num_updates=57000, lr=0.000264906, gnorm=0.207, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=55970 epoch 034: 1353 / 1689 loss=3.956, nll_loss=2.314, ppl=4.97, wps=458192, ups=1.05, wpb=434602, bsz=16734.7, num_updates=57000, lr=0.000264906, gnorm=0.207, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=55970 epoch 034: 1353 / 1689 loss=3.956, nll_loss=2.314, ppl=4.97, wps=458192, ups=1.05, wpb=434602, bsz=16734.7, num_updates=57000, lr=0.000264906, gnorm=0.207, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=55970 epoch 034: 1353 / 1689 loss=3.956, nll_loss=2.314, ppl=4.97, wps=458192, ups=1.05, wpb=434602, bsz=16734.7, num_updates=57000, lr=0.000264906, gnorm=0.207, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=55970 epoch 034: 1353 / 1689 loss=3.956, nll_loss=2.314, ppl=4.97, wps=458192, ups=1.05, wpb=434602, bsz=16734.7, num_updates=57000, lr=0.000264906, gnorm=0.207, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=55970 epoch 034: 1353 / 1689 loss=3.956, nll_loss=2.314, ppl=4.97, wps=458192, ups=1.05, wpb=434602, bsz=16734.7, num_updates=57000, lr=0.000264906, gnorm=0.207, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=55970 epoch 034: 1353 / 1689 loss=3.956, nll_loss=2.314, ppl=4.97, wps=458192, ups=1.05, wpb=434602, bsz=16734.7, num_updates=57000, lr=0.000264906, gnorm=0.207, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=55970 epoch 034: 1353 / 1689 loss=3.956, nll_loss=2.314, ppl=4.97, wps=458192, ups=1.05, wpb=434602, bsz=16734.7, num_updates=57000, lr=0.000264906, gnorm=0.207, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=55970 epoch 034: 1353 / 1689 loss=3.956, nll_loss=2.314, ppl=4.97, wps=458192, ups=1.05, wpb=434602, bsz=16734.7, num_updates=57000, lr=0.000264906, gnorm=0.207, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=55970 epoch 034: 1353 / 1689 loss=3.956, nll_loss=2.314, ppl=4.97, wps=458192, ups=1.05, wpb=434602, bsz=16734.7, num_updates=57000, lr=0.000264906, gnorm=0.207, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=55970 epoch 034: 1353 / 1689 loss=3.956, nll_loss=2.314, ppl=4.97, wps=458192, ups=1.05, wpb=434602, bsz=16734.7, num_updates=57000, lr=0.000264906, gnorm=0.207, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=55970 epoch 034: 1353 / 1689 loss=3.956, nll_loss=2.314, ppl=4.97, wps=458192, ups=1.05, wpb=434602, bsz=16734.7, num_updates=57000, lr=0.000264906, gnorm=0.207, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=55970 epoch 034: 1353 / 1689 loss=3.956, nll_loss=2.314, ppl=4.97, wps=458192, ups=1.05, wpb=434602, bsz=16734.7, num_updates=57000, lr=0.000264906, gnorm=0.207, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=55970 epoch 034: 1353 / 1689 loss=3.956, nll_loss=2.314, ppl=4.97, wps=458192, ups=1.05, wpb=434602, bsz=16734.7, num_updates=57000, lr=0.000264906, gnorm=0.207, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=55970 epoch 034: 1353 / 1689 loss=3.956, nll_loss=2.314, ppl=4.97, wps=458192, ups=1.05, wpb=434602, bsz=16734.7, num_updates=57000, lr=0.000264906, gnorm=0.207, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=55970 epoch 034: 1353 / 1689 loss=3.956, nll_loss=2.314, ppl=4.97, wps=458192, ups=1.05, wpb=434602, bsz=16734.7, num_updates=57000, lr=0.000264906, gnorm=0.207, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=55970 epoch 034: 1353 / 1689 loss=3.956, nll_loss=2.314, ppl=4.97, wps=458192, ups=1.05, wpb=434602, bsz=16734.7, num_updates=57000, lr=0.000264906, gnorm=0.207, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=55970 epoch 034: 1353 / 1689 loss=3.956, nll_loss=2.314, ppl=4.97, wps=458192, ups=1.05, wpb=434602, bsz=16734.7, num_updates=57000, lr=0.000264906, gnorm=0.207, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=55970 epoch 034: 1353 / 1689 loss=3.956, nll_loss=2.314, ppl=4.97, wps=458192, ups=1.05, wpb=434602, bsz=16734.7, num_updates=57000, lr=0.000264906, gnorm=0.207, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=55970 epoch 034: 1353 / 1689 loss=3.956, nll_loss=2.314, ppl=4.97, wps=458192, ups=1.05, wpb=434602, bsz=16734.7, num_updates=57000, lr=0.000264906, gnorm=0.207, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=55970 epoch 034: 1353 / 1689 loss=3.956, nll_loss=2.314, ppl=4.97, wps=458192, ups=1.05, wpb=434602, bsz=16734.7, num_updates=57000, lr=0.000264906, gnorm=0.207, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=55970 epoch 034: 1353 / 1689 loss=3.956, nll_loss=2.314, ppl=4.97, wps=458192, ups=1.05, wpb=434602, bsz=16734.7, num_updates=57000, lr=0.000264906, gnorm=0.207, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=55970 epoch 034: 1353 / 1689 loss=3.956, nll_loss=2.314, ppl=4.97, wps=458192, ups=1.05, wpb=434602, bsz=16734.7, num_updates=57000, lr=0.000264906, gnorm=0.207, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=55970 epoch 034: 1353 / 1689 loss=3.956, nll_loss=2.314, ppl=4.97, wps=458192, ups=1.05, wpb=434602, bsz=16734.7, num_updates=57000, lr=0.000264906, gnorm=0.207, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=55970 epoch 034: 1353 / 1689 loss=3.956, nll_loss=2.314, ppl=4.97, wps=458192, ups=1.05, wpb=434602, bsz=16734.7, num_updates=57000, lr=0.000264906, gnorm=0.207, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=55970 epoch 034: 1353 / 1689 loss=3.956, nll_loss=2.314, ppl=4.97, wps=458192, ups=1.05, wpb=434602, bsz=16734.7, num_updates=57000, lr=0.000264906, gnorm=0.207, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=55970 epoch 034: 1353 / 1689 loss=3.956, nll_loss=2.314, ppl=4.97, wps=458192, ups=1.05, wpb=434602, bsz=16734.7, num_updates=57000, lr=0.000264906, gnorm=0.207, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=55970 epoch 034: 1353 / 1689 loss=3.956, nll_loss=2.314, ppl=4.97, wps=458192, ups=1.05, wpb=434602, bsz=16734.7, num_updates=57000, lr=0.000264906, gnorm=0.207, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.9, wall=55970 begin validation on "valid" subset epoch 034 | valid on 'valid' subset | loss 4.247 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.238 epoch 034 | valid on 'valid' subset | loss 4.247 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.238 epoch 034 | valid on 'valid' subset | loss 4.247 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.238 epoch 034 | valid on 'valid' subset | loss 4.247 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.238 epoch 034 | valid on 'valid' subset | loss 4.247 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.238 epoch 034 | valid on 'valid' subset | loss 4.247 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.238 epoch 034 | valid on 'valid' subset | loss 4.247 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.238 epoch 034 | valid on 'valid' subset | loss 4.247 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.238 epoch 034 | valid on 'valid' subset | loss 4.247 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.238 epoch 034 | valid on 'valid' subset | loss 4.247 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.238 epoch 034 | valid on 'valid' subset | loss 4.247 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.238 epoch 034 | valid on 'valid' subset | loss 4.247 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.238 epoch 034 | valid on 'valid' subset | loss 4.247 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.238 epoch 034 | valid on 'valid' subset | loss 4.247 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.238 epoch 034 | valid on 'valid' subset | loss 4.247 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.238 epoch 034 | valid on 'valid' subset | loss 4.247 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.238 epoch 034 | valid on 'valid' subset | loss 4.247 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.238 epoch 034 | valid on 'valid' subset | loss 4.247 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.238 epoch 034 | valid on 'valid' subset | loss 4.247 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.238 epoch 034 | valid on 'valid' subset | loss 4.247 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.238 epoch 034 | valid on 'valid' subset | loss 4.247 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.238 epoch 034 | valid on 'valid' subset | loss 4.247 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.238 epoch 034 | valid on 'valid' subset | loss 4.247 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.238 epoch 034 | valid on 'valid' subset | loss 4.247 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.238 epoch 034 | valid on 'valid' subset | loss 4.247 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.238 epoch 034 | valid on 'valid' subset | loss 4.247 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.238 epoch 034 | valid on 'valid' subset | loss 4.247 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.238 epoch 034 | valid on 'valid' subset | loss 4.247 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.238 epoch 034 | valid on 'valid' subset | loss 4.247 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.238 epoch 034 | valid on 'valid' subset | loss 4.247 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.238 epoch 034 | valid on 'valid' subset | loss 4.247 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.238 epoch 034 | valid on 'valid' subset | loss 4.247 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.238 epoch 034 | valid on 'valid' subset | loss 4.247 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.238 epoch 034 | valid on 'valid' subset | loss 4.247 | nll_loss 2.623 | ppl 6.16 | wps 0 | wpb 42662 | bsz 2032 | num_updates 57000 | best_loss 4.238 epoch 034: 1453 / 1689 loss=3.957, nll_loss=2.315, ppl=4.98, wps=373598, ups=0.86, wpb=434936, bsz=16380.5, num_updates=57100, lr=0.000264674, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=98, gb_free=19.1, wall=56086 epoch 034: 1453 / 1689 loss=3.957, nll_loss=2.315, ppl=4.98, wps=373598, ups=0.86, wpb=434936, bsz=16380.5, num_updates=57100, lr=0.000264674, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=98, gb_free=19.1, wall=56086 epoch 034: 1453 / 1689 loss=3.957, nll_loss=2.315, ppl=4.98, wps=373598, ups=0.86, wpb=434936, bsz=16380.5, num_updates=57100, lr=0.000264674, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=98, gb_free=19.1, wall=56086 epoch 034: 1453 / 1689 loss=3.957, nll_loss=2.315, ppl=4.98, wps=373598, ups=0.86, wpb=434936, bsz=16380.5, num_updates=57100, lr=0.000264674, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=98, gb_free=19.1, wall=56086 epoch 034: 1453 / 1689 loss=3.957, nll_loss=2.315, ppl=4.98, wps=373598, ups=0.86, wpb=434936, bsz=16380.5, num_updates=57100, lr=0.000264674, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=98, gb_free=19.1, wall=56086 epoch 034: 1453 / 1689 loss=3.957, nll_loss=2.315, ppl=4.98, wps=373598, ups=0.86, wpb=434936, bsz=16380.5, num_updates=57100, lr=0.000264674, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=98, gb_free=19.1, wall=56086 epoch 034: 1453 / 1689 loss=3.957, nll_loss=2.315, ppl=4.98, wps=373598, ups=0.86, wpb=434936, bsz=16380.5, num_updates=57100, lr=0.000264674, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=98, gb_free=19.1, wall=56086 epoch 034: 1453 / 1689 loss=3.957, nll_loss=2.315, ppl=4.98, wps=373598, ups=0.86, wpb=434936, bsz=16380.5, num_updates=57100, lr=0.000264674, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=98, gb_free=19.1, wall=56086 epoch 034: 1453 / 1689 loss=3.957, nll_loss=2.315, ppl=4.98, wps=373598, ups=0.86, wpb=434936, bsz=16380.5, num_updates=57100, lr=0.000264674, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=98, gb_free=19.1, wall=56086 epoch 034: 1453 / 1689 loss=3.957, nll_loss=2.315, ppl=4.98, wps=373598, ups=0.86, wpb=434936, bsz=16380.5, num_updates=57100, lr=0.000264674, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=98, gb_free=19.1, wall=56086 epoch 034: 1453 / 1689 loss=3.957, nll_loss=2.315, ppl=4.98, wps=373598, ups=0.86, wpb=434936, bsz=16380.5, num_updates=57100, lr=0.000264674, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=98, gb_free=19.1, wall=56086 epoch 034: 1453 / 1689 loss=3.957, nll_loss=2.315, ppl=4.98, wps=373598, ups=0.86, wpb=434936, bsz=16380.5, num_updates=57100, lr=0.000264674, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=98, gb_free=19.1, wall=56086 epoch 034: 1453 / 1689 loss=3.957, nll_loss=2.315, ppl=4.98, wps=373598, ups=0.86, wpb=434936, bsz=16380.5, num_updates=57100, lr=0.000264674, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=98, gb_free=19.1, wall=56086 epoch 034: 1453 / 1689 loss=3.957, nll_loss=2.315, ppl=4.98, wps=373598, ups=0.86, wpb=434936, bsz=16380.5, num_updates=57100, lr=0.000264674, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=98, gb_free=19.1, wall=56086 epoch 034: 1453 / 1689 loss=3.957, nll_loss=2.315, ppl=4.98, wps=373598, ups=0.86, wpb=434936, bsz=16380.5, num_updates=57100, lr=0.000264674, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=98, gb_free=19.1, wall=56086 epoch 034: 1453 / 1689 loss=3.957, nll_loss=2.315, ppl=4.98, wps=373598, ups=0.86, wpb=434936, bsz=16380.5, num_updates=57100, lr=0.000264674, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=98, gb_free=19.1, wall=56086 epoch 034: 1453 / 1689 loss=3.957, nll_loss=2.315, ppl=4.98, wps=373598, ups=0.86, wpb=434936, bsz=16380.5, num_updates=57100, lr=0.000264674, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=98, gb_free=19.1, wall=56086 epoch 034: 1453 / 1689 loss=3.957, nll_loss=2.315, ppl=4.98, wps=373598, ups=0.86, wpb=434936, bsz=16380.5, num_updates=57100, lr=0.000264674, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=98, gb_free=19.1, wall=56086 epoch 034: 1453 / 1689 loss=3.957, nll_loss=2.315, ppl=4.98, wps=373598, ups=0.86, wpb=434936, bsz=16380.5, num_updates=57100, lr=0.000264674, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=98, gb_free=19.1, wall=56086 epoch 034: 1453 / 1689 loss=3.957, nll_loss=2.315, ppl=4.98, wps=373598, ups=0.86, wpb=434936, bsz=16380.5, num_updates=57100, lr=0.000264674, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=98, gb_free=19.1, wall=56086 epoch 034: 1453 / 1689 loss=3.957, nll_loss=2.315, ppl=4.98, wps=373598, ups=0.86, wpb=434936, bsz=16380.5, num_updates=57100, lr=0.000264674, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=98, gb_free=19.1, wall=56086 epoch 034: 1453 / 1689 loss=3.957, nll_loss=2.315, ppl=4.98, wps=373598, ups=0.86, wpb=434936, bsz=16380.5, num_updates=57100, lr=0.000264674, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=98, gb_free=19.1, wall=56086 epoch 034: 1453 / 1689 loss=3.957, nll_loss=2.315, ppl=4.98, wps=373598, ups=0.86, wpb=434936, bsz=16380.5, num_updates=57100, lr=0.000264674, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=98, gb_free=19.1, wall=56086 epoch 034: 1453 / 1689 loss=3.957, nll_loss=2.315, ppl=4.98, wps=373598, ups=0.86, wpb=434936, bsz=16380.5, num_updates=57100, lr=0.000264674, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=98, gb_free=19.1, wall=56086 epoch 034: 1453 / 1689 loss=3.957, nll_loss=2.315, ppl=4.98, wps=373598, ups=0.86, wpb=434936, bsz=16380.5, num_updates=57100, lr=0.000264674, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=98, gb_free=19.1, wall=56086 epoch 034: 1453 / 1689 loss=3.957, nll_loss=2.315, ppl=4.98, wps=373598, ups=0.86, wpb=434936, bsz=16380.5, num_updates=57100, lr=0.000264674, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=98, gb_free=19.1, wall=56086 epoch 034: 1453 / 1689 loss=3.957, nll_loss=2.315, ppl=4.98, wps=373598, ups=0.86, wpb=434936, bsz=16380.5, num_updates=57100, lr=0.000264674, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=98, gb_free=19.1, wall=56086 epoch 034: 1453 / 1689 loss=3.957, nll_loss=2.315, ppl=4.98, wps=373598, ups=0.86, wpb=434936, bsz=16380.5, num_updates=57100, lr=0.000264674, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=98, gb_free=19.1, wall=56086 epoch 034: 1453 / 1689 loss=3.957, nll_loss=2.315, ppl=4.98, wps=373598, ups=0.86, wpb=434936, bsz=16380.5, num_updates=57100, lr=0.000264674, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=98, gb_free=19.1, wall=56086 epoch 034: 1453 / 1689 loss=3.957, nll_loss=2.315, ppl=4.98, wps=373598, ups=0.86, wpb=434936, bsz=16380.5, num_updates=57100, lr=0.000264674, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=98, gb_free=19.1, wall=56086 epoch 034: 1453 / 1689 loss=3.957, nll_loss=2.315, ppl=4.98, wps=373598, ups=0.86, wpb=434936, bsz=16380.5, num_updates=57100, lr=0.000264674, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=98, gb_free=19.1, wall=56086 epoch 034: 1453 / 1689 loss=3.957, nll_loss=2.315, ppl=4.98, wps=373598, ups=0.86, wpb=434936, bsz=16380.5, num_updates=57100, lr=0.000264674, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=98, gb_free=19.1, wall=56086 epoch 034: 1453 / 1689 loss=3.957, nll_loss=2.315, ppl=4.98, wps=373598, ups=0.86, wpb=434936, bsz=16380.5, num_updates=57100, lr=0.000264674, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=98, gb_free=19.1, wall=56086 epoch 034: 1453 / 1689 loss=3.957, nll_loss=2.315, ppl=4.98, wps=373598, ups=0.86, wpb=434936, bsz=16380.5, num_updates=57100, lr=0.000264674, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=98, gb_free=19.1, wall=56086 epoch 034: 1553 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=462614, ups=1.06, wpb=436711, bsz=16368, num_updates=57200, lr=0.000264443, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=56181 epoch 034: 1553 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=462614, ups=1.06, wpb=436711, bsz=16368, num_updates=57200, lr=0.000264443, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=56181 epoch 034: 1553 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=462614, ups=1.06, wpb=436711, bsz=16368, num_updates=57200, lr=0.000264443, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=56181 epoch 034: 1553 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=462614, ups=1.06, wpb=436711, bsz=16368, num_updates=57200, lr=0.000264443, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=56181 epoch 034: 1553 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=462614, ups=1.06, wpb=436711, bsz=16368, num_updates=57200, lr=0.000264443, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=56181 epoch 034: 1553 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=462614, ups=1.06, wpb=436711, bsz=16368, num_updates=57200, lr=0.000264443, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=56181 epoch 034: 1553 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=462614, ups=1.06, wpb=436711, bsz=16368, num_updates=57200, lr=0.000264443, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=56181 epoch 034: 1553 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=462614, ups=1.06, wpb=436711, bsz=16368, num_updates=57200, lr=0.000264443, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=56181 epoch 034: 1553 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=462614, ups=1.06, wpb=436711, bsz=16368, num_updates=57200, lr=0.000264443, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=56181 epoch 034: 1553 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=462614, ups=1.06, wpb=436711, bsz=16368, num_updates=57200, lr=0.000264443, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=56181 epoch 034: 1553 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=462614, ups=1.06, wpb=436711, bsz=16368, num_updates=57200, lr=0.000264443, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=56181 epoch 034: 1553 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=462614, ups=1.06, wpb=436711, bsz=16368, num_updates=57200, lr=0.000264443, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=56181 epoch 034: 1553 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=462614, ups=1.06, wpb=436711, bsz=16368, num_updates=57200, lr=0.000264443, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=56181 epoch 034: 1553 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=462614, ups=1.06, wpb=436711, bsz=16368, num_updates=57200, lr=0.000264443, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=56181 epoch 034: 1553 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=462614, ups=1.06, wpb=436711, bsz=16368, num_updates=57200, lr=0.000264443, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=56181 epoch 034: 1553 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=462614, ups=1.06, wpb=436711, bsz=16368, num_updates=57200, lr=0.000264443, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=56181 epoch 034: 1553 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=462614, ups=1.06, wpb=436711, bsz=16368, num_updates=57200, lr=0.000264443, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=56181 epoch 034: 1553 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=462614, ups=1.06, wpb=436711, bsz=16368, num_updates=57200, lr=0.000264443, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=56181 epoch 034: 1553 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=462614, ups=1.06, wpb=436711, bsz=16368, num_updates=57200, lr=0.000264443, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=56181 epoch 034: 1553 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=462614, ups=1.06, wpb=436711, bsz=16368, num_updates=57200, lr=0.000264443, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=56181 epoch 034: 1553 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=462614, ups=1.06, wpb=436711, bsz=16368, num_updates=57200, lr=0.000264443, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=56181 epoch 034: 1553 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=462614, ups=1.06, wpb=436711, bsz=16368, num_updates=57200, lr=0.000264443, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=56181 epoch 034: 1553 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=462614, ups=1.06, wpb=436711, bsz=16368, num_updates=57200, lr=0.000264443, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=56181 epoch 034: 1553 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=462614, ups=1.06, wpb=436711, bsz=16368, num_updates=57200, lr=0.000264443, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=56181 epoch 034: 1553 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=462614, ups=1.06, wpb=436711, bsz=16368, num_updates=57200, lr=0.000264443, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=56181 epoch 034: 1553 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=462614, ups=1.06, wpb=436711, bsz=16368, num_updates=57200, lr=0.000264443, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=56181 epoch 034: 1553 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=462614, ups=1.06, wpb=436711, bsz=16368, num_updates=57200, lr=0.000264443, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=56181 epoch 034: 1553 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=462614, ups=1.06, wpb=436711, bsz=16368, num_updates=57200, lr=0.000264443, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=56181 epoch 034: 1553 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=462614, ups=1.06, wpb=436711, bsz=16368, num_updates=57200, lr=0.000264443, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=56181 epoch 034: 1553 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=462614, ups=1.06, wpb=436711, bsz=16368, num_updates=57200, lr=0.000264443, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=56181 epoch 034: 1553 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=462614, ups=1.06, wpb=436711, bsz=16368, num_updates=57200, lr=0.000264443, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=56181 epoch 034: 1553 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=462614, ups=1.06, wpb=436711, bsz=16368, num_updates=57200, lr=0.000264443, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=56181 epoch 034: 1553 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=462614, ups=1.06, wpb=436711, bsz=16368, num_updates=57200, lr=0.000264443, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=56181 epoch 034: 1553 / 1689 loss=3.959, nll_loss=2.317, ppl=4.98, wps=462614, ups=1.06, wpb=436711, bsz=16368, num_updates=57200, lr=0.000264443, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=56181 epoch 034: 1653 / 1689 loss=3.96, nll_loss=2.318, ppl=4.99, wps=464289, ups=1.07, wpb=435182, bsz=16307.9, num_updates=57300, lr=0.000264212, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=56274 epoch 034: 1653 / 1689 loss=3.96, nll_loss=2.318, ppl=4.99, wps=464289, ups=1.07, wpb=435182, bsz=16307.9, num_updates=57300, lr=0.000264212, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=56274 epoch 034: 1653 / 1689 loss=3.96, nll_loss=2.318, ppl=4.99, wps=464289, ups=1.07, wpb=435182, bsz=16307.9, num_updates=57300, lr=0.000264212, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=56274 epoch 034: 1653 / 1689 loss=3.96, nll_loss=2.318, ppl=4.99, wps=464289, ups=1.07, wpb=435182, bsz=16307.9, num_updates=57300, lr=0.000264212, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=56274 epoch 034: 1653 / 1689 loss=3.96, nll_loss=2.318, ppl=4.99, wps=464289, ups=1.07, wpb=435182, bsz=16307.9, num_updates=57300, lr=0.000264212, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=56274 epoch 034: 1653 / 1689 loss=3.96, nll_loss=2.318, ppl=4.99, wps=464289, ups=1.07, wpb=435182, bsz=16307.9, num_updates=57300, lr=0.000264212, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=56274 epoch 034: 1653 / 1689 loss=3.96, nll_loss=2.318, ppl=4.99, wps=464289, ups=1.07, wpb=435182, bsz=16307.9, num_updates=57300, lr=0.000264212, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=56274 epoch 034: 1653 / 1689 loss=3.96, nll_loss=2.318, ppl=4.99, wps=464289, ups=1.07, wpb=435182, bsz=16307.9, num_updates=57300, lr=0.000264212, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=56274 epoch 034: 1653 / 1689 loss=3.96, nll_loss=2.318, ppl=4.99, wps=464289, ups=1.07, wpb=435182, bsz=16307.9, num_updates=57300, lr=0.000264212, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=56274 epoch 034: 1653 / 1689 loss=3.96, nll_loss=2.318, ppl=4.99, wps=464289, ups=1.07, wpb=435182, bsz=16307.9, num_updates=57300, lr=0.000264212, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=56274 epoch 034: 1653 / 1689 loss=3.96, nll_loss=2.318, ppl=4.99, wps=464289, ups=1.07, wpb=435182, bsz=16307.9, num_updates=57300, lr=0.000264212, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=56274 epoch 034: 1653 / 1689 loss=3.96, nll_loss=2.318, ppl=4.99, wps=464289, ups=1.07, wpb=435182, bsz=16307.9, num_updates=57300, lr=0.000264212, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=56274 epoch 034: 1653 / 1689 loss=3.96, nll_loss=2.318, ppl=4.99, wps=464289, ups=1.07, wpb=435182, bsz=16307.9, num_updates=57300, lr=0.000264212, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=56274 epoch 034: 1653 / 1689 loss=3.96, nll_loss=2.318, ppl=4.99, wps=464289, ups=1.07, wpb=435182, bsz=16307.9, num_updates=57300, lr=0.000264212, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=56274 epoch 034: 1653 / 1689 loss=3.96, nll_loss=2.318, ppl=4.99, wps=464289, ups=1.07, wpb=435182, bsz=16307.9, num_updates=57300, lr=0.000264212, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=56274 epoch 034: 1653 / 1689 loss=3.96, nll_loss=2.318, ppl=4.99, wps=464289, ups=1.07, wpb=435182, bsz=16307.9, num_updates=57300, lr=0.000264212, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=56274 epoch 034: 1653 / 1689 loss=3.96, nll_loss=2.318, ppl=4.99, wps=464289, ups=1.07, wpb=435182, bsz=16307.9, num_updates=57300, lr=0.000264212, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=56274 epoch 034: 1653 / 1689 loss=3.96, nll_loss=2.318, ppl=4.99, wps=464289, ups=1.07, wpb=435182, bsz=16307.9, num_updates=57300, lr=0.000264212, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=56274 epoch 034: 1653 / 1689 loss=3.96, nll_loss=2.318, ppl=4.99, wps=464289, ups=1.07, wpb=435182, bsz=16307.9, num_updates=57300, lr=0.000264212, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=56274 epoch 034: 1653 / 1689 loss=3.96, nll_loss=2.318, ppl=4.99, wps=464289, ups=1.07, wpb=435182, bsz=16307.9, num_updates=57300, lr=0.000264212, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=56274 epoch 034: 1653 / 1689 loss=3.96, nll_loss=2.318, ppl=4.99, wps=464289, ups=1.07, wpb=435182, bsz=16307.9, num_updates=57300, lr=0.000264212, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=56274 epoch 034: 1653 / 1689 loss=3.96, nll_loss=2.318, ppl=4.99, wps=464289, ups=1.07, wpb=435182, bsz=16307.9, num_updates=57300, lr=0.000264212, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=56274 epoch 034: 1653 / 1689 loss=3.96, nll_loss=2.318, ppl=4.99, wps=464289, ups=1.07, wpb=435182, bsz=16307.9, num_updates=57300, lr=0.000264212, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=56274 epoch 034: 1653 / 1689 loss=3.96, nll_loss=2.318, ppl=4.99, wps=464289, ups=1.07, wpb=435182, bsz=16307.9, num_updates=57300, lr=0.000264212, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=56274 epoch 034: 1653 / 1689 loss=3.96, nll_loss=2.318, ppl=4.99, wps=464289, ups=1.07, wpb=435182, bsz=16307.9, num_updates=57300, lr=0.000264212, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=56274 epoch 034: 1653 / 1689 loss=3.96, nll_loss=2.318, ppl=4.99, wps=464289, ups=1.07, wpb=435182, bsz=16307.9, num_updates=57300, lr=0.000264212, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=56274 epoch 034: 1653 / 1689 loss=3.96, nll_loss=2.318, ppl=4.99, wps=464289, ups=1.07, wpb=435182, bsz=16307.9, num_updates=57300, lr=0.000264212, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=56274 epoch 034: 1653 / 1689 loss=3.96, nll_loss=2.318, ppl=4.99, wps=464289, ups=1.07, wpb=435182, bsz=16307.9, num_updates=57300, lr=0.000264212, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=56274 epoch 034: 1653 / 1689 loss=3.96, nll_loss=2.318, ppl=4.99, wps=464289, ups=1.07, wpb=435182, bsz=16307.9, num_updates=57300, lr=0.000264212, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=56274 epoch 034: 1653 / 1689 loss=3.96, nll_loss=2.318, ppl=4.99, wps=464289, ups=1.07, wpb=435182, bsz=16307.9, num_updates=57300, lr=0.000264212, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=56274 epoch 034: 1653 / 1689 loss=3.96, nll_loss=2.318, ppl=4.99, wps=464289, ups=1.07, wpb=435182, bsz=16307.9, num_updates=57300, lr=0.000264212, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=56274 epoch 034: 1653 / 1689 loss=3.96, nll_loss=2.318, ppl=4.99, wps=464289, ups=1.07, wpb=435182, bsz=16307.9, num_updates=57300, lr=0.000264212, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=56274 epoch 034: 1653 / 1689 loss=3.96, nll_loss=2.318, ppl=4.99, wps=464289, ups=1.07, wpb=435182, bsz=16307.9, num_updates=57300, lr=0.000264212, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=56274 epoch 034: 1653 / 1689 loss=3.96, nll_loss=2.318, ppl=4.99, wps=464289, ups=1.07, wpb=435182, bsz=16307.9, num_updates=57300, lr=0.000264212, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=92, gb_free=19, wall=56274 end of epoch 34 (average epoch stats below) epoch 034 | loss 3.944 | nll_loss 2.3 | ppl 4.93 | wps 448989 | ups 1.04 | wpb 433540 | bsz 16501.3 | num_updates 57336 | lr 0.000264129 | gnorm 0.216 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 20.2 | wall 56307 epoch 034 | loss 3.944 | nll_loss 2.3 | ppl 4.93 | wps 448989 | ups 1.04 | wpb 433540 | bsz 16501.3 | num_updates 57336 | lr 0.000264129 | gnorm 0.216 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 20.2 | wall 56307 epoch 034 | loss 3.944 | nll_loss 2.3 | ppl 4.93 | wps 448989 | ups 1.04 | wpb 433540 | bsz 16501.3 | num_updates 57336 | lr 0.000264129 | gnorm 0.216 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 20.2 | wall 56307 epoch 034 | loss 3.944 | nll_loss 2.3 | ppl 4.93 | wps 448989 | ups 1.04 | wpb 433540 | bsz 16501.3 | num_updates 57336 | lr 0.000264129 | gnorm 0.216 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 20.2 | wall 56307 epoch 034 | loss 3.944 | nll_loss 2.3 | ppl 4.93 | wps 448989 | ups 1.04 | wpb 433540 | bsz 16501.3 | num_updates 57336 | lr 0.000264129 | gnorm 0.216 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 20.2 | wall 56307 epoch 034 | loss 3.944 | nll_loss 2.3 | ppl 4.93 | wps 448989 | ups 1.04 | wpb 433540 | bsz 16501.3 | num_updates 57336 | lr 0.000264129 | gnorm 0.216 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 20.2 | wall 56307 epoch 034 | loss 3.944 | nll_loss 2.3 | ppl 4.93 | wps 448989 | ups 1.04 | wpb 433540 | bsz 16501.3 | num_updates 57336 | lr 0.000264129 | gnorm 0.216 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 20.2 | wall 56307 epoch 034 | loss 3.944 | nll_loss 2.3 | ppl 4.93 | wps 448989 | ups 1.04 | wpb 433540 | bsz 16501.3 | num_updates 57336 | lr 0.000264129 | gnorm 0.216 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 20.2 | wall 56307 epoch 034 | loss 3.944 | nll_loss 2.3 | ppl 4.93 | wps 448989 | ups 1.04 | wpb 433540 | bsz 16501.3 | num_updates 57336 | lr 0.000264129 | gnorm 0.216 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 20.2 | wall 56307 epoch 034 | loss 3.944 | nll_loss 2.3 | ppl 4.93 | wps 448989 | ups 1.04 | wpb 433540 | bsz 16501.3 | num_updates 57336 | lr 0.000264129 | gnorm 0.216 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 20.2 | wall 56307 epoch 034 | loss 3.944 | nll_loss 2.3 | ppl 4.93 | wps 448989 | ups 1.04 | wpb 433540 | bsz 16501.3 | num_updates 57336 | lr 0.000264129 | gnorm 0.216 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 20.2 | wall 56307 epoch 034 | loss 3.944 | nll_loss 2.3 | ppl 4.93 | wps 448989 | ups 1.04 | wpb 433540 | bsz 16501.3 | num_updates 57336 | lr 0.000264129 | gnorm 0.216 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 20.2 | wall 56307 epoch 034 | loss 3.944 | nll_loss 2.3 | ppl 4.93 | wps 448989 | ups 1.04 | wpb 433540 | bsz 16501.3 | num_updates 57336 | lr 0.000264129 | gnorm 0.216 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 20.2 | wall 56307 epoch 034 | loss 3.944 | nll_loss 2.3 | ppl 4.93 | wps 448989 | ups 1.04 | wpb 433540 | bsz 16501.3 | num_updates 57336 | lr 0.000264129 | gnorm 0.216 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 20.2 | wall 56307 epoch 034 | loss 3.944 | nll_loss 2.3 | ppl 4.93 | wps 448989 | ups 1.04 | wpb 433540 | bsz 16501.3 | num_updates 57336 | lr 0.000264129 | gnorm 0.216 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 20.2 | wall 56307 epoch 034 | loss 3.944 | nll_loss 2.3 | ppl 4.93 | wps 448989 | ups 1.04 | wpb 433540 | bsz 16501.3 | num_updates 57336 | lr 0.000264129 | gnorm 0.216 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 20.2 | wall 56307 epoch 034 | loss 3.944 | nll_loss 2.3 | ppl 4.93 | wps 448989 | ups 1.04 | wpb 433540 | bsz 16501.3 | num_updates 57336 | lr 0.000264129 | gnorm 0.216 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 20.2 | wall 56307 epoch 034 | loss 3.944 | nll_loss 2.3 | ppl 4.93 | wps 448989 | ups 1.04 | wpb 433540 | bsz 16501.3 | num_updates 57336 | lr 0.000264129 | gnorm 0.216 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 20.2 | wall 56307 epoch 034 | loss 3.944 | nll_loss 2.3 | ppl 4.93 | wps 448989 | ups 1.04 | wpb 433540 | bsz 16501.3 | num_updates 57336 | lr 0.000264129 | gnorm 0.216 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 20.2 | wall 56307 epoch 034 | loss 3.944 | nll_loss 2.3 | ppl 4.93 | wps 448989 | ups 1.04 | wpb 433540 | bsz 16501.3 | num_updates 57336 | lr 0.000264129 | gnorm 0.216 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 20.2 | wall 56307 epoch 034 | loss 3.944 | nll_loss 2.3 | ppl 4.93 | wps 448989 | ups 1.04 | wpb 433540 | bsz 16501.3 | num_updates 57336 | lr 0.000264129 | gnorm 0.216 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 20.2 | wall 56307 epoch 034 | loss 3.944 | nll_loss 2.3 | ppl 4.93 | wps 448989 | ups 1.04 | wpb 433540 | bsz 16501.3 | num_updates 57336 | lr 0.000264129 | gnorm 0.216 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 20.2 | wall 56307 epoch 034 | loss 3.944 | nll_loss 2.3 | ppl 4.93 | wps 448989 | ups 1.04 | wpb 433540 | bsz 16501.3 | num_updates 57336 | lr 0.000264129 | gnorm 0.216 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 20.2 | wall 56307 epoch 034 | loss 3.944 | nll_loss 2.3 | ppl 4.93 | wps 448989 | ups 1.04 | wpb 433540 | bsz 16501.3 | num_updates 57336 | lr 0.000264129 | gnorm 0.216 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 20.2 | wall 56307 epoch 034 | loss 3.944 | nll_loss 2.3 | ppl 4.93 | wps 448989 | ups 1.04 | wpb 433540 | bsz 16501.3 | num_updates 57336 | lr 0.000264129 | gnorm 0.216 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 20.2 | wall 56307 epoch 034 | loss 3.944 | nll_loss 2.3 | ppl 4.93 | wps 448989 | ups 1.04 | wpb 433540 | bsz 16501.3 | num_updates 57336 | lr 0.000264129 | gnorm 0.216 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 20.2 | wall 56307 epoch 034 | loss 3.944 | nll_loss 2.3 | ppl 4.93 | wps 448989 | ups 1.04 | wpb 433540 | bsz 16501.3 | num_updates 57336 | lr 0.000264129 | gnorm 0.216 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 20.2 | wall 56307 epoch 034 | loss 3.944 | nll_loss 2.3 | ppl 4.93 | wps 448989 | ups 1.04 | wpb 433540 | bsz 16501.3 | num_updates 57336 | lr 0.000264129 | gnorm 0.216 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 20.2 | wall 56307 epoch 034 | loss 3.944 | nll_loss 2.3 | ppl 4.93 | wps 448989 | ups 1.04 | wpb 433540 | bsz 16501.3 | num_updates 57336 | lr 0.000264129 | gnorm 0.216 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 20.2 | wall 56307 epoch 034 | loss 3.944 | nll_loss 2.3 | ppl 4.93 | wps 448989 | ups 1.04 | wpb 433540 | bsz 16501.3 | num_updates 57336 | lr 0.000264129 | gnorm 0.216 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 20.2 | wall 56307 epoch 034 | loss 3.944 | nll_loss 2.3 | ppl 4.93 | wps 448989 | ups 1.04 | wpb 433540 | bsz 16501.3 | num_updates 57336 | lr 0.000264129 | gnorm 0.216 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 20.2 | wall 56307 epoch 034 | loss 3.944 | nll_loss 2.3 | ppl 4.93 | wps 448989 | ups 1.04 | wpb 433540 | bsz 16501.3 | num_updates 57336 | lr 0.000264129 | gnorm 0.216 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 20.2 | wall 56307 epoch 034 | loss 3.944 | nll_loss 2.3 | ppl 4.93 | wps 448989 | ups 1.04 | wpb 433540 | bsz 16501.3 | num_updates 57336 | lr 0.000264129 | gnorm 0.216 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 20.2 | wall 56307 epoch 034 | loss 3.944 | nll_loss 2.3 | ppl 4.93 | wps 448989 | ups 1.04 | wpb 433540 | bsz 16501.3 | num_updates 57336 | lr 0.000264129 | gnorm 0.216 | clip 0 | loss_scale 0.5 | train_wall 1572 | gb_free 20.2 | wall 56307 Start iterating over samples epoch 035: 64 / 1689 loss=3.936, nll_loss=2.29, ppl=4.89, wps=455655, ups=1.06, wpb=430985, bsz=16363.4, num_updates=57400, lr=0.000263982, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=56369 epoch 035: 64 / 1689 loss=3.936, nll_loss=2.29, ppl=4.89, wps=455655, ups=1.06, wpb=430985, bsz=16363.4, num_updates=57400, lr=0.000263982, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=56369 epoch 035: 64 / 1689 loss=3.936, nll_loss=2.29, ppl=4.89, wps=455655, ups=1.06, wpb=430985, bsz=16363.4, num_updates=57400, lr=0.000263982, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=56369 epoch 035: 64 / 1689 loss=3.936, nll_loss=2.29, ppl=4.89, wps=455655, ups=1.06, wpb=430985, bsz=16363.4, num_updates=57400, lr=0.000263982, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=56369 epoch 035: 64 / 1689 loss=3.936, nll_loss=2.29, ppl=4.89, wps=455655, ups=1.06, wpb=430985, bsz=16363.4, num_updates=57400, lr=0.000263982, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=56369 epoch 035: 64 / 1689 loss=3.936, nll_loss=2.29, ppl=4.89, wps=455655, ups=1.06, wpb=430985, bsz=16363.4, num_updates=57400, lr=0.000263982, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=56369 epoch 035: 64 / 1689 loss=3.936, nll_loss=2.29, ppl=4.89, wps=455655, ups=1.06, wpb=430985, bsz=16363.4, num_updates=57400, lr=0.000263982, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=56369 epoch 035: 64 / 1689 loss=3.936, nll_loss=2.29, ppl=4.89, wps=455655, ups=1.06, wpb=430985, bsz=16363.4, num_updates=57400, lr=0.000263982, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=56369 epoch 035: 64 / 1689 loss=3.936, nll_loss=2.29, ppl=4.89, wps=455655, ups=1.06, wpb=430985, bsz=16363.4, num_updates=57400, lr=0.000263982, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=56369 epoch 035: 64 / 1689 loss=3.936, nll_loss=2.29, ppl=4.89, wps=455655, ups=1.06, wpb=430985, bsz=16363.4, num_updates=57400, lr=0.000263982, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=56369 epoch 035: 64 / 1689 loss=3.936, nll_loss=2.29, ppl=4.89, wps=455655, ups=1.06, wpb=430985, bsz=16363.4, num_updates=57400, lr=0.000263982, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=56369 epoch 035: 64 / 1689 loss=3.936, nll_loss=2.29, ppl=4.89, wps=455655, ups=1.06, wpb=430985, bsz=16363.4, num_updates=57400, lr=0.000263982, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=56369 epoch 035: 64 / 1689 loss=3.936, nll_loss=2.29, ppl=4.89, wps=455655, ups=1.06, wpb=430985, bsz=16363.4, num_updates=57400, lr=0.000263982, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=56369 epoch 035: 64 / 1689 loss=3.936, nll_loss=2.29, ppl=4.89, wps=455655, ups=1.06, wpb=430985, bsz=16363.4, num_updates=57400, lr=0.000263982, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=56369 epoch 035: 64 / 1689 loss=3.936, nll_loss=2.29, ppl=4.89, wps=455655, ups=1.06, wpb=430985, bsz=16363.4, num_updates=57400, lr=0.000263982, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=56369 epoch 035: 64 / 1689 loss=3.936, nll_loss=2.29, ppl=4.89, wps=455655, ups=1.06, wpb=430985, bsz=16363.4, num_updates=57400, lr=0.000263982, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=56369 epoch 035: 64 / 1689 loss=3.936, nll_loss=2.29, ppl=4.89, wps=455655, ups=1.06, wpb=430985, bsz=16363.4, num_updates=57400, lr=0.000263982, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=56369 epoch 035: 64 / 1689 loss=3.936, nll_loss=2.29, ppl=4.89, wps=455655, ups=1.06, wpb=430985, bsz=16363.4, num_updates=57400, lr=0.000263982, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=56369 epoch 035: 64 / 1689 loss=3.936, nll_loss=2.29, ppl=4.89, wps=455655, ups=1.06, wpb=430985, bsz=16363.4, num_updates=57400, lr=0.000263982, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=56369 epoch 035: 64 / 1689 loss=3.936, nll_loss=2.29, ppl=4.89, wps=455655, ups=1.06, wpb=430985, bsz=16363.4, num_updates=57400, lr=0.000263982, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=56369 epoch 035: 64 / 1689 loss=3.936, nll_loss=2.29, ppl=4.89, wps=455655, ups=1.06, wpb=430985, bsz=16363.4, num_updates=57400, lr=0.000263982, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=56369 epoch 035: 64 / 1689 loss=3.936, nll_loss=2.29, ppl=4.89, wps=455655, ups=1.06, wpb=430985, bsz=16363.4, num_updates=57400, lr=0.000263982, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=56369 epoch 035: 64 / 1689 loss=3.936, nll_loss=2.29, ppl=4.89, wps=455655, ups=1.06, wpb=430985, bsz=16363.4, num_updates=57400, lr=0.000263982, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=56369 epoch 035: 64 / 1689 loss=3.936, nll_loss=2.29, ppl=4.89, wps=455655, ups=1.06, wpb=430985, bsz=16363.4, num_updates=57400, lr=0.000263982, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=56369 epoch 035: 64 / 1689 loss=3.936, nll_loss=2.29, ppl=4.89, wps=455655, ups=1.06, wpb=430985, bsz=16363.4, num_updates=57400, lr=0.000263982, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=56369 epoch 035: 64 / 1689 loss=3.936, nll_loss=2.29, ppl=4.89, wps=455655, ups=1.06, wpb=430985, bsz=16363.4, num_updates=57400, lr=0.000263982, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=56369 epoch 035: 64 / 1689 loss=3.936, nll_loss=2.29, ppl=4.89, wps=455655, ups=1.06, wpb=430985, bsz=16363.4, num_updates=57400, lr=0.000263982, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=56369 epoch 035: 64 / 1689 loss=3.936, nll_loss=2.29, ppl=4.89, wps=455655, ups=1.06, wpb=430985, bsz=16363.4, num_updates=57400, lr=0.000263982, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=56369 epoch 035: 64 / 1689 loss=3.936, nll_loss=2.29, ppl=4.89, wps=455655, ups=1.06, wpb=430985, bsz=16363.4, num_updates=57400, lr=0.000263982, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=56369 epoch 035: 64 / 1689 loss=3.936, nll_loss=2.29, ppl=4.89, wps=455655, ups=1.06, wpb=430985, bsz=16363.4, num_updates=57400, lr=0.000263982, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=56369 epoch 035: 64 / 1689 loss=3.936, nll_loss=2.29, ppl=4.89, wps=455655, ups=1.06, wpb=430985, bsz=16363.4, num_updates=57400, lr=0.000263982, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=56369 epoch 035: 64 / 1689 loss=3.936, nll_loss=2.29, ppl=4.89, wps=455655, ups=1.06, wpb=430985, bsz=16363.4, num_updates=57400, lr=0.000263982, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=56369 epoch 035: 64 / 1689 loss=3.936, nll_loss=2.29, ppl=4.89, wps=455655, ups=1.06, wpb=430985, bsz=16363.4, num_updates=57400, lr=0.000263982, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=56369 epoch 035: 64 / 1689 loss=3.936, nll_loss=2.29, ppl=4.89, wps=455655, ups=1.06, wpb=430985, bsz=16363.4, num_updates=57400, lr=0.000263982, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=56369 epoch 035: 64 / 1689 loss=3.936, nll_loss=2.29, ppl=4.89, wps=455655, ups=1.06, wpb=430985, bsz=16363.4, num_updates=57400, lr=0.000263982, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=56369 epoch 035: 164 / 1689 loss=3.923, nll_loss=2.275, ppl=4.84, wps=455670, ups=1.05, wpb=432533, bsz=16402.1, num_updates=57500, lr=0.000263752, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=56464 epoch 035: 164 / 1689 loss=3.923, nll_loss=2.275, ppl=4.84, wps=455670, ups=1.05, wpb=432533, bsz=16402.1, num_updates=57500, lr=0.000263752, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=56464 epoch 035: 164 / 1689 loss=3.923, nll_loss=2.275, ppl=4.84, wps=455670, ups=1.05, wpb=432533, bsz=16402.1, num_updates=57500, lr=0.000263752, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=56464 epoch 035: 164 / 1689 loss=3.923, nll_loss=2.275, ppl=4.84, wps=455670, ups=1.05, wpb=432533, bsz=16402.1, num_updates=57500, lr=0.000263752, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=56464 epoch 035: 164 / 1689 loss=3.923, nll_loss=2.275, ppl=4.84, wps=455670, ups=1.05, wpb=432533, bsz=16402.1, num_updates=57500, lr=0.000263752, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=56464 epoch 035: 164 / 1689 loss=3.923, nll_loss=2.275, ppl=4.84, wps=455670, ups=1.05, wpb=432533, bsz=16402.1, num_updates=57500, lr=0.000263752, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=56464 epoch 035: 164 / 1689 loss=3.923, nll_loss=2.275, ppl=4.84, wps=455670, ups=1.05, wpb=432533, bsz=16402.1, num_updates=57500, lr=0.000263752, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=56464 epoch 035: 164 / 1689 loss=3.923, nll_loss=2.275, ppl=4.84, wps=455670, ups=1.05, wpb=432533, bsz=16402.1, num_updates=57500, lr=0.000263752, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=56464 epoch 035: 164 / 1689 loss=3.923, nll_loss=2.275, ppl=4.84, wps=455670, ups=1.05, wpb=432533, bsz=16402.1, num_updates=57500, lr=0.000263752, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=56464 epoch 035: 164 / 1689 loss=3.923, nll_loss=2.275, ppl=4.84, wps=455670, ups=1.05, wpb=432533, bsz=16402.1, num_updates=57500, lr=0.000263752, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=56464 epoch 035: 164 / 1689 loss=3.923, nll_loss=2.275, ppl=4.84, wps=455670, ups=1.05, wpb=432533, bsz=16402.1, num_updates=57500, lr=0.000263752, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=56464 epoch 035: 164 / 1689 loss=3.923, nll_loss=2.275, ppl=4.84, wps=455670, ups=1.05, wpb=432533, bsz=16402.1, num_updates=57500, lr=0.000263752, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=56464 epoch 035: 164 / 1689 loss=3.923, nll_loss=2.275, ppl=4.84, wps=455670, ups=1.05, wpb=432533, bsz=16402.1, num_updates=57500, lr=0.000263752, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=56464 epoch 035: 164 / 1689 loss=3.923, nll_loss=2.275, ppl=4.84, wps=455670, ups=1.05, wpb=432533, bsz=16402.1, num_updates=57500, lr=0.000263752, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=56464 epoch 035: 164 / 1689 loss=3.923, nll_loss=2.275, ppl=4.84, wps=455670, ups=1.05, wpb=432533, bsz=16402.1, num_updates=57500, lr=0.000263752, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=56464 epoch 035: 164 / 1689 loss=3.923, nll_loss=2.275, ppl=4.84, wps=455670, ups=1.05, wpb=432533, bsz=16402.1, num_updates=57500, lr=0.000263752, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=56464 epoch 035: 164 / 1689 loss=3.923, nll_loss=2.275, ppl=4.84, wps=455670, ups=1.05, wpb=432533, bsz=16402.1, num_updates=57500, lr=0.000263752, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=56464 epoch 035: 164 / 1689 loss=3.923, nll_loss=2.275, ppl=4.84, wps=455670, ups=1.05, wpb=432533, bsz=16402.1, num_updates=57500, lr=0.000263752, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=56464 epoch 035: 164 / 1689 loss=3.923, nll_loss=2.275, ppl=4.84, wps=455670, ups=1.05, wpb=432533, bsz=16402.1, num_updates=57500, lr=0.000263752, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=56464 epoch 035: 164 / 1689 loss=3.923, nll_loss=2.275, ppl=4.84, wps=455670, ups=1.05, wpb=432533, bsz=16402.1, num_updates=57500, lr=0.000263752, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=56464 epoch 035: 164 / 1689 loss=3.923, nll_loss=2.275, ppl=4.84, wps=455670, ups=1.05, wpb=432533, bsz=16402.1, num_updates=57500, lr=0.000263752, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=56464 epoch 035: 164 / 1689 loss=3.923, nll_loss=2.275, ppl=4.84, wps=455670, ups=1.05, wpb=432533, bsz=16402.1, num_updates=57500, lr=0.000263752, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=56464 epoch 035: 164 / 1689 loss=3.923, nll_loss=2.275, ppl=4.84, wps=455670, ups=1.05, wpb=432533, bsz=16402.1, num_updates=57500, lr=0.000263752, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=56464 epoch 035: 164 / 1689 loss=3.923, nll_loss=2.275, ppl=4.84, wps=455670, ups=1.05, wpb=432533, bsz=16402.1, num_updates=57500, lr=0.000263752, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=56464 epoch 035: 164 / 1689 loss=3.923, nll_loss=2.275, ppl=4.84, wps=455670, ups=1.05, wpb=432533, bsz=16402.1, num_updates=57500, lr=0.000263752, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=56464 epoch 035: 164 / 1689 loss=3.923, nll_loss=2.275, ppl=4.84, wps=455670, ups=1.05, wpb=432533, bsz=16402.1, num_updates=57500, lr=0.000263752, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=56464 epoch 035: 164 / 1689 loss=3.923, nll_loss=2.275, ppl=4.84, wps=455670, ups=1.05, wpb=432533, bsz=16402.1, num_updates=57500, lr=0.000263752, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=56464 epoch 035: 164 / 1689 loss=3.923, nll_loss=2.275, ppl=4.84, wps=455670, ups=1.05, wpb=432533, bsz=16402.1, num_updates=57500, lr=0.000263752, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=56464 epoch 035: 164 / 1689 loss=3.923, nll_loss=2.275, ppl=4.84, wps=455670, ups=1.05, wpb=432533, bsz=16402.1, num_updates=57500, lr=0.000263752, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=56464 epoch 035: 164 / 1689 loss=3.923, nll_loss=2.275, ppl=4.84, wps=455670, ups=1.05, wpb=432533, bsz=16402.1, num_updates=57500, lr=0.000263752, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=56464 epoch 035: 164 / 1689 loss=3.923, nll_loss=2.275, ppl=4.84, wps=455670, ups=1.05, wpb=432533, bsz=16402.1, num_updates=57500, lr=0.000263752, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=56464 epoch 035: 164 / 1689 loss=3.923, nll_loss=2.275, ppl=4.84, wps=455670, ups=1.05, wpb=432533, bsz=16402.1, num_updates=57500, lr=0.000263752, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=56464 epoch 035: 164 / 1689 loss=3.923, nll_loss=2.275, ppl=4.84, wps=455670, ups=1.05, wpb=432533, bsz=16402.1, num_updates=57500, lr=0.000263752, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=56464 epoch 035: 164 / 1689 loss=3.923, nll_loss=2.275, ppl=4.84, wps=455670, ups=1.05, wpb=432533, bsz=16402.1, num_updates=57500, lr=0.000263752, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=56464 epoch 035: 164 / 1689 loss=3.923, nll_loss=2.275, ppl=4.84, wps=455670, ups=1.05, wpb=432533, bsz=16402.1, num_updates=57500, lr=0.000263752, gnorm=0.205, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=56464 epoch 035: 264 / 1689 loss=3.924, nll_loss=2.277, ppl=4.85, wps=456700, ups=1.05, wpb=433934, bsz=16556.8, num_updates=57600, lr=0.000263523, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.2, wall=56559 epoch 035: 264 / 1689 loss=3.924, nll_loss=2.277, ppl=4.85, wps=456700, ups=1.05, wpb=433934, bsz=16556.8, num_updates=57600, lr=0.000263523, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.2, wall=56559 epoch 035: 264 / 1689 loss=3.924, nll_loss=2.277, ppl=4.85, wps=456700, ups=1.05, wpb=433934, bsz=16556.8, num_updates=57600, lr=0.000263523, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.2, wall=56559 epoch 035: 264 / 1689 loss=3.924, nll_loss=2.277, ppl=4.85, wps=456700, ups=1.05, wpb=433934, bsz=16556.8, num_updates=57600, lr=0.000263523, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.2, wall=56559 epoch 035: 264 / 1689 loss=3.924, nll_loss=2.277, ppl=4.85, wps=456700, ups=1.05, wpb=433934, bsz=16556.8, num_updates=57600, lr=0.000263523, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.2, wall=56559 epoch 035: 264 / 1689 loss=3.924, nll_loss=2.277, ppl=4.85, wps=456700, ups=1.05, wpb=433934, bsz=16556.8, num_updates=57600, lr=0.000263523, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.2, wall=56559 epoch 035: 264 / 1689 loss=3.924, nll_loss=2.277, ppl=4.85, wps=456700, ups=1.05, wpb=433934, bsz=16556.8, num_updates=57600, lr=0.000263523, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.2, wall=56559 epoch 035: 264 / 1689 loss=3.924, nll_loss=2.277, ppl=4.85, wps=456700, ups=1.05, wpb=433934, bsz=16556.8, num_updates=57600, lr=0.000263523, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.2, wall=56559 epoch 035: 264 / 1689 loss=3.924, nll_loss=2.277, ppl=4.85, wps=456700, ups=1.05, wpb=433934, bsz=16556.8, num_updates=57600, lr=0.000263523, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.2, wall=56559 epoch 035: 264 / 1689 loss=3.924, nll_loss=2.277, ppl=4.85, wps=456700, ups=1.05, wpb=433934, bsz=16556.8, num_updates=57600, lr=0.000263523, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.2, wall=56559 epoch 035: 264 / 1689 loss=3.924, nll_loss=2.277, ppl=4.85, wps=456700, ups=1.05, wpb=433934, bsz=16556.8, num_updates=57600, lr=0.000263523, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.2, wall=56559 epoch 035: 264 / 1689 loss=3.924, nll_loss=2.277, ppl=4.85, wps=456700, ups=1.05, wpb=433934, bsz=16556.8, num_updates=57600, lr=0.000263523, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.2, wall=56559 epoch 035: 264 / 1689 loss=3.924, nll_loss=2.277, ppl=4.85, wps=456700, ups=1.05, wpb=433934, bsz=16556.8, num_updates=57600, lr=0.000263523, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.2, wall=56559 epoch 035: 264 / 1689 loss=3.924, nll_loss=2.277, ppl=4.85, wps=456700, ups=1.05, wpb=433934, bsz=16556.8, num_updates=57600, lr=0.000263523, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.2, wall=56559 epoch 035: 264 / 1689 loss=3.924, nll_loss=2.277, ppl=4.85, wps=456700, ups=1.05, wpb=433934, bsz=16556.8, num_updates=57600, lr=0.000263523, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.2, wall=56559 epoch 035: 264 / 1689 loss=3.924, nll_loss=2.277, ppl=4.85, wps=456700, ups=1.05, wpb=433934, bsz=16556.8, num_updates=57600, lr=0.000263523, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.2, wall=56559 epoch 035: 264 / 1689 loss=3.924, nll_loss=2.277, ppl=4.85, wps=456700, ups=1.05, wpb=433934, bsz=16556.8, num_updates=57600, lr=0.000263523, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.2, wall=56559 epoch 035: 264 / 1689 loss=3.924, nll_loss=2.277, ppl=4.85, wps=456700, ups=1.05, wpb=433934, bsz=16556.8, num_updates=57600, lr=0.000263523, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.2, wall=56559 epoch 035: 264 / 1689 loss=3.924, nll_loss=2.277, ppl=4.85, wps=456700, ups=1.05, wpb=433934, bsz=16556.8, num_updates=57600, lr=0.000263523, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.2, wall=56559 epoch 035: 264 / 1689 loss=3.924, nll_loss=2.277, ppl=4.85, wps=456700, ups=1.05, wpb=433934, bsz=16556.8, num_updates=57600, lr=0.000263523, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.2, wall=56559 epoch 035: 264 / 1689 loss=3.924, nll_loss=2.277, ppl=4.85, wps=456700, ups=1.05, wpb=433934, bsz=16556.8, num_updates=57600, lr=0.000263523, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.2, wall=56559 epoch 035: 264 / 1689 loss=3.924, nll_loss=2.277, ppl=4.85, wps=456700, ups=1.05, wpb=433934, bsz=16556.8, num_updates=57600, lr=0.000263523, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.2, wall=56559 epoch 035: 264 / 1689 loss=3.924, nll_loss=2.277, ppl=4.85, wps=456700, ups=1.05, wpb=433934, bsz=16556.8, num_updates=57600, lr=0.000263523, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.2, wall=56559 epoch 035: 264 / 1689 loss=3.924, nll_loss=2.277, ppl=4.85, wps=456700, ups=1.05, wpb=433934, bsz=16556.8, num_updates=57600, lr=0.000263523, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.2, wall=56559 epoch 035: 264 / 1689 loss=3.924, nll_loss=2.277, ppl=4.85, wps=456700, ups=1.05, wpb=433934, bsz=16556.8, num_updates=57600, lr=0.000263523, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.2, wall=56559 epoch 035: 264 / 1689 loss=3.924, nll_loss=2.277, ppl=4.85, wps=456700, ups=1.05, wpb=433934, bsz=16556.8, num_updates=57600, lr=0.000263523, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.2, wall=56559 epoch 035: 264 / 1689 loss=3.924, nll_loss=2.277, ppl=4.85, wps=456700, ups=1.05, wpb=433934, bsz=16556.8, num_updates=57600, lr=0.000263523, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.2, wall=56559 epoch 035: 264 / 1689 loss=3.924, nll_loss=2.277, ppl=4.85, wps=456700, ups=1.05, wpb=433934, bsz=16556.8, num_updates=57600, lr=0.000263523, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.2, wall=56559 epoch 035: 264 / 1689 loss=3.924, nll_loss=2.277, ppl=4.85, wps=456700, ups=1.05, wpb=433934, bsz=16556.8, num_updates=57600, lr=0.000263523, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.2, wall=56559 epoch 035: 264 / 1689 loss=3.924, nll_loss=2.277, ppl=4.85, wps=456700, ups=1.05, wpb=433934, bsz=16556.8, num_updates=57600, lr=0.000263523, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.2, wall=56559 epoch 035: 264 / 1689 loss=3.924, nll_loss=2.277, ppl=4.85, wps=456700, ups=1.05, wpb=433934, bsz=16556.8, num_updates=57600, lr=0.000263523, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.2, wall=56559 epoch 035: 264 / 1689 loss=3.924, nll_loss=2.277, ppl=4.85, wps=456700, ups=1.05, wpb=433934, bsz=16556.8, num_updates=57600, lr=0.000263523, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.2, wall=56559 epoch 035: 264 / 1689 loss=3.924, nll_loss=2.277, ppl=4.85, wps=456700, ups=1.05, wpb=433934, bsz=16556.8, num_updates=57600, lr=0.000263523, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.2, wall=56559 epoch 035: 264 / 1689 loss=3.924, nll_loss=2.277, ppl=4.85, wps=456700, ups=1.05, wpb=433934, bsz=16556.8, num_updates=57600, lr=0.000263523, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.2, wall=56559 epoch 035: 264 / 1689 loss=3.924, nll_loss=2.277, ppl=4.85, wps=456700, ups=1.05, wpb=433934, bsz=16556.8, num_updates=57600, lr=0.000263523, gnorm=0.227, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.2, wall=56559 epoch 035: 364 / 1689 loss=3.924, nll_loss=2.277, ppl=4.85, wps=457041, ups=1.05, wpb=434185, bsz=16586.7, num_updates=57700, lr=0.000263295, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=56654 epoch 035: 364 / 1689 loss=3.924, nll_loss=2.277, ppl=4.85, wps=457041, ups=1.05, wpb=434185, bsz=16586.7, num_updates=57700, lr=0.000263295, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=56654 epoch 035: 364 / 1689 loss=3.924, nll_loss=2.277, ppl=4.85, wps=457041, ups=1.05, wpb=434185, bsz=16586.7, num_updates=57700, lr=0.000263295, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=56654 epoch 035: 364 / 1689 loss=3.924, nll_loss=2.277, ppl=4.85, wps=457041, ups=1.05, wpb=434185, bsz=16586.7, num_updates=57700, lr=0.000263295, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=56654 epoch 035: 364 / 1689 loss=3.924, nll_loss=2.277, ppl=4.85, wps=457041, ups=1.05, wpb=434185, bsz=16586.7, num_updates=57700, lr=0.000263295, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=56654 epoch 035: 364 / 1689 loss=3.924, nll_loss=2.277, ppl=4.85, wps=457041, ups=1.05, wpb=434185, bsz=16586.7, num_updates=57700, lr=0.000263295, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=56654 epoch 035: 364 / 1689 loss=3.924, nll_loss=2.277, ppl=4.85, wps=457041, ups=1.05, wpb=434185, bsz=16586.7, num_updates=57700, lr=0.000263295, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=56654 epoch 035: 364 / 1689 loss=3.924, nll_loss=2.277, ppl=4.85, wps=457041, ups=1.05, wpb=434185, bsz=16586.7, num_updates=57700, lr=0.000263295, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=56654 epoch 035: 364 / 1689 loss=3.924, nll_loss=2.277, ppl=4.85, wps=457041, ups=1.05, wpb=434185, bsz=16586.7, num_updates=57700, lr=0.000263295, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=56654 epoch 035: 364 / 1689 loss=3.924, nll_loss=2.277, ppl=4.85, wps=457041, ups=1.05, wpb=434185, bsz=16586.7, num_updates=57700, lr=0.000263295, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=56654 epoch 035: 364 / 1689 loss=3.924, nll_loss=2.277, ppl=4.85, wps=457041, ups=1.05, wpb=434185, bsz=16586.7, num_updates=57700, lr=0.000263295, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=56654 epoch 035: 364 / 1689 loss=3.924, nll_loss=2.277, ppl=4.85, wps=457041, ups=1.05, wpb=434185, bsz=16586.7, num_updates=57700, lr=0.000263295, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=56654 epoch 035: 364 / 1689 loss=3.924, nll_loss=2.277, ppl=4.85, wps=457041, ups=1.05, wpb=434185, bsz=16586.7, num_updates=57700, lr=0.000263295, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=56654 epoch 035: 364 / 1689 loss=3.924, nll_loss=2.277, ppl=4.85, wps=457041, ups=1.05, wpb=434185, bsz=16586.7, num_updates=57700, lr=0.000263295, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=56654 epoch 035: 364 / 1689 loss=3.924, nll_loss=2.277, ppl=4.85, wps=457041, ups=1.05, wpb=434185, bsz=16586.7, num_updates=57700, lr=0.000263295, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=56654 epoch 035: 364 / 1689 loss=3.924, nll_loss=2.277, ppl=4.85, wps=457041, ups=1.05, wpb=434185, bsz=16586.7, num_updates=57700, lr=0.000263295, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=56654 epoch 035: 364 / 1689 loss=3.924, nll_loss=2.277, ppl=4.85, wps=457041, ups=1.05, wpb=434185, bsz=16586.7, num_updates=57700, lr=0.000263295, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=56654 epoch 035: 364 / 1689 loss=3.924, nll_loss=2.277, ppl=4.85, wps=457041, ups=1.05, wpb=434185, bsz=16586.7, num_updates=57700, lr=0.000263295, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=56654 epoch 035: 364 / 1689 loss=3.924, nll_loss=2.277, ppl=4.85, wps=457041, ups=1.05, wpb=434185, bsz=16586.7, num_updates=57700, lr=0.000263295, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=56654 epoch 035: 364 / 1689 loss=3.924, nll_loss=2.277, ppl=4.85, wps=457041, ups=1.05, wpb=434185, bsz=16586.7, num_updates=57700, lr=0.000263295, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=56654 epoch 035: 364 / 1689 loss=3.924, nll_loss=2.277, ppl=4.85, wps=457041, ups=1.05, wpb=434185, bsz=16586.7, num_updates=57700, lr=0.000263295, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=56654 epoch 035: 364 / 1689 loss=3.924, nll_loss=2.277, ppl=4.85, wps=457041, ups=1.05, wpb=434185, bsz=16586.7, num_updates=57700, lr=0.000263295, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=56654 epoch 035: 364 / 1689 loss=3.924, nll_loss=2.277, ppl=4.85, wps=457041, ups=1.05, wpb=434185, bsz=16586.7, num_updates=57700, lr=0.000263295, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=56654 epoch 035: 364 / 1689 loss=3.924, nll_loss=2.277, ppl=4.85, wps=457041, ups=1.05, wpb=434185, bsz=16586.7, num_updates=57700, lr=0.000263295, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=56654 epoch 035: 364 / 1689 loss=3.924, nll_loss=2.277, ppl=4.85, wps=457041, ups=1.05, wpb=434185, bsz=16586.7, num_updates=57700, lr=0.000263295, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=56654 epoch 035: 364 / 1689 loss=3.924, nll_loss=2.277, ppl=4.85, wps=457041, ups=1.05, wpb=434185, bsz=16586.7, num_updates=57700, lr=0.000263295, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=56654 epoch 035: 364 / 1689 loss=3.924, nll_loss=2.277, ppl=4.85, wps=457041, ups=1.05, wpb=434185, bsz=16586.7, num_updates=57700, lr=0.000263295, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=56654 epoch 035: 364 / 1689 loss=3.924, nll_loss=2.277, ppl=4.85, wps=457041, ups=1.05, wpb=434185, bsz=16586.7, num_updates=57700, lr=0.000263295, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=56654 epoch 035: 364 / 1689 loss=3.924, nll_loss=2.277, ppl=4.85, wps=457041, ups=1.05, wpb=434185, bsz=16586.7, num_updates=57700, lr=0.000263295, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=56654 epoch 035: 364 / 1689 loss=3.924, nll_loss=2.277, ppl=4.85, wps=457041, ups=1.05, wpb=434185, bsz=16586.7, num_updates=57700, lr=0.000263295, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=56654 epoch 035: 364 / 1689 loss=3.924, nll_loss=2.277, ppl=4.85, wps=457041, ups=1.05, wpb=434185, bsz=16586.7, num_updates=57700, lr=0.000263295, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=56654 epoch 035: 364 / 1689 loss=3.924, nll_loss=2.277, ppl=4.85, wps=457041, ups=1.05, wpb=434185, bsz=16586.7, num_updates=57700, lr=0.000263295, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=56654 epoch 035: 364 / 1689 loss=3.924, nll_loss=2.277, ppl=4.85, wps=457041, ups=1.05, wpb=434185, bsz=16586.7, num_updates=57700, lr=0.000263295, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=56654 epoch 035: 364 / 1689 loss=3.924, nll_loss=2.277, ppl=4.85, wps=457041, ups=1.05, wpb=434185, bsz=16586.7, num_updates=57700, lr=0.000263295, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=56654 epoch 035: 364 / 1689 loss=3.924, nll_loss=2.277, ppl=4.85, wps=457041, ups=1.05, wpb=434185, bsz=16586.7, num_updates=57700, lr=0.000263295, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=56654 epoch 035: 465 / 1689 loss=3.946, nll_loss=2.301, ppl=4.93, wps=457802, ups=1.05, wpb=435244, bsz=16435, num_updates=57800, lr=0.000263067, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=56749 epoch 035: 465 / 1689 loss=3.946, nll_loss=2.301, ppl=4.93, wps=457802, ups=1.05, wpb=435244, bsz=16435, num_updates=57800, lr=0.000263067, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=56749 epoch 035: 465 / 1689 loss=3.946, nll_loss=2.301, ppl=4.93, wps=457802, ups=1.05, wpb=435244, bsz=16435, num_updates=57800, lr=0.000263067, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=56749 epoch 035: 465 / 1689 loss=3.946, nll_loss=2.301, ppl=4.93, wps=457802, ups=1.05, wpb=435244, bsz=16435, num_updates=57800, lr=0.000263067, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=56749 epoch 035: 465 / 1689 loss=3.946, nll_loss=2.301, ppl=4.93, wps=457802, ups=1.05, wpb=435244, bsz=16435, num_updates=57800, lr=0.000263067, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=56749 epoch 035: 465 / 1689 loss=3.946, nll_loss=2.301, ppl=4.93, wps=457802, ups=1.05, wpb=435244, bsz=16435, num_updates=57800, lr=0.000263067, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=56749 epoch 035: 465 / 1689 loss=3.946, nll_loss=2.301, ppl=4.93, wps=457802, ups=1.05, wpb=435244, bsz=16435, num_updates=57800, lr=0.000263067, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=56749 epoch 035: 465 / 1689 loss=3.946, nll_loss=2.301, ppl=4.93, wps=457802, ups=1.05, wpb=435244, bsz=16435, num_updates=57800, lr=0.000263067, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=56749 epoch 035: 465 / 1689 loss=3.946, nll_loss=2.301, ppl=4.93, wps=457802, ups=1.05, wpb=435244, bsz=16435, num_updates=57800, lr=0.000263067, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=56749 epoch 035: 465 / 1689 loss=3.946, nll_loss=2.301, ppl=4.93, wps=457802, ups=1.05, wpb=435244, bsz=16435, num_updates=57800, lr=0.000263067, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=56749 epoch 035: 465 / 1689 loss=3.946, nll_loss=2.301, ppl=4.93, wps=457802, ups=1.05, wpb=435244, bsz=16435, num_updates=57800, lr=0.000263067, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=56749 epoch 035: 465 / 1689 loss=3.946, nll_loss=2.301, ppl=4.93, wps=457802, ups=1.05, wpb=435244, bsz=16435, num_updates=57800, lr=0.000263067, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=56749 epoch 035: 465 / 1689 loss=3.946, nll_loss=2.301, ppl=4.93, wps=457802, ups=1.05, wpb=435244, bsz=16435, num_updates=57800, lr=0.000263067, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=56749 epoch 035: 465 / 1689 loss=3.946, nll_loss=2.301, ppl=4.93, wps=457802, ups=1.05, wpb=435244, bsz=16435, num_updates=57800, lr=0.000263067, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=56749 epoch 035: 465 / 1689 loss=3.946, nll_loss=2.301, ppl=4.93, wps=457802, ups=1.05, wpb=435244, bsz=16435, num_updates=57800, lr=0.000263067, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=56749 epoch 035: 465 / 1689 loss=3.946, nll_loss=2.301, ppl=4.93, wps=457802, ups=1.05, wpb=435244, bsz=16435, num_updates=57800, lr=0.000263067, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=56749 epoch 035: 465 / 1689 loss=3.946, nll_loss=2.301, ppl=4.93, wps=457802, ups=1.05, wpb=435244, bsz=16435, num_updates=57800, lr=0.000263067, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=56749 epoch 035: 465 / 1689 loss=3.946, nll_loss=2.301, ppl=4.93, wps=457802, ups=1.05, wpb=435244, bsz=16435, num_updates=57800, lr=0.000263067, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=56749 epoch 035: 465 / 1689 loss=3.946, nll_loss=2.301, ppl=4.93, wps=457802, ups=1.05, wpb=435244, bsz=16435, num_updates=57800, lr=0.000263067, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=56749 epoch 035: 465 / 1689 loss=3.946, nll_loss=2.301, ppl=4.93, wps=457802, ups=1.05, wpb=435244, bsz=16435, num_updates=57800, lr=0.000263067, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=56749 epoch 035: 465 / 1689 loss=3.946, nll_loss=2.301, ppl=4.93, wps=457802, ups=1.05, wpb=435244, bsz=16435, num_updates=57800, lr=0.000263067, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=56749 epoch 035: 465 / 1689 loss=3.946, nll_loss=2.301, ppl=4.93, wps=457802, ups=1.05, wpb=435244, bsz=16435, num_updates=57800, lr=0.000263067, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=56749 epoch 035: 465 / 1689 loss=3.946, nll_loss=2.301, ppl=4.93, wps=457802, ups=1.05, wpb=435244, bsz=16435, num_updates=57800, lr=0.000263067, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=56749 epoch 035: 465 / 1689 loss=3.946, nll_loss=2.301, ppl=4.93, wps=457802, ups=1.05, wpb=435244, bsz=16435, num_updates=57800, lr=0.000263067, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=56749 epoch 035: 465 / 1689 loss=3.946, nll_loss=2.301, ppl=4.93, wps=457802, ups=1.05, wpb=435244, bsz=16435, num_updates=57800, lr=0.000263067, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=56749 epoch 035: 465 / 1689 loss=3.946, nll_loss=2.301, ppl=4.93, wps=457802, ups=1.05, wpb=435244, bsz=16435, num_updates=57800, lr=0.000263067, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=56749 epoch 035: 465 / 1689 loss=3.946, nll_loss=2.301, ppl=4.93, wps=457802, ups=1.05, wpb=435244, bsz=16435, num_updates=57800, lr=0.000263067, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=56749 epoch 035: 465 / 1689 loss=3.946, nll_loss=2.301, ppl=4.93, wps=457802, ups=1.05, wpb=435244, bsz=16435, num_updates=57800, lr=0.000263067, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=56749 epoch 035: 465 / 1689 loss=3.946, nll_loss=2.301, ppl=4.93, wps=457802, ups=1.05, wpb=435244, bsz=16435, num_updates=57800, lr=0.000263067, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=56749 epoch 035: 465 / 1689 loss=3.946, nll_loss=2.301, ppl=4.93, wps=457802, ups=1.05, wpb=435244, bsz=16435, num_updates=57800, lr=0.000263067, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=56749 epoch 035: 465 / 1689 loss=3.946, nll_loss=2.301, ppl=4.93, wps=457802, ups=1.05, wpb=435244, bsz=16435, num_updates=57800, lr=0.000263067, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=56749 epoch 035: 465 / 1689 loss=3.946, nll_loss=2.301, ppl=4.93, wps=457802, ups=1.05, wpb=435244, bsz=16435, num_updates=57800, lr=0.000263067, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=56749 epoch 035: 465 / 1689 loss=3.946, nll_loss=2.301, ppl=4.93, wps=457802, ups=1.05, wpb=435244, bsz=16435, num_updates=57800, lr=0.000263067, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=56749 epoch 035: 465 / 1689 loss=3.946, nll_loss=2.301, ppl=4.93, wps=457802, ups=1.05, wpb=435244, bsz=16435, num_updates=57800, lr=0.000263067, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=56749 epoch 035: 465 / 1689 loss=3.946, nll_loss=2.301, ppl=4.93, wps=457802, ups=1.05, wpb=435244, bsz=16435, num_updates=57800, lr=0.000263067, gnorm=0.219, clip=0, loss_scale=0.5, train_wall=93, gb_free=19, wall=56749 epoch 035: 565 / 1689 loss=3.929, nll_loss=2.283, ppl=4.87, wps=459189, ups=1.06, wpb=434872, bsz=16364.2, num_updates=57900, lr=0.00026284, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.4, wall=56844 epoch 035: 565 / 1689 loss=3.929, nll_loss=2.283, ppl=4.87, wps=459189, ups=1.06, wpb=434872, bsz=16364.2, num_updates=57900, lr=0.00026284, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.4, wall=56844 epoch 035: 565 / 1689 loss=3.929, nll_loss=2.283, ppl=4.87, wps=459189, ups=1.06, wpb=434872, bsz=16364.2, num_updates=57900, lr=0.00026284, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.4, wall=56844 epoch 035: 565 / 1689 loss=3.929, nll_loss=2.283, ppl=4.87, wps=459189, ups=1.06, wpb=434872, bsz=16364.2, num_updates=57900, lr=0.00026284, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.4, wall=56844 epoch 035: 565 / 1689 loss=3.929, nll_loss=2.283, ppl=4.87, wps=459189, ups=1.06, wpb=434872, bsz=16364.2, num_updates=57900, lr=0.00026284, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.4, wall=56844 epoch 035: 565 / 1689 loss=3.929, nll_loss=2.283, ppl=4.87, wps=459189, ups=1.06, wpb=434872, bsz=16364.2, num_updates=57900, lr=0.00026284, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.4, wall=56844 epoch 035: 565 / 1689 loss=3.929, nll_loss=2.283, ppl=4.87, wps=459189, ups=1.06, wpb=434872, bsz=16364.2, num_updates=57900, lr=0.00026284, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.4, wall=56844 epoch 035: 565 / 1689 loss=3.929, nll_loss=2.283, ppl=4.87, wps=459189, ups=1.06, wpb=434872, bsz=16364.2, num_updates=57900, lr=0.00026284, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.4, wall=56844 epoch 035: 565 / 1689 loss=3.929, nll_loss=2.283, ppl=4.87, wps=459189, ups=1.06, wpb=434872, bsz=16364.2, num_updates=57900, lr=0.00026284, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.4, wall=56844 epoch 035: 565 / 1689 loss=3.929, nll_loss=2.283, ppl=4.87, wps=459189, ups=1.06, wpb=434872, bsz=16364.2, num_updates=57900, lr=0.00026284, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.4, wall=56844 epoch 035: 565 / 1689 loss=3.929, nll_loss=2.283, ppl=4.87, wps=459189, ups=1.06, wpb=434872, bsz=16364.2, num_updates=57900, lr=0.00026284, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.4, wall=56844 epoch 035: 565 / 1689 loss=3.929, nll_loss=2.283, ppl=4.87, wps=459189, ups=1.06, wpb=434872, bsz=16364.2, num_updates=57900, lr=0.00026284, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.4, wall=56844 epoch 035: 565 / 1689 loss=3.929, nll_loss=2.283, ppl=4.87, wps=459189, ups=1.06, wpb=434872, bsz=16364.2, num_updates=57900, lr=0.00026284, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.4, wall=56844 epoch 035: 565 / 1689 loss=3.929, nll_loss=2.283, ppl=4.87, wps=459189, ups=1.06, wpb=434872, bsz=16364.2, num_updates=57900, lr=0.00026284, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.4, wall=56844 epoch 035: 565 / 1689 loss=3.929, nll_loss=2.283, ppl=4.87, wps=459189, ups=1.06, wpb=434872, bsz=16364.2, num_updates=57900, lr=0.00026284, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.4, wall=56844 epoch 035: 565 / 1689 loss=3.929, nll_loss=2.283, ppl=4.87, wps=459189, ups=1.06, wpb=434872, bsz=16364.2, num_updates=57900, lr=0.00026284, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.4, wall=56844 epoch 035: 565 / 1689 loss=3.929, nll_loss=2.283, ppl=4.87, wps=459189, ups=1.06, wpb=434872, bsz=16364.2, num_updates=57900, lr=0.00026284, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.4, wall=56844 epoch 035: 565 / 1689 loss=3.929, nll_loss=2.283, ppl=4.87, wps=459189, ups=1.06, wpb=434872, bsz=16364.2, num_updates=57900, lr=0.00026284, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.4, wall=56844 epoch 035: 565 / 1689 loss=3.929, nll_loss=2.283, ppl=4.87, wps=459189, ups=1.06, wpb=434872, bsz=16364.2, num_updates=57900, lr=0.00026284, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.4, wall=56844 epoch 035: 565 / 1689 loss=3.929, nll_loss=2.283, ppl=4.87, wps=459189, ups=1.06, wpb=434872, bsz=16364.2, num_updates=57900, lr=0.00026284, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.4, wall=56844 epoch 035: 565 / 1689 loss=3.929, nll_loss=2.283, ppl=4.87, wps=459189, ups=1.06, wpb=434872, bsz=16364.2, num_updates=57900, lr=0.00026284, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.4, wall=56844 epoch 035: 565 / 1689 loss=3.929, nll_loss=2.283, ppl=4.87, wps=459189, ups=1.06, wpb=434872, bsz=16364.2, num_updates=57900, lr=0.00026284, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.4, wall=56844 epoch 035: 565 / 1689 loss=3.929, nll_loss=2.283, ppl=4.87, wps=459189, ups=1.06, wpb=434872, bsz=16364.2, num_updates=57900, lr=0.00026284, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.4, wall=56844 epoch 035: 565 / 1689 loss=3.929, nll_loss=2.283, ppl=4.87, wps=459189, ups=1.06, wpb=434872, bsz=16364.2, num_updates=57900, lr=0.00026284, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.4, wall=56844 epoch 035: 565 / 1689 loss=3.929, nll_loss=2.283, ppl=4.87, wps=459189, ups=1.06, wpb=434872, bsz=16364.2, num_updates=57900, lr=0.00026284, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.4, wall=56844 epoch 035: 565 / 1689 loss=3.929, nll_loss=2.283, ppl=4.87, wps=459189, ups=1.06, wpb=434872, bsz=16364.2, num_updates=57900, lr=0.00026284, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.4, wall=56844 epoch 035: 565 / 1689 loss=3.929, nll_loss=2.283, ppl=4.87, wps=459189, ups=1.06, wpb=434872, bsz=16364.2, num_updates=57900, lr=0.00026284, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.4, wall=56844 epoch 035: 565 / 1689 loss=3.929, nll_loss=2.283, ppl=4.87, wps=459189, ups=1.06, wpb=434872, bsz=16364.2, num_updates=57900, lr=0.00026284, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.4, wall=56844 epoch 035: 565 / 1689 loss=3.929, nll_loss=2.283, ppl=4.87, wps=459189, ups=1.06, wpb=434872, bsz=16364.2, num_updates=57900, lr=0.00026284, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.4, wall=56844 epoch 035: 565 / 1689 loss=3.929, nll_loss=2.283, ppl=4.87, wps=459189, ups=1.06, wpb=434872, bsz=16364.2, num_updates=57900, lr=0.00026284, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.4, wall=56844 epoch 035: 565 / 1689 loss=3.929, nll_loss=2.283, ppl=4.87, wps=459189, ups=1.06, wpb=434872, bsz=16364.2, num_updates=57900, lr=0.00026284, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.4, wall=56844 epoch 035: 565 / 1689 loss=3.929, nll_loss=2.283, ppl=4.87, wps=459189, ups=1.06, wpb=434872, bsz=16364.2, num_updates=57900, lr=0.00026284, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.4, wall=56844 epoch 035: 565 / 1689 loss=3.929, nll_loss=2.283, ppl=4.87, wps=459189, ups=1.06, wpb=434872, bsz=16364.2, num_updates=57900, lr=0.00026284, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.4, wall=56844 epoch 035: 565 / 1689 loss=3.929, nll_loss=2.283, ppl=4.87, wps=459189, ups=1.06, wpb=434872, bsz=16364.2, num_updates=57900, lr=0.00026284, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.4, wall=56844 epoch 035: 565 / 1689 loss=3.929, nll_loss=2.283, ppl=4.87, wps=459189, ups=1.06, wpb=434872, bsz=16364.2, num_updates=57900, lr=0.00026284, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.4, wall=56844 epoch 035: 665 / 1689 loss=3.94, nll_loss=2.295, ppl=4.91, wps=460192, ups=1.06, wpb=434177, bsz=16393.1, num_updates=58000, lr=0.000262613, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=56938 epoch 035: 665 / 1689 loss=3.94, nll_loss=2.295, ppl=4.91, wps=460192, ups=1.06, wpb=434177, bsz=16393.1, num_updates=58000, lr=0.000262613, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=56938 epoch 035: 665 / 1689 loss=3.94, nll_loss=2.295, ppl=4.91, wps=460192, ups=1.06, wpb=434177, bsz=16393.1, num_updates=58000, lr=0.000262613, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=56938 epoch 035: 665 / 1689 loss=3.94, nll_loss=2.295, ppl=4.91, wps=460192, ups=1.06, wpb=434177, bsz=16393.1, num_updates=58000, lr=0.000262613, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=56938 epoch 035: 665 / 1689 loss=3.94, nll_loss=2.295, ppl=4.91, wps=460192, ups=1.06, wpb=434177, bsz=16393.1, num_updates=58000, lr=0.000262613, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=56938 epoch 035: 665 / 1689 loss=3.94, nll_loss=2.295, ppl=4.91, wps=460192, ups=1.06, wpb=434177, bsz=16393.1, num_updates=58000, lr=0.000262613, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=56938 epoch 035: 665 / 1689 loss=3.94, nll_loss=2.295, ppl=4.91, wps=460192, ups=1.06, wpb=434177, bsz=16393.1, num_updates=58000, lr=0.000262613, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=56938 epoch 035: 665 / 1689 loss=3.94, nll_loss=2.295, ppl=4.91, wps=460192, ups=1.06, wpb=434177, bsz=16393.1, num_updates=58000, lr=0.000262613, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=56938 epoch 035: 665 / 1689 loss=3.94, nll_loss=2.295, ppl=4.91, wps=460192, ups=1.06, wpb=434177, bsz=16393.1, num_updates=58000, lr=0.000262613, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=56938 epoch 035: 665 / 1689 loss=3.94, nll_loss=2.295, ppl=4.91, wps=460192, ups=1.06, wpb=434177, bsz=16393.1, num_updates=58000, lr=0.000262613, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=56938 epoch 035: 665 / 1689 loss=3.94, nll_loss=2.295, ppl=4.91, wps=460192, ups=1.06, wpb=434177, bsz=16393.1, num_updates=58000, lr=0.000262613, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=56938 epoch 035: 665 / 1689 loss=3.94, nll_loss=2.295, ppl=4.91, wps=460192, ups=1.06, wpb=434177, bsz=16393.1, num_updates=58000, lr=0.000262613, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=56938 epoch 035: 665 / 1689 loss=3.94, nll_loss=2.295, ppl=4.91, wps=460192, ups=1.06, wpb=434177, bsz=16393.1, num_updates=58000, lr=0.000262613, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=56938 epoch 035: 665 / 1689 loss=3.94, nll_loss=2.295, ppl=4.91, wps=460192, ups=1.06, wpb=434177, bsz=16393.1, num_updates=58000, lr=0.000262613, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=56938 epoch 035: 665 / 1689 loss=3.94, nll_loss=2.295, ppl=4.91, wps=460192, ups=1.06, wpb=434177, bsz=16393.1, num_updates=58000, lr=0.000262613, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=56938 epoch 035: 665 / 1689 loss=3.94, nll_loss=2.295, ppl=4.91, wps=460192, ups=1.06, wpb=434177, bsz=16393.1, num_updates=58000, lr=0.000262613, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=56938 epoch 035: 665 / 1689 loss=3.94, nll_loss=2.295, ppl=4.91, wps=460192, ups=1.06, wpb=434177, bsz=16393.1, num_updates=58000, lr=0.000262613, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=56938 epoch 035: 665 / 1689 loss=3.94, nll_loss=2.295, ppl=4.91, wps=460192, ups=1.06, wpb=434177, bsz=16393.1, num_updates=58000, lr=0.000262613, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=56938 epoch 035: 665 / 1689 loss=3.94, nll_loss=2.295, ppl=4.91, wps=460192, ups=1.06, wpb=434177, bsz=16393.1, num_updates=58000, lr=0.000262613, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=56938 epoch 035: 665 / 1689 loss=3.94, nll_loss=2.295, ppl=4.91, wps=460192, ups=1.06, wpb=434177, bsz=16393.1, num_updates=58000, lr=0.000262613, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=56938 epoch 035: 665 / 1689 loss=3.94, nll_loss=2.295, ppl=4.91, wps=460192, ups=1.06, wpb=434177, bsz=16393.1, num_updates=58000, lr=0.000262613, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=56938 epoch 035: 665 / 1689 loss=3.94, nll_loss=2.295, ppl=4.91, wps=460192, ups=1.06, wpb=434177, bsz=16393.1, num_updates=58000, lr=0.000262613, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=56938 epoch 035: 665 / 1689 loss=3.94, nll_loss=2.295, ppl=4.91, wps=460192, ups=1.06, wpb=434177, bsz=16393.1, num_updates=58000, lr=0.000262613, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=56938 epoch 035: 665 / 1689 loss=3.94, nll_loss=2.295, ppl=4.91, wps=460192, ups=1.06, wpb=434177, bsz=16393.1, num_updates=58000, lr=0.000262613, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=56938 epoch 035: 665 / 1689 loss=3.94, nll_loss=2.295, ppl=4.91, wps=460192, ups=1.06, wpb=434177, bsz=16393.1, num_updates=58000, lr=0.000262613, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=56938 epoch 035: 665 / 1689 loss=3.94, nll_loss=2.295, ppl=4.91, wps=460192, ups=1.06, wpb=434177, bsz=16393.1, num_updates=58000, lr=0.000262613, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=56938 epoch 035: 665 / 1689 loss=3.94, nll_loss=2.295, ppl=4.91, wps=460192, ups=1.06, wpb=434177, bsz=16393.1, num_updates=58000, lr=0.000262613, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=56938 epoch 035: 665 / 1689 loss=3.94, nll_loss=2.295, ppl=4.91, wps=460192, ups=1.06, wpb=434177, bsz=16393.1, num_updates=58000, lr=0.000262613, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=56938 epoch 035: 665 / 1689 loss=3.94, nll_loss=2.295, ppl=4.91, wps=460192, ups=1.06, wpb=434177, bsz=16393.1, num_updates=58000, lr=0.000262613, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=56938 epoch 035: 665 / 1689 loss=3.94, nll_loss=2.295, ppl=4.91, wps=460192, ups=1.06, wpb=434177, bsz=16393.1, num_updates=58000, lr=0.000262613, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=56938 epoch 035: 665 / 1689 loss=3.94, nll_loss=2.295, ppl=4.91, wps=460192, ups=1.06, wpb=434177, bsz=16393.1, num_updates=58000, lr=0.000262613, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=56938 epoch 035: 665 / 1689 loss=3.94, nll_loss=2.295, ppl=4.91, wps=460192, ups=1.06, wpb=434177, bsz=16393.1, num_updates=58000, lr=0.000262613, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=56938 epoch 035: 665 / 1689 loss=3.94, nll_loss=2.295, ppl=4.91, wps=460192, ups=1.06, wpb=434177, bsz=16393.1, num_updates=58000, lr=0.000262613, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=56938 epoch 035: 665 / 1689 loss=3.94, nll_loss=2.295, ppl=4.91, wps=460192, ups=1.06, wpb=434177, bsz=16393.1, num_updates=58000, lr=0.000262613, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=56938 epoch 035: 665 / 1689 loss=3.94, nll_loss=2.295, ppl=4.91, wps=460192, ups=1.06, wpb=434177, bsz=16393.1, num_updates=58000, lr=0.000262613, gnorm=0.218, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.9, wall=56938 begin validation on "valid" subset epoch 035 | valid on 'valid' subset | loss 4.237 | nll_loss 2.617 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.237 epoch 035 | valid on 'valid' subset | loss 4.237 | nll_loss 2.617 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.237 epoch 035 | valid on 'valid' subset | loss 4.237 | nll_loss 2.617 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.237 epoch 035 | valid on 'valid' subset | loss 4.237 | nll_loss 2.617 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.237 epoch 035 | valid on 'valid' subset | loss 4.237 | nll_loss 2.617 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.237 epoch 035 | valid on 'valid' subset | loss 4.237 | nll_loss 2.617 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.237 epoch 035 | valid on 'valid' subset | loss 4.237 | nll_loss 2.617 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.237 epoch 035 | valid on 'valid' subset | loss 4.237 | nll_loss 2.617 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.237 epoch 035 | valid on 'valid' subset | loss 4.237 | nll_loss 2.617 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.237 epoch 035 | valid on 'valid' subset | loss 4.237 | nll_loss 2.617 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.237 epoch 035 | valid on 'valid' subset | loss 4.237 | nll_loss 2.617 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.237 epoch 035 | valid on 'valid' subset | loss 4.237 | nll_loss 2.617 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.237 epoch 035 | valid on 'valid' subset | loss 4.237 | nll_loss 2.617 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.237 epoch 035 | valid on 'valid' subset | loss 4.237 | nll_loss 2.617 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.237 epoch 035 | valid on 'valid' subset | loss 4.237 | nll_loss 2.617 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.237 epoch 035 | valid on 'valid' subset | loss 4.237 | nll_loss 2.617 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.237 epoch 035 | valid on 'valid' subset | loss 4.237 | nll_loss 2.617 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.237 epoch 035 | valid on 'valid' subset | loss 4.237 | nll_loss 2.617 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.237 epoch 035 | valid on 'valid' subset | loss 4.237 | nll_loss 2.617 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.237 epoch 035 | valid on 'valid' subset | loss 4.237 | nll_loss 2.617 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.237 epoch 035 | valid on 'valid' subset | loss 4.237 | nll_loss 2.617 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.237 epoch 035 | valid on 'valid' subset | loss 4.237 | nll_loss 2.617 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.237 epoch 035 | valid on 'valid' subset | loss 4.237 | nll_loss 2.617 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.237 epoch 035 | valid on 'valid' subset | loss 4.237 | nll_loss 2.617 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.237 epoch 035 | valid on 'valid' subset | loss 4.237 | nll_loss 2.617 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.237 epoch 035 | valid on 'valid' subset | loss 4.237 | nll_loss 2.617 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.237 epoch 035 | valid on 'valid' subset | loss 4.237 | nll_loss 2.617 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.237 epoch 035 | valid on 'valid' subset | loss 4.237 | nll_loss 2.617 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.237 epoch 035 | valid on 'valid' subset | loss 4.237 | nll_loss 2.617 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.237 epoch 035 | valid on 'valid' subset | loss 4.237 | nll_loss 2.617 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.237 epoch 035 | valid on 'valid' subset | loss 4.237 | nll_loss 2.617 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.237 epoch 035 | valid on 'valid' subset | loss 4.237 | nll_loss 2.617 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.237 epoch 035 | valid on 'valid' subset | loss 4.237 | nll_loss 2.617 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.237 epoch 035 | valid on 'valid' subset | loss 4.237 | nll_loss 2.617 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.237 epoch 035 | valid on 'valid' subset | loss 4.237 | nll_loss 2.617 | ppl 6.13 | wps 0 | wpb 42662 | bsz 2032 | num_updates 58000 | best_loss 4.237 epoch 035: 765 / 1689 loss=3.945, nll_loss=2.301, ppl=4.93, wps=372474, ups=0.85, wpb=436118, bsz=16913.4, num_updates=58100, lr=0.000262387, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=95, gb_free=18.7, wall=57055 epoch 035: 765 / 1689 loss=3.945, nll_loss=2.301, ppl=4.93, wps=372474, ups=0.85, wpb=436118, bsz=16913.4, num_updates=58100, lr=0.000262387, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=95, gb_free=18.7, wall=57055 epoch 035: 765 / 1689 loss=3.945, nll_loss=2.301, ppl=4.93, wps=372474, ups=0.85, wpb=436118, bsz=16913.4, num_updates=58100, lr=0.000262387, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=95, gb_free=18.7, wall=57055 epoch 035: 765 / 1689 loss=3.945, nll_loss=2.301, ppl=4.93, wps=372474, ups=0.85, wpb=436118, bsz=16913.4, num_updates=58100, lr=0.000262387, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=95, gb_free=18.7, wall=57055 epoch 035: 765 / 1689 loss=3.945, nll_loss=2.301, ppl=4.93, wps=372474, ups=0.85, wpb=436118, bsz=16913.4, num_updates=58100, lr=0.000262387, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=95, gb_free=18.7, wall=57055 epoch 035: 765 / 1689 loss=3.945, nll_loss=2.301, ppl=4.93, wps=372474, ups=0.85, wpb=436118, bsz=16913.4, num_updates=58100, lr=0.000262387, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=95, gb_free=18.7, wall=57055 epoch 035: 765 / 1689 loss=3.945, nll_loss=2.301, ppl=4.93, wps=372474, ups=0.85, wpb=436118, bsz=16913.4, num_updates=58100, lr=0.000262387, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=95, gb_free=18.7, wall=57055 epoch 035: 765 / 1689 loss=3.945, nll_loss=2.301, ppl=4.93, wps=372474, ups=0.85, wpb=436118, bsz=16913.4, num_updates=58100, lr=0.000262387, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=95, gb_free=18.7, wall=57055 epoch 035: 765 / 1689 loss=3.945, nll_loss=2.301, ppl=4.93, wps=372474, ups=0.85, wpb=436118, bsz=16913.4, num_updates=58100, lr=0.000262387, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=95, gb_free=18.7, wall=57055 epoch 035: 765 / 1689 loss=3.945, nll_loss=2.301, ppl=4.93, wps=372474, ups=0.85, wpb=436118, bsz=16913.4, num_updates=58100, lr=0.000262387, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=95, gb_free=18.7, wall=57055 epoch 035: 765 / 1689 loss=3.945, nll_loss=2.301, ppl=4.93, wps=372474, ups=0.85, wpb=436118, bsz=16913.4, num_updates=58100, lr=0.000262387, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=95, gb_free=18.7, wall=57055 epoch 035: 765 / 1689 loss=3.945, nll_loss=2.301, ppl=4.93, wps=372474, ups=0.85, wpb=436118, bsz=16913.4, num_updates=58100, lr=0.000262387, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=95, gb_free=18.7, wall=57055 epoch 035: 765 / 1689 loss=3.945, nll_loss=2.301, ppl=4.93, wps=372474, ups=0.85, wpb=436118, bsz=16913.4, num_updates=58100, lr=0.000262387, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=95, gb_free=18.7, wall=57055 epoch 035: 765 / 1689 loss=3.945, nll_loss=2.301, ppl=4.93, wps=372474, ups=0.85, wpb=436118, bsz=16913.4, num_updates=58100, lr=0.000262387, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=95, gb_free=18.7, wall=57055 epoch 035: 765 / 1689 loss=3.945, nll_loss=2.301, ppl=4.93, wps=372474, ups=0.85, wpb=436118, bsz=16913.4, num_updates=58100, lr=0.000262387, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=95, gb_free=18.7, wall=57055 epoch 035: 765 / 1689 loss=3.945, nll_loss=2.301, ppl=4.93, wps=372474, ups=0.85, wpb=436118, bsz=16913.4, num_updates=58100, lr=0.000262387, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=95, gb_free=18.7, wall=57055 epoch 035: 765 / 1689 loss=3.945, nll_loss=2.301, ppl=4.93, wps=372474, ups=0.85, wpb=436118, bsz=16913.4, num_updates=58100, lr=0.000262387, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=95, gb_free=18.7, wall=57055 epoch 035: 765 / 1689 loss=3.945, nll_loss=2.301, ppl=4.93, wps=372474, ups=0.85, wpb=436118, bsz=16913.4, num_updates=58100, lr=0.000262387, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=95, gb_free=18.7, wall=57055 epoch 035: 765 / 1689 loss=3.945, nll_loss=2.301, ppl=4.93, wps=372474, ups=0.85, wpb=436118, bsz=16913.4, num_updates=58100, lr=0.000262387, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=95, gb_free=18.7, wall=57055 epoch 035: 765 / 1689 loss=3.945, nll_loss=2.301, ppl=4.93, wps=372474, ups=0.85, wpb=436118, bsz=16913.4, num_updates=58100, lr=0.000262387, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=95, gb_free=18.7, wall=57055 epoch 035: 765 / 1689 loss=3.945, nll_loss=2.301, ppl=4.93, wps=372474, ups=0.85, wpb=436118, bsz=16913.4, num_updates=58100, lr=0.000262387, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=95, gb_free=18.7, wall=57055 epoch 035: 765 / 1689 loss=3.945, nll_loss=2.301, ppl=4.93, wps=372474, ups=0.85, wpb=436118, bsz=16913.4, num_updates=58100, lr=0.000262387, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=95, gb_free=18.7, wall=57055 epoch 035: 765 / 1689 loss=3.945, nll_loss=2.301, ppl=4.93, wps=372474, ups=0.85, wpb=436118, bsz=16913.4, num_updates=58100, lr=0.000262387, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=95, gb_free=18.7, wall=57055 epoch 035: 765 / 1689 loss=3.945, nll_loss=2.301, ppl=4.93, wps=372474, ups=0.85, wpb=436118, bsz=16913.4, num_updates=58100, lr=0.000262387, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=95, gb_free=18.7, wall=57055 epoch 035: 765 / 1689 loss=3.945, nll_loss=2.301, ppl=4.93, wps=372474, ups=0.85, wpb=436118, bsz=16913.4, num_updates=58100, lr=0.000262387, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=95, gb_free=18.7, wall=57055 epoch 035: 765 / 1689 loss=3.945, nll_loss=2.301, ppl=4.93, wps=372474, ups=0.85, wpb=436118, bsz=16913.4, num_updates=58100, lr=0.000262387, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=95, gb_free=18.7, wall=57055 epoch 035: 765 / 1689 loss=3.945, nll_loss=2.301, ppl=4.93, wps=372474, ups=0.85, wpb=436118, bsz=16913.4, num_updates=58100, lr=0.000262387, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=95, gb_free=18.7, wall=57055 epoch 035: 765 / 1689 loss=3.945, nll_loss=2.301, ppl=4.93, wps=372474, ups=0.85, wpb=436118, bsz=16913.4, num_updates=58100, lr=0.000262387, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=95, gb_free=18.7, wall=57055 epoch 035: 765 / 1689 loss=3.945, nll_loss=2.301, ppl=4.93, wps=372474, ups=0.85, wpb=436118, bsz=16913.4, num_updates=58100, lr=0.000262387, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=95, gb_free=18.7, wall=57055 epoch 035: 765 / 1689 loss=3.945, nll_loss=2.301, ppl=4.93, wps=372474, ups=0.85, wpb=436118, bsz=16913.4, num_updates=58100, lr=0.000262387, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=95, gb_free=18.7, wall=57055 epoch 035: 765 / 1689 loss=3.945, nll_loss=2.301, ppl=4.93, wps=372474, ups=0.85, wpb=436118, bsz=16913.4, num_updates=58100, lr=0.000262387, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=95, gb_free=18.7, wall=57055 epoch 035: 765 / 1689 loss=3.945, nll_loss=2.301, ppl=4.93, wps=372474, ups=0.85, wpb=436118, bsz=16913.4, num_updates=58100, lr=0.000262387, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=95, gb_free=18.7, wall=57055 epoch 035: 765 / 1689 loss=3.945, nll_loss=2.301, ppl=4.93, wps=372474, ups=0.85, wpb=436118, bsz=16913.4, num_updates=58100, lr=0.000262387, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=95, gb_free=18.7, wall=57055 epoch 035: 765 / 1689 loss=3.945, nll_loss=2.301, ppl=4.93, wps=372474, ups=0.85, wpb=436118, bsz=16913.4, num_updates=58100, lr=0.000262387, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=95, gb_free=18.7, wall=57055 epoch 035: 765 / 1689 loss=3.945, nll_loss=2.301, ppl=4.93, wps=372474, ups=0.85, wpb=436118, bsz=16913.4, num_updates=58100, lr=0.000262387, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=95, gb_free=18.7, wall=57055 epoch 035: 865 / 1689 loss=3.934, nll_loss=2.289, ppl=4.89, wps=458346, ups=1.06, wpb=431717, bsz=16642.2, num_updates=58200, lr=0.000262161, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=57149 epoch 035: 865 / 1689 loss=3.934, nll_loss=2.289, ppl=4.89, wps=458346, ups=1.06, wpb=431717, bsz=16642.2, num_updates=58200, lr=0.000262161, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=57149 epoch 035: 865 / 1689 loss=3.934, nll_loss=2.289, ppl=4.89, wps=458346, ups=1.06, wpb=431717, bsz=16642.2, num_updates=58200, lr=0.000262161, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=57149 epoch 035: 865 / 1689 loss=3.934, nll_loss=2.289, ppl=4.89, wps=458346, ups=1.06, wpb=431717, bsz=16642.2, num_updates=58200, lr=0.000262161, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=57149 epoch 035: 865 / 1689 loss=3.934, nll_loss=2.289, ppl=4.89, wps=458346, ups=1.06, wpb=431717, bsz=16642.2, num_updates=58200, lr=0.000262161, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=57149 epoch 035: 865 / 1689 loss=3.934, nll_loss=2.289, ppl=4.89, wps=458346, ups=1.06, wpb=431717, bsz=16642.2, num_updates=58200, lr=0.000262161, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=57149 epoch 035: 865 / 1689 loss=3.934, nll_loss=2.289, ppl=4.89, wps=458346, ups=1.06, wpb=431717, bsz=16642.2, num_updates=58200, lr=0.000262161, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=57149 epoch 035: 865 / 1689 loss=3.934, nll_loss=2.289, ppl=4.89, wps=458346, ups=1.06, wpb=431717, bsz=16642.2, num_updates=58200, lr=0.000262161, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=57149 epoch 035: 865 / 1689 loss=3.934, nll_loss=2.289, ppl=4.89, wps=458346, ups=1.06, wpb=431717, bsz=16642.2, num_updates=58200, lr=0.000262161, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=57149 epoch 035: 865 / 1689 loss=3.934, nll_loss=2.289, ppl=4.89, wps=458346, ups=1.06, wpb=431717, bsz=16642.2, num_updates=58200, lr=0.000262161, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=57149 epoch 035: 865 / 1689 loss=3.934, nll_loss=2.289, ppl=4.89, wps=458346, ups=1.06, wpb=431717, bsz=16642.2, num_updates=58200, lr=0.000262161, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=57149 epoch 035: 865 / 1689 loss=3.934, nll_loss=2.289, ppl=4.89, wps=458346, ups=1.06, wpb=431717, bsz=16642.2, num_updates=58200, lr=0.000262161, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=57149 epoch 035: 865 / 1689 loss=3.934, nll_loss=2.289, ppl=4.89, wps=458346, ups=1.06, wpb=431717, bsz=16642.2, num_updates=58200, lr=0.000262161, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=57149 epoch 035: 865 / 1689 loss=3.934, nll_loss=2.289, ppl=4.89, wps=458346, ups=1.06, wpb=431717, bsz=16642.2, num_updates=58200, lr=0.000262161, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=57149 epoch 035: 865 / 1689 loss=3.934, nll_loss=2.289, ppl=4.89, wps=458346, ups=1.06, wpb=431717, bsz=16642.2, num_updates=58200, lr=0.000262161, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=57149 epoch 035: 865 / 1689 loss=3.934, nll_loss=2.289, ppl=4.89, wps=458346, ups=1.06, wpb=431717, bsz=16642.2, num_updates=58200, lr=0.000262161, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=57149 epoch 035: 865 / 1689 loss=3.934, nll_loss=2.289, ppl=4.89, wps=458346, ups=1.06, wpb=431717, bsz=16642.2, num_updates=58200, lr=0.000262161, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=57149 epoch 035: 865 / 1689 loss=3.934, nll_loss=2.289, ppl=4.89, wps=458346, ups=1.06, wpb=431717, bsz=16642.2, num_updates=58200, lr=0.000262161, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=57149 epoch 035: 865 / 1689 loss=3.934, nll_loss=2.289, ppl=4.89, wps=458346, ups=1.06, wpb=431717, bsz=16642.2, num_updates=58200, lr=0.000262161, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=57149 epoch 035: 865 / 1689 loss=3.934, nll_loss=2.289, ppl=4.89, wps=458346, ups=1.06, wpb=431717, bsz=16642.2, num_updates=58200, lr=0.000262161, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=57149 epoch 035: 865 / 1689 loss=3.934, nll_loss=2.289, ppl=4.89, wps=458346, ups=1.06, wpb=431717, bsz=16642.2, num_updates=58200, lr=0.000262161, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=57149 epoch 035: 865 / 1689 loss=3.934, nll_loss=2.289, ppl=4.89, wps=458346, ups=1.06, wpb=431717, bsz=16642.2, num_updates=58200, lr=0.000262161, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=57149 epoch 035: 865 / 1689 loss=3.934, nll_loss=2.289, ppl=4.89, wps=458346, ups=1.06, wpb=431717, bsz=16642.2, num_updates=58200, lr=0.000262161, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=57149 epoch 035: 865 / 1689 loss=3.934, nll_loss=2.289, ppl=4.89, wps=458346, ups=1.06, wpb=431717, bsz=16642.2, num_updates=58200, lr=0.000262161, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=57149 epoch 035: 865 / 1689 loss=3.934, nll_loss=2.289, ppl=4.89, wps=458346, ups=1.06, wpb=431717, bsz=16642.2, num_updates=58200, lr=0.000262161, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=57149 epoch 035: 865 / 1689 loss=3.934, nll_loss=2.289, ppl=4.89, wps=458346, ups=1.06, wpb=431717, bsz=16642.2, num_updates=58200, lr=0.000262161, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=57149 epoch 035: 865 / 1689 loss=3.934, nll_loss=2.289, ppl=4.89, wps=458346, ups=1.06, wpb=431717, bsz=16642.2, num_updates=58200, lr=0.000262161, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=57149 epoch 035: 865 / 1689 loss=3.934, nll_loss=2.289, ppl=4.89, wps=458346, ups=1.06, wpb=431717, bsz=16642.2, num_updates=58200, lr=0.000262161, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=57149 epoch 035: 865 / 1689 loss=3.934, nll_loss=2.289, ppl=4.89, wps=458346, ups=1.06, wpb=431717, bsz=16642.2, num_updates=58200, lr=0.000262161, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=57149 epoch 035: 865 / 1689 loss=3.934, nll_loss=2.289, ppl=4.89, wps=458346, ups=1.06, wpb=431717, bsz=16642.2, num_updates=58200, lr=0.000262161, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=57149 epoch 035: 865 / 1689 loss=3.934, nll_loss=2.289, ppl=4.89, wps=458346, ups=1.06, wpb=431717, bsz=16642.2, num_updates=58200, lr=0.000262161, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=57149 epoch 035: 865 / 1689 loss=3.934, nll_loss=2.289, ppl=4.89, wps=458346, ups=1.06, wpb=431717, bsz=16642.2, num_updates=58200, lr=0.000262161, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=57149 epoch 035: 865 / 1689 loss=3.934, nll_loss=2.289, ppl=4.89, wps=458346, ups=1.06, wpb=431717, bsz=16642.2, num_updates=58200, lr=0.000262161, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=57149 epoch 035: 865 / 1689 loss=3.934, nll_loss=2.289, ppl=4.89, wps=458346, ups=1.06, wpb=431717, bsz=16642.2, num_updates=58200, lr=0.000262161, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=57149 epoch 035: 865 / 1689 loss=3.934, nll_loss=2.289, ppl=4.89, wps=458346, ups=1.06, wpb=431717, bsz=16642.2, num_updates=58200, lr=0.000262161, gnorm=0.207, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=57149 epoch 035: 967 / 1689 loss=3.939, nll_loss=2.295, ppl=4.91, wps=451613, ups=1.04, wpb=433220, bsz=16671.4, num_updates=58300, lr=0.000261936, gnorm=0.219, clip=0, loss_scale=0.25, train_wall=94, gb_free=17.9, wall=57245 epoch 035: 967 / 1689 loss=3.939, nll_loss=2.295, ppl=4.91, wps=451613, ups=1.04, wpb=433220, bsz=16671.4, num_updates=58300, lr=0.000261936, gnorm=0.219, clip=0, loss_scale=0.25, train_wall=94, gb_free=17.9, wall=57245 epoch 035: 967 / 1689 loss=3.939, nll_loss=2.295, ppl=4.91, wps=451613, ups=1.04, wpb=433220, bsz=16671.4, num_updates=58300, lr=0.000261936, gnorm=0.219, clip=0, loss_scale=0.25, train_wall=94, gb_free=17.9, wall=57245 epoch 035: 967 / 1689 loss=3.939, nll_loss=2.295, ppl=4.91, wps=451613, ups=1.04, wpb=433220, bsz=16671.4, num_updates=58300, lr=0.000261936, gnorm=0.219, clip=0, loss_scale=0.25, train_wall=94, gb_free=17.9, wall=57245 epoch 035: 967 / 1689 loss=3.939, nll_loss=2.295, ppl=4.91, wps=451613, ups=1.04, wpb=433220, bsz=16671.4, num_updates=58300, lr=0.000261936, gnorm=0.219, clip=0, loss_scale=0.25, train_wall=94, gb_free=17.9, wall=57245 epoch 035: 967 / 1689 loss=3.939, nll_loss=2.295, ppl=4.91, wps=451613, ups=1.04, wpb=433220, bsz=16671.4, num_updates=58300, lr=0.000261936, gnorm=0.219, clip=0, loss_scale=0.25, train_wall=94, gb_free=17.9, wall=57245 epoch 035: 967 / 1689 loss=3.939, nll_loss=2.295, ppl=4.91, wps=451613, ups=1.04, wpb=433220, bsz=16671.4, num_updates=58300, lr=0.000261936, gnorm=0.219, clip=0, loss_scale=0.25, train_wall=94, gb_free=17.9, wall=57245 epoch 035: 967 / 1689 loss=3.939, nll_loss=2.295, ppl=4.91, wps=451613, ups=1.04, wpb=433220, bsz=16671.4, num_updates=58300, lr=0.000261936, gnorm=0.219, clip=0, loss_scale=0.25, train_wall=94, gb_free=17.9, wall=57245 epoch 035: 967 / 1689 loss=3.939, nll_loss=2.295, ppl=4.91, wps=451613, ups=1.04, wpb=433220, bsz=16671.4, num_updates=58300, lr=0.000261936, gnorm=0.219, clip=0, loss_scale=0.25, train_wall=94, gb_free=17.9, wall=57245 epoch 035: 967 / 1689 loss=3.939, nll_loss=2.295, ppl=4.91, wps=451613, ups=1.04, wpb=433220, bsz=16671.4, num_updates=58300, lr=0.000261936, gnorm=0.219, clip=0, loss_scale=0.25, train_wall=94, gb_free=17.9, wall=57245 epoch 035: 967 / 1689 loss=3.939, nll_loss=2.295, ppl=4.91, wps=451613, ups=1.04, wpb=433220, bsz=16671.4, num_updates=58300, lr=0.000261936, gnorm=0.219, clip=0, loss_scale=0.25, train_wall=94, gb_free=17.9, wall=57245 epoch 035: 967 / 1689 loss=3.939, nll_loss=2.295, ppl=4.91, wps=451613, ups=1.04, wpb=433220, bsz=16671.4, num_updates=58300, lr=0.000261936, gnorm=0.219, clip=0, loss_scale=0.25, train_wall=94, gb_free=17.9, wall=57245 epoch 035: 967 / 1689 loss=3.939, nll_loss=2.295, ppl=4.91, wps=451613, ups=1.04, wpb=433220, bsz=16671.4, num_updates=58300, lr=0.000261936, gnorm=0.219, clip=0, loss_scale=0.25, train_wall=94, gb_free=17.9, wall=57245 epoch 035: 967 / 1689 loss=3.939, nll_loss=2.295, ppl=4.91, wps=451613, ups=1.04, wpb=433220, bsz=16671.4, num_updates=58300, lr=0.000261936, gnorm=0.219, clip=0, loss_scale=0.25, train_wall=94, gb_free=17.9, wall=57245 epoch 035: 967 / 1689 loss=3.939, nll_loss=2.295, ppl=4.91, wps=451613, ups=1.04, wpb=433220, bsz=16671.4, num_updates=58300, lr=0.000261936, gnorm=0.219, clip=0, loss_scale=0.25, train_wall=94, gb_free=17.9, wall=57245 epoch 035: 967 / 1689 loss=3.939, nll_loss=2.295, ppl=4.91, wps=451613, ups=1.04, wpb=433220, bsz=16671.4, num_updates=58300, lr=0.000261936, gnorm=0.219, clip=0, loss_scale=0.25, train_wall=94, gb_free=17.9, wall=57245 epoch 035: 967 / 1689 loss=3.939, nll_loss=2.295, ppl=4.91, wps=451613, ups=1.04, wpb=433220, bsz=16671.4, num_updates=58300, lr=0.000261936, gnorm=0.219, clip=0, loss_scale=0.25, train_wall=94, gb_free=17.9, wall=57245 epoch 035: 967 / 1689 loss=3.939, nll_loss=2.295, ppl=4.91, wps=451613, ups=1.04, wpb=433220, bsz=16671.4, num_updates=58300, lr=0.000261936, gnorm=0.219, clip=0, loss_scale=0.25, train_wall=94, gb_free=17.9, wall=57245 epoch 035: 967 / 1689 loss=3.939, nll_loss=2.295, ppl=4.91, wps=451613, ups=1.04, wpb=433220, bsz=16671.4, num_updates=58300, lr=0.000261936, gnorm=0.219, clip=0, loss_scale=0.25, train_wall=94, gb_free=17.9, wall=57245 epoch 035: 967 / 1689 loss=3.939, nll_loss=2.295, ppl=4.91, wps=451613, ups=1.04, wpb=433220, bsz=16671.4, num_updates=58300, lr=0.000261936, gnorm=0.219, clip=0, loss_scale=0.25, train_wall=94, gb_free=17.9, wall=57245 epoch 035: 967 / 1689 loss=3.939, nll_loss=2.295, ppl=4.91, wps=451613, ups=1.04, wpb=433220, bsz=16671.4, num_updates=58300, lr=0.000261936, gnorm=0.219, clip=0, loss_scale=0.25, train_wall=94, gb_free=17.9, wall=57245 epoch 035: 967 / 1689 loss=3.939, nll_loss=2.295, ppl=4.91, wps=451613, ups=1.04, wpb=433220, bsz=16671.4, num_updates=58300, lr=0.000261936, gnorm=0.219, clip=0, loss_scale=0.25, train_wall=94, gb_free=17.9, wall=57245 epoch 035: 967 / 1689 loss=3.939, nll_loss=2.295, ppl=4.91, wps=451613, ups=1.04, wpb=433220, bsz=16671.4, num_updates=58300, lr=0.000261936, gnorm=0.219, clip=0, loss_scale=0.25, train_wall=94, gb_free=17.9, wall=57245 epoch 035: 967 / 1689 loss=3.939, nll_loss=2.295, ppl=4.91, wps=451613, ups=1.04, wpb=433220, bsz=16671.4, num_updates=58300, lr=0.000261936, gnorm=0.219, clip=0, loss_scale=0.25, train_wall=94, gb_free=17.9, wall=57245 epoch 035: 967 / 1689 loss=3.939, nll_loss=2.295, ppl=4.91, wps=451613, ups=1.04, wpb=433220, bsz=16671.4, num_updates=58300, lr=0.000261936, gnorm=0.219, clip=0, loss_scale=0.25, train_wall=94, gb_free=17.9, wall=57245 epoch 035: 967 / 1689 loss=3.939, nll_loss=2.295, ppl=4.91, wps=451613, ups=1.04, wpb=433220, bsz=16671.4, num_updates=58300, lr=0.000261936, gnorm=0.219, clip=0, loss_scale=0.25, train_wall=94, gb_free=17.9, wall=57245 epoch 035: 967 / 1689 loss=3.939, nll_loss=2.295, ppl=4.91, wps=451613, ups=1.04, wpb=433220, bsz=16671.4, num_updates=58300, lr=0.000261936, gnorm=0.219, clip=0, loss_scale=0.25, train_wall=94, gb_free=17.9, wall=57245 epoch 035: 967 / 1689 loss=3.939, nll_loss=2.295, ppl=4.91, wps=451613, ups=1.04, wpb=433220, bsz=16671.4, num_updates=58300, lr=0.000261936, gnorm=0.219, clip=0, loss_scale=0.25, train_wall=94, gb_free=17.9, wall=57245 epoch 035: 967 / 1689 loss=3.939, nll_loss=2.295, ppl=4.91, wps=451613, ups=1.04, wpb=433220, bsz=16671.4, num_updates=58300, lr=0.000261936, gnorm=0.219, clip=0, loss_scale=0.25, train_wall=94, gb_free=17.9, wall=57245 epoch 035: 967 / 1689 loss=3.939, nll_loss=2.295, ppl=4.91, wps=451613, ups=1.04, wpb=433220, bsz=16671.4, num_updates=58300, lr=0.000261936, gnorm=0.219, clip=0, loss_scale=0.25, train_wall=94, gb_free=17.9, wall=57245 epoch 035: 967 / 1689 loss=3.939, nll_loss=2.295, ppl=4.91, wps=451613, ups=1.04, wpb=433220, bsz=16671.4, num_updates=58300, lr=0.000261936, gnorm=0.219, clip=0, loss_scale=0.25, train_wall=94, gb_free=17.9, wall=57245 epoch 035: 967 / 1689 loss=3.939, nll_loss=2.295, ppl=4.91, wps=451613, ups=1.04, wpb=433220, bsz=16671.4, num_updates=58300, lr=0.000261936, gnorm=0.219, clip=0, loss_scale=0.25, train_wall=94, gb_free=17.9, wall=57245 epoch 035: 967 / 1689 loss=3.939, nll_loss=2.295, ppl=4.91, wps=451613, ups=1.04, wpb=433220, bsz=16671.4, num_updates=58300, lr=0.000261936, gnorm=0.219, clip=0, loss_scale=0.25, train_wall=94, gb_free=17.9, wall=57245 epoch 035: 967 / 1689 loss=3.939, nll_loss=2.295, ppl=4.91, wps=451613, ups=1.04, wpb=433220, bsz=16671.4, num_updates=58300, lr=0.000261936, gnorm=0.219, clip=0, loss_scale=0.25, train_wall=94, gb_free=17.9, wall=57245 epoch 035: 967 / 1689 loss=3.939, nll_loss=2.295, ppl=4.91, wps=451613, ups=1.04, wpb=433220, bsz=16671.4, num_updates=58300, lr=0.000261936, gnorm=0.219, clip=0, loss_scale=0.25, train_wall=94, gb_free=17.9, wall=57245 epoch 035: 1067 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=462367, ups=1.06, wpb=435124, bsz=16228.6, num_updates=58400, lr=0.000261712, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.4, wall=57339 epoch 035: 1067 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=462367, ups=1.06, wpb=435124, bsz=16228.6, num_updates=58400, lr=0.000261712, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.4, wall=57339 epoch 035: 1067 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=462367, ups=1.06, wpb=435124, bsz=16228.6, num_updates=58400, lr=0.000261712, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.4, wall=57339 epoch 035: 1067 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=462367, ups=1.06, wpb=435124, bsz=16228.6, num_updates=58400, lr=0.000261712, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.4, wall=57339 epoch 035: 1067 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=462367, ups=1.06, wpb=435124, bsz=16228.6, num_updates=58400, lr=0.000261712, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.4, wall=57339 epoch 035: 1067 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=462367, ups=1.06, wpb=435124, bsz=16228.6, num_updates=58400, lr=0.000261712, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.4, wall=57339 epoch 035: 1067 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=462367, ups=1.06, wpb=435124, bsz=16228.6, num_updates=58400, lr=0.000261712, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.4, wall=57339 epoch 035: 1067 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=462367, ups=1.06, wpb=435124, bsz=16228.6, num_updates=58400, lr=0.000261712, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.4, wall=57339 epoch 035: 1067 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=462367, ups=1.06, wpb=435124, bsz=16228.6, num_updates=58400, lr=0.000261712, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.4, wall=57339 epoch 035: 1067 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=462367, ups=1.06, wpb=435124, bsz=16228.6, num_updates=58400, lr=0.000261712, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.4, wall=57339 epoch 035: 1067 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=462367, ups=1.06, wpb=435124, bsz=16228.6, num_updates=58400, lr=0.000261712, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.4, wall=57339 epoch 035: 1067 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=462367, ups=1.06, wpb=435124, bsz=16228.6, num_updates=58400, lr=0.000261712, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.4, wall=57339 epoch 035: 1067 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=462367, ups=1.06, wpb=435124, bsz=16228.6, num_updates=58400, lr=0.000261712, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.4, wall=57339 epoch 035: 1067 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=462367, ups=1.06, wpb=435124, bsz=16228.6, num_updates=58400, lr=0.000261712, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.4, wall=57339 epoch 035: 1067 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=462367, ups=1.06, wpb=435124, bsz=16228.6, num_updates=58400, lr=0.000261712, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.4, wall=57339 epoch 035: 1067 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=462367, ups=1.06, wpb=435124, bsz=16228.6, num_updates=58400, lr=0.000261712, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.4, wall=57339 epoch 035: 1067 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=462367, ups=1.06, wpb=435124, bsz=16228.6, num_updates=58400, lr=0.000261712, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.4, wall=57339 epoch 035: 1067 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=462367, ups=1.06, wpb=435124, bsz=16228.6, num_updates=58400, lr=0.000261712, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.4, wall=57339 epoch 035: 1067 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=462367, ups=1.06, wpb=435124, bsz=16228.6, num_updates=58400, lr=0.000261712, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.4, wall=57339 epoch 035: 1067 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=462367, ups=1.06, wpb=435124, bsz=16228.6, num_updates=58400, lr=0.000261712, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.4, wall=57339 epoch 035: 1067 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=462367, ups=1.06, wpb=435124, bsz=16228.6, num_updates=58400, lr=0.000261712, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.4, wall=57339 epoch 035: 1067 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=462367, ups=1.06, wpb=435124, bsz=16228.6, num_updates=58400, lr=0.000261712, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.4, wall=57339 epoch 035: 1067 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=462367, ups=1.06, wpb=435124, bsz=16228.6, num_updates=58400, lr=0.000261712, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.4, wall=57339 epoch 035: 1067 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=462367, ups=1.06, wpb=435124, bsz=16228.6, num_updates=58400, lr=0.000261712, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.4, wall=57339 epoch 035: 1067 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=462367, ups=1.06, wpb=435124, bsz=16228.6, num_updates=58400, lr=0.000261712, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.4, wall=57339 epoch 035: 1067 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=462367, ups=1.06, wpb=435124, bsz=16228.6, num_updates=58400, lr=0.000261712, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.4, wall=57339 epoch 035: 1067 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=462367, ups=1.06, wpb=435124, bsz=16228.6, num_updates=58400, lr=0.000261712, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.4, wall=57339 epoch 035: 1067 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=462367, ups=1.06, wpb=435124, bsz=16228.6, num_updates=58400, lr=0.000261712, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.4, wall=57339 epoch 035: 1067 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=462367, ups=1.06, wpb=435124, bsz=16228.6, num_updates=58400, lr=0.000261712, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.4, wall=57339 epoch 035: 1067 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=462367, ups=1.06, wpb=435124, bsz=16228.6, num_updates=58400, lr=0.000261712, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.4, wall=57339 epoch 035: 1067 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=462367, ups=1.06, wpb=435124, bsz=16228.6, num_updates=58400, lr=0.000261712, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.4, wall=57339 epoch 035: 1067 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=462367, ups=1.06, wpb=435124, bsz=16228.6, num_updates=58400, lr=0.000261712, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.4, wall=57339 epoch 035: 1067 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=462367, ups=1.06, wpb=435124, bsz=16228.6, num_updates=58400, lr=0.000261712, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.4, wall=57339 epoch 035: 1067 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=462367, ups=1.06, wpb=435124, bsz=16228.6, num_updates=58400, lr=0.000261712, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.4, wall=57339 epoch 035: 1067 / 1689 loss=3.953, nll_loss=2.31, ppl=4.96, wps=462367, ups=1.06, wpb=435124, bsz=16228.6, num_updates=58400, lr=0.000261712, gnorm=0.213, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.4, wall=57339 epoch 035: 1167 / 1689 loss=3.954, nll_loss=2.311, ppl=4.96, wps=459402, ups=1.06, wpb=432239, bsz=16290.8, num_updates=58500, lr=0.000261488, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.9, wall=57434 epoch 035: 1167 / 1689 loss=3.954, nll_loss=2.311, ppl=4.96, wps=459402, ups=1.06, wpb=432239, bsz=16290.8, num_updates=58500, lr=0.000261488, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.9, wall=57434 epoch 035: 1167 / 1689 loss=3.954, nll_loss=2.311, ppl=4.96, wps=459402, ups=1.06, wpb=432239, bsz=16290.8, num_updates=58500, lr=0.000261488, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.9, wall=57434 epoch 035: 1167 / 1689 loss=3.954, nll_loss=2.311, ppl=4.96, wps=459402, ups=1.06, wpb=432239, bsz=16290.8, num_updates=58500, lr=0.000261488, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.9, wall=57434 epoch 035: 1167 / 1689 loss=3.954, nll_loss=2.311, ppl=4.96, wps=459402, ups=1.06, wpb=432239, bsz=16290.8, num_updates=58500, lr=0.000261488, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.9, wall=57434 epoch 035: 1167 / 1689 loss=3.954, nll_loss=2.311, ppl=4.96, wps=459402, ups=1.06, wpb=432239, bsz=16290.8, num_updates=58500, lr=0.000261488, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.9, wall=57434 epoch 035: 1167 / 1689 loss=3.954, nll_loss=2.311, ppl=4.96, wps=459402, ups=1.06, wpb=432239, bsz=16290.8, num_updates=58500, lr=0.000261488, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.9, wall=57434 epoch 035: 1167 / 1689 loss=3.954, nll_loss=2.311, ppl=4.96, wps=459402, ups=1.06, wpb=432239, bsz=16290.8, num_updates=58500, lr=0.000261488, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.9, wall=57434 epoch 035: 1167 / 1689 loss=3.954, nll_loss=2.311, ppl=4.96, wps=459402, ups=1.06, wpb=432239, bsz=16290.8, num_updates=58500, lr=0.000261488, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.9, wall=57434 epoch 035: 1167 / 1689 loss=3.954, nll_loss=2.311, ppl=4.96, wps=459402, ups=1.06, wpb=432239, bsz=16290.8, num_updates=58500, lr=0.000261488, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.9, wall=57434 epoch 035: 1167 / 1689 loss=3.954, nll_loss=2.311, ppl=4.96, wps=459402, ups=1.06, wpb=432239, bsz=16290.8, num_updates=58500, lr=0.000261488, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.9, wall=57434 epoch 035: 1167 / 1689 loss=3.954, nll_loss=2.311, ppl=4.96, wps=459402, ups=1.06, wpb=432239, bsz=16290.8, num_updates=58500, lr=0.000261488, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.9, wall=57434 epoch 035: 1167 / 1689 loss=3.954, nll_loss=2.311, ppl=4.96, wps=459402, ups=1.06, wpb=432239, bsz=16290.8, num_updates=58500, lr=0.000261488, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.9, wall=57434 epoch 035: 1167 / 1689 loss=3.954, nll_loss=2.311, ppl=4.96, wps=459402, ups=1.06, wpb=432239, bsz=16290.8, num_updates=58500, lr=0.000261488, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.9, wall=57434 epoch 035: 1167 / 1689 loss=3.954, nll_loss=2.311, ppl=4.96, wps=459402, ups=1.06, wpb=432239, bsz=16290.8, num_updates=58500, lr=0.000261488, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.9, wall=57434 epoch 035: 1167 / 1689 loss=3.954, nll_loss=2.311, ppl=4.96, wps=459402, ups=1.06, wpb=432239, bsz=16290.8, num_updates=58500, lr=0.000261488, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.9, wall=57434 epoch 035: 1167 / 1689 loss=3.954, nll_loss=2.311, ppl=4.96, wps=459402, ups=1.06, wpb=432239, bsz=16290.8, num_updates=58500, lr=0.000261488, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.9, wall=57434 epoch 035: 1167 / 1689 loss=3.954, nll_loss=2.311, ppl=4.96, wps=459402, ups=1.06, wpb=432239, bsz=16290.8, num_updates=58500, lr=0.000261488, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.9, wall=57434 epoch 035: 1167 / 1689 loss=3.954, nll_loss=2.311, ppl=4.96, wps=459402, ups=1.06, wpb=432239, bsz=16290.8, num_updates=58500, lr=0.000261488, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.9, wall=57434 epoch 035: 1167 / 1689 loss=3.954, nll_loss=2.311, ppl=4.96, wps=459402, ups=1.06, wpb=432239, bsz=16290.8, num_updates=58500, lr=0.000261488, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.9, wall=57434 epoch 035: 1167 / 1689 loss=3.954, nll_loss=2.311, ppl=4.96, wps=459402, ups=1.06, wpb=432239, bsz=16290.8, num_updates=58500, lr=0.000261488, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.9, wall=57434 epoch 035: 1167 / 1689 loss=3.954, nll_loss=2.311, ppl=4.96, wps=459402, ups=1.06, wpb=432239, bsz=16290.8, num_updates=58500, lr=0.000261488, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.9, wall=57434 epoch 035: 1167 / 1689 loss=3.954, nll_loss=2.311, ppl=4.96, wps=459402, ups=1.06, wpb=432239, bsz=16290.8, num_updates=58500, lr=0.000261488, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.9, wall=57434 epoch 035: 1167 / 1689 loss=3.954, nll_loss=2.311, ppl=4.96, wps=459402, ups=1.06, wpb=432239, bsz=16290.8, num_updates=58500, lr=0.000261488, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.9, wall=57434 epoch 035: 1167 / 1689 loss=3.954, nll_loss=2.311, ppl=4.96, wps=459402, ups=1.06, wpb=432239, bsz=16290.8, num_updates=58500, lr=0.000261488, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.9, wall=57434 epoch 035: 1167 / 1689 loss=3.954, nll_loss=2.311, ppl=4.96, wps=459402, ups=1.06, wpb=432239, bsz=16290.8, num_updates=58500, lr=0.000261488, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.9, wall=57434 epoch 035: 1167 / 1689 loss=3.954, nll_loss=2.311, ppl=4.96, wps=459402, ups=1.06, wpb=432239, bsz=16290.8, num_updates=58500, lr=0.000261488, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.9, wall=57434 epoch 035: 1167 / 1689 loss=3.954, nll_loss=2.311, ppl=4.96, wps=459402, ups=1.06, wpb=432239, bsz=16290.8, num_updates=58500, lr=0.000261488, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.9, wall=57434 epoch 035: 1167 / 1689 loss=3.954, nll_loss=2.311, ppl=4.96, wps=459402, ups=1.06, wpb=432239, bsz=16290.8, num_updates=58500, lr=0.000261488, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.9, wall=57434 epoch 035: 1167 / 1689 loss=3.954, nll_loss=2.311, ppl=4.96, wps=459402, ups=1.06, wpb=432239, bsz=16290.8, num_updates=58500, lr=0.000261488, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.9, wall=57434 epoch 035: 1167 / 1689 loss=3.954, nll_loss=2.311, ppl=4.96, wps=459402, ups=1.06, wpb=432239, bsz=16290.8, num_updates=58500, lr=0.000261488, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.9, wall=57434 epoch 035: 1167 / 1689 loss=3.954, nll_loss=2.311, ppl=4.96, wps=459402, ups=1.06, wpb=432239, bsz=16290.8, num_updates=58500, lr=0.000261488, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.9, wall=57434 epoch 035: 1167 / 1689 loss=3.954, nll_loss=2.311, ppl=4.96, wps=459402, ups=1.06, wpb=432239, bsz=16290.8, num_updates=58500, lr=0.000261488, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.9, wall=57434 epoch 035: 1167 / 1689 loss=3.954, nll_loss=2.311, ppl=4.96, wps=459402, ups=1.06, wpb=432239, bsz=16290.8, num_updates=58500, lr=0.000261488, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.9, wall=57434 epoch 035: 1167 / 1689 loss=3.954, nll_loss=2.311, ppl=4.96, wps=459402, ups=1.06, wpb=432239, bsz=16290.8, num_updates=58500, lr=0.000261488, gnorm=0.211, clip=0, loss_scale=0.25, train_wall=92, gb_free=19.9, wall=57434 epoch 035: 1267 / 1689 loss=3.944, nll_loss=2.3, ppl=4.93, wps=454552, ups=1.05, wpb=430860, bsz=16199.8, num_updates=58600, lr=0.000261265, gnorm=0.223, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.5, wall=57528 epoch 035: 1267 / 1689 loss=3.944, nll_loss=2.3, ppl=4.93, wps=454552, ups=1.05, wpb=430860, bsz=16199.8, num_updates=58600, lr=0.000261265, gnorm=0.223, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.5, wall=57528 epoch 035: 1267 / 1689 loss=3.944, nll_loss=2.3, ppl=4.93, wps=454552, ups=1.05, wpb=430860, bsz=16199.8, num_updates=58600, lr=0.000261265, gnorm=0.223, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.5, wall=57528 epoch 035: 1267 / 1689 loss=3.944, nll_loss=2.3, ppl=4.93, wps=454552, ups=1.05, wpb=430860, bsz=16199.8, num_updates=58600, lr=0.000261265, gnorm=0.223, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.5, wall=57528 epoch 035: 1267 / 1689 loss=3.944, nll_loss=2.3, ppl=4.93, wps=454552, ups=1.05, wpb=430860, bsz=16199.8, num_updates=58600, lr=0.000261265, gnorm=0.223, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.5, wall=57528 epoch 035: 1267 / 1689 loss=3.944, nll_loss=2.3, ppl=4.93, wps=454552, ups=1.05, wpb=430860, bsz=16199.8, num_updates=58600, lr=0.000261265, gnorm=0.223, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.5, wall=57528 epoch 035: 1267 / 1689 loss=3.944, nll_loss=2.3, ppl=4.93, wps=454552, ups=1.05, wpb=430860, bsz=16199.8, num_updates=58600, lr=0.000261265, gnorm=0.223, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.5, wall=57528 epoch 035: 1267 / 1689 loss=3.944, nll_loss=2.3, ppl=4.93, wps=454552, ups=1.05, wpb=430860, bsz=16199.8, num_updates=58600, lr=0.000261265, gnorm=0.223, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.5, wall=57528 epoch 035: 1267 / 1689 loss=3.944, nll_loss=2.3, ppl=4.93, wps=454552, ups=1.05, wpb=430860, bsz=16199.8, num_updates=58600, lr=0.000261265, gnorm=0.223, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.5, wall=57528 epoch 035: 1267 / 1689 loss=3.944, nll_loss=2.3, ppl=4.93, wps=454552, ups=1.05, wpb=430860, bsz=16199.8, num_updates=58600, lr=0.000261265, gnorm=0.223, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.5, wall=57528 epoch 035: 1267 / 1689 loss=3.944, nll_loss=2.3, ppl=4.93, wps=454552, ups=1.05, wpb=430860, bsz=16199.8, num_updates=58600, lr=0.000261265, gnorm=0.223, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.5, wall=57528 epoch 035: 1267 / 1689 loss=3.944, nll_loss=2.3, ppl=4.93, wps=454552, ups=1.05, wpb=430860, bsz=16199.8, num_updates=58600, lr=0.000261265, gnorm=0.223, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.5, wall=57528 epoch 035: 1267 / 1689 loss=3.944, nll_loss=2.3, ppl=4.93, wps=454552, ups=1.05, wpb=430860, bsz=16199.8, num_updates=58600, lr=0.000261265, gnorm=0.223, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.5, wall=57528 epoch 035: 1267 / 1689 loss=3.944, nll_loss=2.3, ppl=4.93, wps=454552, ups=1.05, wpb=430860, bsz=16199.8, num_updates=58600, lr=0.000261265, gnorm=0.223, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.5, wall=57528 epoch 035: 1267 / 1689 loss=3.944, nll_loss=2.3, ppl=4.93, wps=454552, ups=1.05, wpb=430860, bsz=16199.8, num_updates=58600, lr=0.000261265, gnorm=0.223, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.5, wall=57528 epoch 035: 1267 / 1689 loss=3.944, nll_loss=2.3, ppl=4.93, wps=454552, ups=1.05, wpb=430860, bsz=16199.8, num_updates=58600, lr=0.000261265, gnorm=0.223, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.5, wall=57528 epoch 035: 1267 / 1689 loss=3.944, nll_loss=2.3, ppl=4.93, wps=454552, ups=1.05, wpb=430860, bsz=16199.8, num_updates=58600, lr=0.000261265, gnorm=0.223, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.5, wall=57528 epoch 035: 1267 / 1689 loss=3.944, nll_loss=2.3, ppl=4.93, wps=454552, ups=1.05, wpb=430860, bsz=16199.8, num_updates=58600, lr=0.000261265, gnorm=0.223, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.5, wall=57528 epoch 035: 1267 / 1689 loss=3.944, nll_loss=2.3, ppl=4.93, wps=454552, ups=1.05, wpb=430860, bsz=16199.8, num_updates=58600, lr=0.000261265, gnorm=0.223, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.5, wall=57528 epoch 035: 1267 / 1689 loss=3.944, nll_loss=2.3, ppl=4.93, wps=454552, ups=1.05, wpb=430860, bsz=16199.8, num_updates=58600, lr=0.000261265, gnorm=0.223, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.5, wall=57528 epoch 035: 1267 / 1689 loss=3.944, nll_loss=2.3, ppl=4.93, wps=454552, ups=1.05, wpb=430860, bsz=16199.8, num_updates=58600, lr=0.000261265, gnorm=0.223, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.5, wall=57528 epoch 035: 1267 / 1689 loss=3.944, nll_loss=2.3, ppl=4.93, wps=454552, ups=1.05, wpb=430860, bsz=16199.8, num_updates=58600, lr=0.000261265, gnorm=0.223, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.5, wall=57528 epoch 035: 1267 / 1689 loss=3.944, nll_loss=2.3, ppl=4.93, wps=454552, ups=1.05, wpb=430860, bsz=16199.8, num_updates=58600, lr=0.000261265, gnorm=0.223, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.5, wall=57528 epoch 035: 1267 / 1689 loss=3.944, nll_loss=2.3, ppl=4.93, wps=454552, ups=1.05, wpb=430860, bsz=16199.8, num_updates=58600, lr=0.000261265, gnorm=0.223, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.5, wall=57528 epoch 035: 1267 / 1689 loss=3.944, nll_loss=2.3, ppl=4.93, wps=454552, ups=1.05, wpb=430860, bsz=16199.8, num_updates=58600, lr=0.000261265, gnorm=0.223, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.5, wall=57528 epoch 035: 1267 / 1689 loss=3.944, nll_loss=2.3, ppl=4.93, wps=454552, ups=1.05, wpb=430860, bsz=16199.8, num_updates=58600, lr=0.000261265, gnorm=0.223, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.5, wall=57528 epoch 035: 1267 / 1689 loss=3.944, nll_loss=2.3, ppl=4.93, wps=454552, ups=1.05, wpb=430860, bsz=16199.8, num_updates=58600, lr=0.000261265, gnorm=0.223, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.5, wall=57528 epoch 035: 1267 / 1689 loss=3.944, nll_loss=2.3, ppl=4.93, wps=454552, ups=1.05, wpb=430860, bsz=16199.8, num_updates=58600, lr=0.000261265, gnorm=0.223, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.5, wall=57528 epoch 035: 1267 / 1689 loss=3.944, nll_loss=2.3, ppl=4.93, wps=454552, ups=1.05, wpb=430860, bsz=16199.8, num_updates=58600, lr=0.000261265, gnorm=0.223, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.5, wall=57528 epoch 035: 1267 / 1689 loss=3.944, nll_loss=2.3, ppl=4.93, wps=454552, ups=1.05, wpb=430860, bsz=16199.8, num_updates=58600, lr=0.000261265, gnorm=0.223, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.5, wall=57528 epoch 035: 1267 / 1689 loss=3.944, nll_loss=2.3, ppl=4.93, wps=454552, ups=1.05, wpb=430860, bsz=16199.8, num_updates=58600, lr=0.000261265, gnorm=0.223, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.5, wall=57528 epoch 035: 1267 / 1689 loss=3.944, nll_loss=2.3, ppl=4.93, wps=454552, ups=1.05, wpb=430860, bsz=16199.8, num_updates=58600, lr=0.000261265, gnorm=0.223, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.5, wall=57528 epoch 035: 1267 / 1689 loss=3.944, nll_loss=2.3, ppl=4.93, wps=454552, ups=1.05, wpb=430860, bsz=16199.8, num_updates=58600, lr=0.000261265, gnorm=0.223, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.5, wall=57528 epoch 035: 1267 / 1689 loss=3.944, nll_loss=2.3, ppl=4.93, wps=454552, ups=1.05, wpb=430860, bsz=16199.8, num_updates=58600, lr=0.000261265, gnorm=0.223, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.5, wall=57528 epoch 035: 1267 / 1689 loss=3.944, nll_loss=2.3, ppl=4.93, wps=454552, ups=1.05, wpb=430860, bsz=16199.8, num_updates=58600, lr=0.000261265, gnorm=0.223, clip=0, loss_scale=0.25, train_wall=93, gb_free=18.5, wall=57528 epoch 035: 1367 / 1689 loss=3.94, nll_loss=2.296, ppl=4.91, wps=458686, ups=1.06, wpb=433071, bsz=16384.7, num_updates=58700, lr=0.000261042, gnorm=0.218, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.1, wall=57623 epoch 035: 1367 / 1689 loss=3.94, nll_loss=2.296, ppl=4.91, wps=458686, ups=1.06, wpb=433071, bsz=16384.7, num_updates=58700, lr=0.000261042, gnorm=0.218, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.1, wall=57623 epoch 035: 1367 / 1689 loss=3.94, nll_loss=2.296, ppl=4.91, wps=458686, ups=1.06, wpb=433071, bsz=16384.7, num_updates=58700, lr=0.000261042, gnorm=0.218, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.1, wall=57623 epoch 035: 1367 / 1689 loss=3.94, nll_loss=2.296, ppl=4.91, wps=458686, ups=1.06, wpb=433071, bsz=16384.7, num_updates=58700, lr=0.000261042, gnorm=0.218, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.1, wall=57623 epoch 035: 1367 / 1689 loss=3.94, nll_loss=2.296, ppl=4.91, wps=458686, ups=1.06, wpb=433071, bsz=16384.7, num_updates=58700, lr=0.000261042, gnorm=0.218, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.1, wall=57623 epoch 035: 1367 / 1689 loss=3.94, nll_loss=2.296, ppl=4.91, wps=458686, ups=1.06, wpb=433071, bsz=16384.7, num_updates=58700, lr=0.000261042, gnorm=0.218, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.1, wall=57623 epoch 035: 1367 / 1689 loss=3.94, nll_loss=2.296, ppl=4.91, wps=458686, ups=1.06, wpb=433071, bsz=16384.7, num_updates=58700, lr=0.000261042, gnorm=0.218, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.1, wall=57623 epoch 035: 1367 / 1689 loss=3.94, nll_loss=2.296, ppl=4.91, wps=458686, ups=1.06, wpb=433071, bsz=16384.7, num_updates=58700, lr=0.000261042, gnorm=0.218, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.1, wall=57623 epoch 035: 1367 / 1689 loss=3.94, nll_loss=2.296, ppl=4.91, wps=458686, ups=1.06, wpb=433071, bsz=16384.7, num_updates=58700, lr=0.000261042, gnorm=0.218, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.1, wall=57623 epoch 035: 1367 / 1689 loss=3.94, nll_loss=2.296, ppl=4.91, wps=458686, ups=1.06, wpb=433071, bsz=16384.7, num_updates=58700, lr=0.000261042, gnorm=0.218, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.1, wall=57623 epoch 035: 1367 / 1689 loss=3.94, nll_loss=2.296, ppl=4.91, wps=458686, ups=1.06, wpb=433071, bsz=16384.7, num_updates=58700, lr=0.000261042, gnorm=0.218, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.1, wall=57623 epoch 035: 1367 / 1689 loss=3.94, nll_loss=2.296, ppl=4.91, wps=458686, ups=1.06, wpb=433071, bsz=16384.7, num_updates=58700, lr=0.000261042, gnorm=0.218, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.1, wall=57623 epoch 035: 1367 / 1689 loss=3.94, nll_loss=2.296, ppl=4.91, wps=458686, ups=1.06, wpb=433071, bsz=16384.7, num_updates=58700, lr=0.000261042, gnorm=0.218, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.1, wall=57623 epoch 035: 1367 / 1689 loss=3.94, nll_loss=2.296, ppl=4.91, wps=458686, ups=1.06, wpb=433071, bsz=16384.7, num_updates=58700, lr=0.000261042, gnorm=0.218, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.1, wall=57623 epoch 035: 1367 / 1689 loss=3.94, nll_loss=2.296, ppl=4.91, wps=458686, ups=1.06, wpb=433071, bsz=16384.7, num_updates=58700, lr=0.000261042, gnorm=0.218, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.1, wall=57623 epoch 035: 1367 / 1689 loss=3.94, nll_loss=2.296, ppl=4.91, wps=458686, ups=1.06, wpb=433071, bsz=16384.7, num_updates=58700, lr=0.000261042, gnorm=0.218, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.1, wall=57623 epoch 035: 1367 / 1689 loss=3.94, nll_loss=2.296, ppl=4.91, wps=458686, ups=1.06, wpb=433071, bsz=16384.7, num_updates=58700, lr=0.000261042, gnorm=0.218, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.1, wall=57623 epoch 035: 1367 / 1689 loss=3.94, nll_loss=2.296, ppl=4.91, wps=458686, ups=1.06, wpb=433071, bsz=16384.7, num_updates=58700, lr=0.000261042, gnorm=0.218, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.1, wall=57623 epoch 035: 1367 / 1689 loss=3.94, nll_loss=2.296, ppl=4.91, wps=458686, ups=1.06, wpb=433071, bsz=16384.7, num_updates=58700, lr=0.000261042, gnorm=0.218, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.1, wall=57623 epoch 035: 1367 / 1689 loss=3.94, nll_loss=2.296, ppl=4.91, wps=458686, ups=1.06, wpb=433071, bsz=16384.7, num_updates=58700, lr=0.000261042, gnorm=0.218, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.1, wall=57623 epoch 035: 1367 / 1689 loss=3.94, nll_loss=2.296, ppl=4.91, wps=458686, ups=1.06, wpb=433071, bsz=16384.7, num_updates=58700, lr=0.000261042, gnorm=0.218, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.1, wall=57623 epoch 035: 1367 / 1689 loss=3.94, nll_loss=2.296, ppl=4.91, wps=458686, ups=1.06, wpb=433071, bsz=16384.7, num_updates=58700, lr=0.000261042, gnorm=0.218, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.1, wall=57623 epoch 035: 1367 / 1689 loss=3.94, nll_loss=2.296, ppl=4.91, wps=458686, ups=1.06, wpb=433071, bsz=16384.7, num_updates=58700, lr=0.000261042, gnorm=0.218, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.1, wall=57623 epoch 035: 1367 / 1689 loss=3.94, nll_loss=2.296, ppl=4.91, wps=458686, ups=1.06, wpb=433071, bsz=16384.7, num_updates=58700, lr=0.000261042, gnorm=0.218, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.1, wall=57623 epoch 035: 1367 / 1689 loss=3.94, nll_loss=2.296, ppl=4.91, wps=458686, ups=1.06, wpb=433071, bsz=16384.7, num_updates=58700, lr=0.000261042, gnorm=0.218, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.1, wall=57623 epoch 035: 1367 / 1689 loss=3.94, nll_loss=2.296, ppl=4.91, wps=458686, ups=1.06, wpb=433071, bsz=16384.7, num_updates=58700, lr=0.000261042, gnorm=0.218, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.1, wall=57623 epoch 035: 1367 / 1689 loss=3.94, nll_loss=2.296, ppl=4.91, wps=458686, ups=1.06, wpb=433071, bsz=16384.7, num_updates=58700, lr=0.000261042, gnorm=0.218, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.1, wall=57623 epoch 035: 1367 / 1689 loss=3.94, nll_loss=2.296, ppl=4.91, wps=458686, ups=1.06, wpb=433071, bsz=16384.7, num_updates=58700, lr=0.000261042, gnorm=0.218, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.1, wall=57623 epoch 035: 1367 / 1689 loss=3.94, nll_loss=2.296, ppl=4.91, wps=458686, ups=1.06, wpb=433071, bsz=16384.7, num_updates=58700, lr=0.000261042, gnorm=0.218, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.1, wall=57623 epoch 035: 1367 / 1689 loss=3.94, nll_loss=2.296, ppl=4.91, wps=458686, ups=1.06, wpb=433071, bsz=16384.7, num_updates=58700, lr=0.000261042, gnorm=0.218, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.1, wall=57623 epoch 035: 1367 / 1689 loss=3.94, nll_loss=2.296, ppl=4.91, wps=458686, ups=1.06, wpb=433071, bsz=16384.7, num_updates=58700, lr=0.000261042, gnorm=0.218, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.1, wall=57623 epoch 035: 1367 / 1689 loss=3.94, nll_loss=2.296, ppl=4.91, wps=458686, ups=1.06, wpb=433071, bsz=16384.7, num_updates=58700, lr=0.000261042, gnorm=0.218, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.1, wall=57623 epoch 035: 1367 / 1689 loss=3.94, nll_loss=2.296, ppl=4.91, wps=458686, ups=1.06, wpb=433071, bsz=16384.7, num_updates=58700, lr=0.000261042, gnorm=0.218, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.1, wall=57623 epoch 035: 1367 / 1689 loss=3.94, nll_loss=2.296, ppl=4.91, wps=458686, ups=1.06, wpb=433071, bsz=16384.7, num_updates=58700, lr=0.000261042, gnorm=0.218, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.1, wall=57623 epoch 035: 1367 / 1689 loss=3.94, nll_loss=2.296, ppl=4.91, wps=458686, ups=1.06, wpb=433071, bsz=16384.7, num_updates=58700, lr=0.000261042, gnorm=0.218, clip=0, loss_scale=0.25, train_wall=93, gb_free=19.1, wall=57623 epoch 035: 1467 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=456677, ups=1.05, wpb=435545, bsz=16625.8, num_updates=58800, lr=0.00026082, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=57718 epoch 035: 1467 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=456677, ups=1.05, wpb=435545, bsz=16625.8, num_updates=58800, lr=0.00026082, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=57718 epoch 035: 1467 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=456677, ups=1.05, wpb=435545, bsz=16625.8, num_updates=58800, lr=0.00026082, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=57718 epoch 035: 1467 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=456677, ups=1.05, wpb=435545, bsz=16625.8, num_updates=58800, lr=0.00026082, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=57718 epoch 035: 1467 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=456677, ups=1.05, wpb=435545, bsz=16625.8, num_updates=58800, lr=0.00026082, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=57718 epoch 035: 1467 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=456677, ups=1.05, wpb=435545, bsz=16625.8, num_updates=58800, lr=0.00026082, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=57718 epoch 035: 1467 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=456677, ups=1.05, wpb=435545, bsz=16625.8, num_updates=58800, lr=0.00026082, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=57718 epoch 035: 1467 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=456677, ups=1.05, wpb=435545, bsz=16625.8, num_updates=58800, lr=0.00026082, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=57718 epoch 035: 1467 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=456677, ups=1.05, wpb=435545, bsz=16625.8, num_updates=58800, lr=0.00026082, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=57718 epoch 035: 1467 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=456677, ups=1.05, wpb=435545, bsz=16625.8, num_updates=58800, lr=0.00026082, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=57718 epoch 035: 1467 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=456677, ups=1.05, wpb=435545, bsz=16625.8, num_updates=58800, lr=0.00026082, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=57718 epoch 035: 1467 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=456677, ups=1.05, wpb=435545, bsz=16625.8, num_updates=58800, lr=0.00026082, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=57718 epoch 035: 1467 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=456677, ups=1.05, wpb=435545, bsz=16625.8, num_updates=58800, lr=0.00026082, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=57718 epoch 035: 1467 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=456677, ups=1.05, wpb=435545, bsz=16625.8, num_updates=58800, lr=0.00026082, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=57718 epoch 035: 1467 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=456677, ups=1.05, wpb=435545, bsz=16625.8, num_updates=58800, lr=0.00026082, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=57718 epoch 035: 1467 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=456677, ups=1.05, wpb=435545, bsz=16625.8, num_updates=58800, lr=0.00026082, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=57718 epoch 035: 1467 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=456677, ups=1.05, wpb=435545, bsz=16625.8, num_updates=58800, lr=0.00026082, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=57718 epoch 035: 1467 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=456677, ups=1.05, wpb=435545, bsz=16625.8, num_updates=58800, lr=0.00026082, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=57718 epoch 035: 1467 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=456677, ups=1.05, wpb=435545, bsz=16625.8, num_updates=58800, lr=0.00026082, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=57718 epoch 035: 1467 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=456677, ups=1.05, wpb=435545, bsz=16625.8, num_updates=58800, lr=0.00026082, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=57718 epoch 035: 1467 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=456677, ups=1.05, wpb=435545, bsz=16625.8, num_updates=58800, lr=0.00026082, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=57718 epoch 035: 1467 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=456677, ups=1.05, wpb=435545, bsz=16625.8, num_updates=58800, lr=0.00026082, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=57718 epoch 035: 1467 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=456677, ups=1.05, wpb=435545, bsz=16625.8, num_updates=58800, lr=0.00026082, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=57718 epoch 035: 1467 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=456677, ups=1.05, wpb=435545, bsz=16625.8, num_updates=58800, lr=0.00026082, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=57718 epoch 035: 1467 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=456677, ups=1.05, wpb=435545, bsz=16625.8, num_updates=58800, lr=0.00026082, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=57718 epoch 035: 1467 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=456677, ups=1.05, wpb=435545, bsz=16625.8, num_updates=58800, lr=0.00026082, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=57718 epoch 035: 1467 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=456677, ups=1.05, wpb=435545, bsz=16625.8, num_updates=58800, lr=0.00026082, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=57718 epoch 035: 1467 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=456677, ups=1.05, wpb=435545, bsz=16625.8, num_updates=58800, lr=0.00026082, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=57718 epoch 035: 1467 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=456677, ups=1.05, wpb=435545, bsz=16625.8, num_updates=58800, lr=0.00026082, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=57718 epoch 035: 1467 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=456677, ups=1.05, wpb=435545, bsz=16625.8, num_updates=58800, lr=0.00026082, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=57718 epoch 035: 1467 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=456677, ups=1.05, wpb=435545, bsz=16625.8, num_updates=58800, lr=0.00026082, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=57718 epoch 035: 1467 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=456677, ups=1.05, wpb=435545, bsz=16625.8, num_updates=58800, lr=0.00026082, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=57718 epoch 035: 1467 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=456677, ups=1.05, wpb=435545, bsz=16625.8, num_updates=58800, lr=0.00026082, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=57718 epoch 035: 1467 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=456677, ups=1.05, wpb=435545, bsz=16625.8, num_updates=58800, lr=0.00026082, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=57718 epoch 035: 1467 / 1689 loss=3.955, nll_loss=2.313, ppl=4.97, wps=456677, ups=1.05, wpb=435545, bsz=16625.8, num_updates=58800, lr=0.00026082, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=57718 epoch 035: 1567 / 1689 loss=3.949, nll_loss=2.307, ppl=4.95, wps=456380, ups=1.05, wpb=434224, bsz=16881.7, num_updates=58900, lr=0.000260599, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.2, wall=57813 epoch 035: 1567 / 1689 loss=3.949, nll_loss=2.307, ppl=4.95, wps=456380, ups=1.05, wpb=434224, bsz=16881.7, num_updates=58900, lr=0.000260599, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.2, wall=57813 epoch 035: 1567 / 1689 loss=3.949, nll_loss=2.307, ppl=4.95, wps=456380, ups=1.05, wpb=434224, bsz=16881.7, num_updates=58900, lr=0.000260599, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.2, wall=57813 epoch 035: 1567 / 1689 loss=3.949, nll_loss=2.307, ppl=4.95, wps=456380, ups=1.05, wpb=434224, bsz=16881.7, num_updates=58900, lr=0.000260599, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.2, wall=57813 epoch 035: 1567 / 1689 loss=3.949, nll_loss=2.307, ppl=4.95, wps=456380, ups=1.05, wpb=434224, bsz=16881.7, num_updates=58900, lr=0.000260599, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.2, wall=57813 epoch 035: 1567 / 1689 loss=3.949, nll_loss=2.307, ppl=4.95, wps=456380, ups=1.05, wpb=434224, bsz=16881.7, num_updates=58900, lr=0.000260599, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.2, wall=57813 epoch 035: 1567 / 1689 loss=3.949, nll_loss=2.307, ppl=4.95, wps=456380, ups=1.05, wpb=434224, bsz=16881.7, num_updates=58900, lr=0.000260599, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.2, wall=57813 epoch 035: 1567 / 1689 loss=3.949, nll_loss=2.307, ppl=4.95, wps=456380, ups=1.05, wpb=434224, bsz=16881.7, num_updates=58900, lr=0.000260599, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.2, wall=57813 epoch 035: 1567 / 1689 loss=3.949, nll_loss=2.307, ppl=4.95, wps=456380, ups=1.05, wpb=434224, bsz=16881.7, num_updates=58900, lr=0.000260599, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.2, wall=57813 epoch 035: 1567 / 1689 loss=3.949, nll_loss=2.307, ppl=4.95, wps=456380, ups=1.05, wpb=434224, bsz=16881.7, num_updates=58900, lr=0.000260599, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.2, wall=57813 epoch 035: 1567 / 1689 loss=3.949, nll_loss=2.307, ppl=4.95, wps=456380, ups=1.05, wpb=434224, bsz=16881.7, num_updates=58900, lr=0.000260599, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.2, wall=57813 epoch 035: 1567 / 1689 loss=3.949, nll_loss=2.307, ppl=4.95, wps=456380, ups=1.05, wpb=434224, bsz=16881.7, num_updates=58900, lr=0.000260599, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.2, wall=57813 epoch 035: 1567 / 1689 loss=3.949, nll_loss=2.307, ppl=4.95, wps=456380, ups=1.05, wpb=434224, bsz=16881.7, num_updates=58900, lr=0.000260599, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.2, wall=57813 epoch 035: 1567 / 1689 loss=3.949, nll_loss=2.307, ppl=4.95, wps=456380, ups=1.05, wpb=434224, bsz=16881.7, num_updates=58900, lr=0.000260599, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.2, wall=57813 epoch 035: 1567 / 1689 loss=3.949, nll_loss=2.307, ppl=4.95, wps=456380, ups=1.05, wpb=434224, bsz=16881.7, num_updates=58900, lr=0.000260599, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.2, wall=57813 epoch 035: 1567 / 1689 loss=3.949, nll_loss=2.307, ppl=4.95, wps=456380, ups=1.05, wpb=434224, bsz=16881.7, num_updates=58900, lr=0.000260599, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.2, wall=57813 epoch 035: 1567 / 1689 loss=3.949, nll_loss=2.307, ppl=4.95, wps=456380, ups=1.05, wpb=434224, bsz=16881.7, num_updates=58900, lr=0.000260599, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.2, wall=57813 epoch 035: 1567 / 1689 loss=3.949, nll_loss=2.307, ppl=4.95, wps=456380, ups=1.05, wpb=434224, bsz=16881.7, num_updates=58900, lr=0.000260599, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.2, wall=57813 epoch 035: 1567 / 1689 loss=3.949, nll_loss=2.307, ppl=4.95, wps=456380, ups=1.05, wpb=434224, bsz=16881.7, num_updates=58900, lr=0.000260599, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.2, wall=57813 epoch 035: 1567 / 1689 loss=3.949, nll_loss=2.307, ppl=4.95, wps=456380, ups=1.05, wpb=434224, bsz=16881.7, num_updates=58900, lr=0.000260599, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.2, wall=57813 epoch 035: 1567 / 1689 loss=3.949, nll_loss=2.307, ppl=4.95, wps=456380, ups=1.05, wpb=434224, bsz=16881.7, num_updates=58900, lr=0.000260599, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.2, wall=57813 epoch 035: 1567 / 1689 loss=3.949, nll_loss=2.307, ppl=4.95, wps=456380, ups=1.05, wpb=434224, bsz=16881.7, num_updates=58900, lr=0.000260599, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.2, wall=57813 epoch 035: 1567 / 1689 loss=3.949, nll_loss=2.307, ppl=4.95, wps=456380, ups=1.05, wpb=434224, bsz=16881.7, num_updates=58900, lr=0.000260599, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.2, wall=57813 epoch 035: 1567 / 1689 loss=3.949, nll_loss=2.307, ppl=4.95, wps=456380, ups=1.05, wpb=434224, bsz=16881.7, num_updates=58900, lr=0.000260599, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.2, wall=57813 epoch 035: 1567 / 1689 loss=3.949, nll_loss=2.307, ppl=4.95, wps=456380, ups=1.05, wpb=434224, bsz=16881.7, num_updates=58900, lr=0.000260599, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.2, wall=57813 epoch 035: 1567 / 1689 loss=3.949, nll_loss=2.307, ppl=4.95, wps=456380, ups=1.05, wpb=434224, bsz=16881.7, num_updates=58900, lr=0.000260599, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.2, wall=57813 epoch 035: 1567 / 1689 loss=3.949, nll_loss=2.307, ppl=4.95, wps=456380, ups=1.05, wpb=434224, bsz=16881.7, num_updates=58900, lr=0.000260599, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.2, wall=57813 epoch 035: 1567 / 1689 loss=3.949, nll_loss=2.307, ppl=4.95, wps=456380, ups=1.05, wpb=434224, bsz=16881.7, num_updates=58900, lr=0.000260599, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.2, wall=57813 epoch 035: 1567 / 1689 loss=3.949, nll_loss=2.307, ppl=4.95, wps=456380, ups=1.05, wpb=434224, bsz=16881.7, num_updates=58900, lr=0.000260599, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.2, wall=57813 epoch 035: 1567 / 1689 loss=3.949, nll_loss=2.307, ppl=4.95, wps=456380, ups=1.05, wpb=434224, bsz=16881.7, num_updates=58900, lr=0.000260599, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.2, wall=57813 epoch 035: 1567 / 1689 loss=3.949, nll_loss=2.307, ppl=4.95, wps=456380, ups=1.05, wpb=434224, bsz=16881.7, num_updates=58900, lr=0.000260599, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.2, wall=57813 epoch 035: 1567 / 1689 loss=3.949, nll_loss=2.307, ppl=4.95, wps=456380, ups=1.05, wpb=434224, bsz=16881.7, num_updates=58900, lr=0.000260599, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.2, wall=57813 epoch 035: 1567 / 1689 loss=3.949, nll_loss=2.307, ppl=4.95, wps=456380, ups=1.05, wpb=434224, bsz=16881.7, num_updates=58900, lr=0.000260599, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.2, wall=57813 epoch 035: 1567 / 1689 loss=3.949, nll_loss=2.307, ppl=4.95, wps=456380, ups=1.05, wpb=434224, bsz=16881.7, num_updates=58900, lr=0.000260599, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.2, wall=57813 epoch 035: 1567 / 1689 loss=3.949, nll_loss=2.307, ppl=4.95, wps=456380, ups=1.05, wpb=434224, bsz=16881.7, num_updates=58900, lr=0.000260599, gnorm=0.226, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.2, wall=57813 epoch 035: 1667 / 1689 loss=3.957, nll_loss=2.316, ppl=4.98, wps=458417, ups=1.06, wpb=432273, bsz=16713.2, num_updates=59000, lr=0.000260378, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=57908 epoch 035: 1667 / 1689 loss=3.957, nll_loss=2.316, ppl=4.98, wps=458417, ups=1.06, wpb=432273, bsz=16713.2, num_updates=59000, lr=0.000260378, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=57908 epoch 035: 1667 / 1689 loss=3.957, nll_loss=2.316, ppl=4.98, wps=458417, ups=1.06, wpb=432273, bsz=16713.2, num_updates=59000, lr=0.000260378, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=57908 epoch 035: 1667 / 1689 loss=3.957, nll_loss=2.316, ppl=4.98, wps=458417, ups=1.06, wpb=432273, bsz=16713.2, num_updates=59000, lr=0.000260378, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=57908 epoch 035: 1667 / 1689 loss=3.957, nll_loss=2.316, ppl=4.98, wps=458417, ups=1.06, wpb=432273, bsz=16713.2, num_updates=59000, lr=0.000260378, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=57908 epoch 035: 1667 / 1689 loss=3.957, nll_loss=2.316, ppl=4.98, wps=458417, ups=1.06, wpb=432273, bsz=16713.2, num_updates=59000, lr=0.000260378, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=57908 epoch 035: 1667 / 1689 loss=3.957, nll_loss=2.316, ppl=4.98, wps=458417, ups=1.06, wpb=432273, bsz=16713.2, num_updates=59000, lr=0.000260378, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=57908 epoch 035: 1667 / 1689 loss=3.957, nll_loss=2.316, ppl=4.98, wps=458417, ups=1.06, wpb=432273, bsz=16713.2, num_updates=59000, lr=0.000260378, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=57908 epoch 035: 1667 / 1689 loss=3.957, nll_loss=2.316, ppl=4.98, wps=458417, ups=1.06, wpb=432273, bsz=16713.2, num_updates=59000, lr=0.000260378, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=57908 epoch 035: 1667 / 1689 loss=3.957, nll_loss=2.316, ppl=4.98, wps=458417, ups=1.06, wpb=432273, bsz=16713.2, num_updates=59000, lr=0.000260378, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=57908 epoch 035: 1667 / 1689 loss=3.957, nll_loss=2.316, ppl=4.98, wps=458417, ups=1.06, wpb=432273, bsz=16713.2, num_updates=59000, lr=0.000260378, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=57908 epoch 035: 1667 / 1689 loss=3.957, nll_loss=2.316, ppl=4.98, wps=458417, ups=1.06, wpb=432273, bsz=16713.2, num_updates=59000, lr=0.000260378, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=57908 epoch 035: 1667 / 1689 loss=3.957, nll_loss=2.316, ppl=4.98, wps=458417, ups=1.06, wpb=432273, bsz=16713.2, num_updates=59000, lr=0.000260378, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=57908 epoch 035: 1667 / 1689 loss=3.957, nll_loss=2.316, ppl=4.98, wps=458417, ups=1.06, wpb=432273, bsz=16713.2, num_updates=59000, lr=0.000260378, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=57908 epoch 035: 1667 / 1689 loss=3.957, nll_loss=2.316, ppl=4.98, wps=458417, ups=1.06, wpb=432273, bsz=16713.2, num_updates=59000, lr=0.000260378, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=57908 epoch 035: 1667 / 1689 loss=3.957, nll_loss=2.316, ppl=4.98, wps=458417, ups=1.06, wpb=432273, bsz=16713.2, num_updates=59000, lr=0.000260378, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=57908 epoch 035: 1667 / 1689 loss=3.957, nll_loss=2.316, ppl=4.98, wps=458417, ups=1.06, wpb=432273, bsz=16713.2, num_updates=59000, lr=0.000260378, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=57908 epoch 035: 1667 / 1689 loss=3.957, nll_loss=2.316, ppl=4.98, wps=458417, ups=1.06, wpb=432273, bsz=16713.2, num_updates=59000, lr=0.000260378, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=57908 epoch 035: 1667 / 1689 loss=3.957, nll_loss=2.316, ppl=4.98, wps=458417, ups=1.06, wpb=432273, bsz=16713.2, num_updates=59000, lr=0.000260378, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=57908 epoch 035: 1667 / 1689 loss=3.957, nll_loss=2.316, ppl=4.98, wps=458417, ups=1.06, wpb=432273, bsz=16713.2, num_updates=59000, lr=0.000260378, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=57908 epoch 035: 1667 / 1689 loss=3.957, nll_loss=2.316, ppl=4.98, wps=458417, ups=1.06, wpb=432273, bsz=16713.2, num_updates=59000, lr=0.000260378, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=57908 epoch 035: 1667 / 1689 loss=3.957, nll_loss=2.316, ppl=4.98, wps=458417, ups=1.06, wpb=432273, bsz=16713.2, num_updates=59000, lr=0.000260378, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=57908 epoch 035: 1667 / 1689 loss=3.957, nll_loss=2.316, ppl=4.98, wps=458417, ups=1.06, wpb=432273, bsz=16713.2, num_updates=59000, lr=0.000260378, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=57908 epoch 035: 1667 / 1689 loss=3.957, nll_loss=2.316, ppl=4.98, wps=458417, ups=1.06, wpb=432273, bsz=16713.2, num_updates=59000, lr=0.000260378, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=57908 epoch 035: 1667 / 1689 loss=3.957, nll_loss=2.316, ppl=4.98, wps=458417, ups=1.06, wpb=432273, bsz=16713.2, num_updates=59000, lr=0.000260378, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=57908 epoch 035: 1667 / 1689 loss=3.957, nll_loss=2.316, ppl=4.98, wps=458417, ups=1.06, wpb=432273, bsz=16713.2, num_updates=59000, lr=0.000260378, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=57908 epoch 035: 1667 / 1689 loss=3.957, nll_loss=2.316, ppl=4.98, wps=458417, ups=1.06, wpb=432273, bsz=16713.2, num_updates=59000, lr=0.000260378, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=57908 epoch 035: 1667 / 1689 loss=3.957, nll_loss=2.316, ppl=4.98, wps=458417, ups=1.06, wpb=432273, bsz=16713.2, num_updates=59000, lr=0.000260378, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=57908 epoch 035: 1667 / 1689 loss=3.957, nll_loss=2.316, ppl=4.98, wps=458417, ups=1.06, wpb=432273, bsz=16713.2, num_updates=59000, lr=0.000260378, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=57908 epoch 035: 1667 / 1689 loss=3.957, nll_loss=2.316, ppl=4.98, wps=458417, ups=1.06, wpb=432273, bsz=16713.2, num_updates=59000, lr=0.000260378, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=57908 epoch 035: 1667 / 1689 loss=3.957, nll_loss=2.316, ppl=4.98, wps=458417, ups=1.06, wpb=432273, bsz=16713.2, num_updates=59000, lr=0.000260378, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=57908 epoch 035: 1667 / 1689 loss=3.957, nll_loss=2.316, ppl=4.98, wps=458417, ups=1.06, wpb=432273, bsz=16713.2, num_updates=59000, lr=0.000260378, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=57908 epoch 035: 1667 / 1689 loss=3.957, nll_loss=2.316, ppl=4.98, wps=458417, ups=1.06, wpb=432273, bsz=16713.2, num_updates=59000, lr=0.000260378, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=57908 epoch 035: 1667 / 1689 loss=3.957, nll_loss=2.316, ppl=4.98, wps=458417, ups=1.06, wpb=432273, bsz=16713.2, num_updates=59000, lr=0.000260378, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=57908 epoch 035: 1667 / 1689 loss=3.957, nll_loss=2.316, ppl=4.98, wps=458417, ups=1.06, wpb=432273, bsz=16713.2, num_updates=59000, lr=0.000260378, gnorm=0.211, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.6, wall=57908 begin validation on "valid" subset epoch 035 | valid on 'valid' subset | loss 4.243 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.237 epoch 035 | valid on 'valid' subset | loss 4.243 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.237 epoch 035 | valid on 'valid' subset | loss 4.243 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.237 epoch 035 | valid on 'valid' subset | loss 4.243 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.237 epoch 035 | valid on 'valid' subset | loss 4.243 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.237 epoch 035 | valid on 'valid' subset | loss 4.243 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.237 epoch 035 | valid on 'valid' subset | loss 4.243 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.237 epoch 035 | valid on 'valid' subset | loss 4.243 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.237 epoch 035 | valid on 'valid' subset | loss 4.243 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.237 epoch 035 | valid on 'valid' subset | loss 4.243 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.237 epoch 035 | valid on 'valid' subset | loss 4.243 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.237 epoch 035 | valid on 'valid' subset | loss 4.243 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.237 epoch 035 | valid on 'valid' subset | loss 4.243 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.237 epoch 035 | valid on 'valid' subset | loss 4.243 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.237 epoch 035 | valid on 'valid' subset | loss 4.243 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.237 epoch 035 | valid on 'valid' subset | loss 4.243 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.237 epoch 035 | valid on 'valid' subset | loss 4.243 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.237 epoch 035 | valid on 'valid' subset | loss 4.243 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.237 epoch 035 | valid on 'valid' subset | loss 4.243 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.237 epoch 035 | valid on 'valid' subset | loss 4.243 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.237 epoch 035 | valid on 'valid' subset | loss 4.243 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.237 epoch 035 | valid on 'valid' subset | loss 4.243 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.237 epoch 035 | valid on 'valid' subset | loss 4.243 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.237 epoch 035 | valid on 'valid' subset | loss 4.243 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.237 epoch 035 | valid on 'valid' subset | loss 4.243 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.237 epoch 035 | valid on 'valid' subset | loss 4.243 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.237 epoch 035 | valid on 'valid' subset | loss 4.243 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.237 epoch 035 | valid on 'valid' subset | loss 4.243 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.237 epoch 035 | valid on 'valid' subset | loss 4.243 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.237 epoch 035 | valid on 'valid' subset | loss 4.243 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.237 epoch 035 | valid on 'valid' subset | loss 4.243 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.237 epoch 035 | valid on 'valid' subset | loss 4.243 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.237 epoch 035 | valid on 'valid' subset | loss 4.243 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.237 epoch 035 | valid on 'valid' subset | loss 4.243 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.237 epoch 035 | valid on 'valid' subset | loss 4.243 | nll_loss 2.62 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 59000 | best_loss 4.237 end of epoch 35 (average epoch stats below) epoch 035 | loss 3.94 | nll_loss 2.296 | ppl 4.91 | wps 447548 | ups 1.03 | wpb 433554 | bsz 16506.8 | num_updates 59022 | lr 0.000260329 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 21.7 | wall 57941 epoch 035 | loss 3.94 | nll_loss 2.296 | ppl 4.91 | wps 447548 | ups 1.03 | wpb 433554 | bsz 16506.8 | num_updates 59022 | lr 0.000260329 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 21.7 | wall 57941 epoch 035 | loss 3.94 | nll_loss 2.296 | ppl 4.91 | wps 447548 | ups 1.03 | wpb 433554 | bsz 16506.8 | num_updates 59022 | lr 0.000260329 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 21.7 | wall 57941 epoch 035 | loss 3.94 | nll_loss 2.296 | ppl 4.91 | wps 447548 | ups 1.03 | wpb 433554 | bsz 16506.8 | num_updates 59022 | lr 0.000260329 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 21.7 | wall 57941 epoch 035 | loss 3.94 | nll_loss 2.296 | ppl 4.91 | wps 447548 | ups 1.03 | wpb 433554 | bsz 16506.8 | num_updates 59022 | lr 0.000260329 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 21.7 | wall 57941 epoch 035 | loss 3.94 | nll_loss 2.296 | ppl 4.91 | wps 447548 | ups 1.03 | wpb 433554 | bsz 16506.8 | num_updates 59022 | lr 0.000260329 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 21.7 | wall 57941 epoch 035 | loss 3.94 | nll_loss 2.296 | ppl 4.91 | wps 447548 | ups 1.03 | wpb 433554 | bsz 16506.8 | num_updates 59022 | lr 0.000260329 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 21.7 | wall 57941 epoch 035 | loss 3.94 | nll_loss 2.296 | ppl 4.91 | wps 447548 | ups 1.03 | wpb 433554 | bsz 16506.8 | num_updates 59022 | lr 0.000260329 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 21.7 | wall 57941 epoch 035 | loss 3.94 | nll_loss 2.296 | ppl 4.91 | wps 447548 | ups 1.03 | wpb 433554 | bsz 16506.8 | num_updates 59022 | lr 0.000260329 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 21.7 | wall 57941 epoch 035 | loss 3.94 | nll_loss 2.296 | ppl 4.91 | wps 447548 | ups 1.03 | wpb 433554 | bsz 16506.8 | num_updates 59022 | lr 0.000260329 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 21.7 | wall 57941 epoch 035 | loss 3.94 | nll_loss 2.296 | ppl 4.91 | wps 447548 | ups 1.03 | wpb 433554 | bsz 16506.8 | num_updates 59022 | lr 0.000260329 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 21.7 | wall 57941 epoch 035 | loss 3.94 | nll_loss 2.296 | ppl 4.91 | wps 447548 | ups 1.03 | wpb 433554 | bsz 16506.8 | num_updates 59022 | lr 0.000260329 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 21.7 | wall 57941 epoch 035 | loss 3.94 | nll_loss 2.296 | ppl 4.91 | wps 447548 | ups 1.03 | wpb 433554 | bsz 16506.8 | num_updates 59022 | lr 0.000260329 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 21.7 | wall 57941 epoch 035 | loss 3.94 | nll_loss 2.296 | ppl 4.91 | wps 447548 | ups 1.03 | wpb 433554 | bsz 16506.8 | num_updates 59022 | lr 0.000260329 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 21.7 | wall 57941 epoch 035 | loss 3.94 | nll_loss 2.296 | ppl 4.91 | wps 447548 | ups 1.03 | wpb 433554 | bsz 16506.8 | num_updates 59022 | lr 0.000260329 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 21.7 | wall 57941 epoch 035 | loss 3.94 | nll_loss 2.296 | ppl 4.91 | wps 447548 | ups 1.03 | wpb 433554 | bsz 16506.8 | num_updates 59022 | lr 0.000260329 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 21.7 | wall 57941 epoch 035 | loss 3.94 | nll_loss 2.296 | ppl 4.91 | wps 447548 | ups 1.03 | wpb 433554 | bsz 16506.8 | num_updates 59022 | lr 0.000260329 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 21.7 | wall 57941 epoch 035 | loss 3.94 | nll_loss 2.296 | ppl 4.91 | wps 447548 | ups 1.03 | wpb 433554 | bsz 16506.8 | num_updates 59022 | lr 0.000260329 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 21.7 | wall 57941 epoch 035 | loss 3.94 | nll_loss 2.296 | ppl 4.91 | wps 447548 | ups 1.03 | wpb 433554 | bsz 16506.8 | num_updates 59022 | lr 0.000260329 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 21.7 | wall 57941 epoch 035 | loss 3.94 | nll_loss 2.296 | ppl 4.91 | wps 447548 | ups 1.03 | wpb 433554 | bsz 16506.8 | num_updates 59022 | lr 0.000260329 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 21.7 | wall 57941 epoch 035 | loss 3.94 | nll_loss 2.296 | ppl 4.91 | wps 447548 | ups 1.03 | wpb 433554 | bsz 16506.8 | num_updates 59022 | lr 0.000260329 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 21.7 | wall 57941 epoch 035 | loss 3.94 | nll_loss 2.296 | ppl 4.91 | wps 447548 | ups 1.03 | wpb 433554 | bsz 16506.8 | num_updates 59022 | lr 0.000260329 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 21.7 | wall 57941 epoch 035 | loss 3.94 | nll_loss 2.296 | ppl 4.91 | wps 447548 | ups 1.03 | wpb 433554 | bsz 16506.8 | num_updates 59022 | lr 0.000260329 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 21.7 | wall 57941 epoch 035 | loss 3.94 | nll_loss 2.296 | ppl 4.91 | wps 447548 | ups 1.03 | wpb 433554 | bsz 16506.8 | num_updates 59022 | lr 0.000260329 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 21.7 | wall 57941 epoch 035 | loss 3.94 | nll_loss 2.296 | ppl 4.91 | wps 447548 | ups 1.03 | wpb 433554 | bsz 16506.8 | num_updates 59022 | lr 0.000260329 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 21.7 | wall 57941 epoch 035 | loss 3.94 | nll_loss 2.296 | ppl 4.91 | wps 447548 | ups 1.03 | wpb 433554 | bsz 16506.8 | num_updates 59022 | lr 0.000260329 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 21.7 | wall 57941 epoch 035 | loss 3.94 | nll_loss 2.296 | ppl 4.91 | wps 447548 | ups 1.03 | wpb 433554 | bsz 16506.8 | num_updates 59022 | lr 0.000260329 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 21.7 | wall 57941 epoch 035 | loss 3.94 | nll_loss 2.296 | ppl 4.91 | wps 447548 | ups 1.03 | wpb 433554 | bsz 16506.8 | num_updates 59022 | lr 0.000260329 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 21.7 | wall 57941 epoch 035 | loss 3.94 | nll_loss 2.296 | ppl 4.91 | wps 447548 | ups 1.03 | wpb 433554 | bsz 16506.8 | num_updates 59022 | lr 0.000260329 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 21.7 | wall 57941 epoch 035 | loss 3.94 | nll_loss 2.296 | ppl 4.91 | wps 447548 | ups 1.03 | wpb 433554 | bsz 16506.8 | num_updates 59022 | lr 0.000260329 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 21.7 | wall 57941 epoch 035 | loss 3.94 | nll_loss 2.296 | ppl 4.91 | wps 447548 | ups 1.03 | wpb 433554 | bsz 16506.8 | num_updates 59022 | lr 0.000260329 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 21.7 | wall 57941 epoch 035 | loss 3.94 | nll_loss 2.296 | ppl 4.91 | wps 447548 | ups 1.03 | wpb 433554 | bsz 16506.8 | num_updates 59022 | lr 0.000260329 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 21.7 | wall 57941 epoch 035 | loss 3.94 | nll_loss 2.296 | ppl 4.91 | wps 447548 | ups 1.03 | wpb 433554 | bsz 16506.8 | num_updates 59022 | lr 0.000260329 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 21.7 | wall 57941 epoch 035 | loss 3.94 | nll_loss 2.296 | ppl 4.91 | wps 447548 | ups 1.03 | wpb 433554 | bsz 16506.8 | num_updates 59022 | lr 0.000260329 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 21.7 | wall 57941 epoch 035 | loss 3.94 | nll_loss 2.296 | ppl 4.91 | wps 447548 | ups 1.03 | wpb 433554 | bsz 16506.8 | num_updates 59022 | lr 0.000260329 | gnorm 0.217 | clip 0 | loss_scale 0.5 | train_wall 1571 | gb_free 21.7 | wall 57941 Start iterating over samples epoch 036: 78 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=400107, ups=0.93, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=58015 epoch 036: 78 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=400107, ups=0.93, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=58015 epoch 036: 78 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=400107, ups=0.93, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=58015 epoch 036: 78 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=400107, ups=0.93, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=58015 epoch 036: 78 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=400107, ups=0.93, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=58015 epoch 036: 78 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=400107, ups=0.93, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=58015 epoch 036: 78 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=400107, ups=0.93, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=58015 epoch 036: 78 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=400107, ups=0.93, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=58015 epoch 036: 78 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=400107, ups=0.93, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=58015 epoch 036: 78 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=400107, ups=0.93, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=58015 epoch 036: 78 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=400107, ups=0.93, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=58015 epoch 036: 78 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=400107, ups=0.93, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=58015 epoch 036: 78 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=400107, ups=0.93, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=58015 epoch 036: 78 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=400107, ups=0.93, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=58015 epoch 036: 78 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=400107, ups=0.93, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=58015 epoch 036: 78 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=400107, ups=0.93, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=58015 epoch 036: 78 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=400107, ups=0.93, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=58015 epoch 036: 78 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=400107, ups=0.93, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=58015 epoch 036: 78 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=400107, ups=0.93, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=58015 epoch 036: 78 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=400107, ups=0.93, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=58015 epoch 036: 78 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=400107, ups=0.93, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=58015 epoch 036: 78 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=400107, ups=0.93, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=58015 epoch 036: 78 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=400107, ups=0.93, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=58015 epoch 036: 78 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=400107, ups=0.93, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=58015 epoch 036: 78 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=400107, ups=0.93, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=58015 epoch 036: 78 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=400107, ups=0.93, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=58015 epoch 036: 78 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=400107, ups=0.93, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=58015 epoch 036: 78 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=400107, ups=0.93, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=58015 epoch 036: 78 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=400107, ups=0.93, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=58015 epoch 036: 78 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=400107, ups=0.93, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=58015 epoch 036: 78 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=400107, ups=0.93, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=58015 epoch 036: 78 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=400107, ups=0.93, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=58015 epoch 036: 78 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=400107, ups=0.93, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=58015 epoch 036: 78 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=400107, ups=0.93, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=58015 epoch 036: 78 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=400107, ups=0.93, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=58015 epoch 036: 78 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=400107, ups=0.93, wpb=431503, bsz=16665, num_updates=59100, lr=0.000260157, gnorm=0.215, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.1, wall=58015 epoch 036: 178 / 1689 loss=3.915, nll_loss=2.267, ppl=4.81, wps=463868, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=58109 epoch 036: 178 / 1689 loss=3.915, nll_loss=2.267, ppl=4.81, wps=463868, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=58109 epoch 036: 178 / 1689 loss=3.915, nll_loss=2.267, ppl=4.81, wps=463868, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=58109 epoch 036: 178 / 1689 loss=3.915, nll_loss=2.267, ppl=4.81, wps=463868, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=58109 epoch 036: 178 / 1689 loss=3.915, nll_loss=2.267, ppl=4.81, wps=463868, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=58109 epoch 036: 178 / 1689 loss=3.915, nll_loss=2.267, ppl=4.81, wps=463868, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=58109 epoch 036: 178 / 1689 loss=3.915, nll_loss=2.267, ppl=4.81, wps=463868, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=58109 epoch 036: 178 / 1689 loss=3.915, nll_loss=2.267, ppl=4.81, wps=463868, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=58109 epoch 036: 178 / 1689 loss=3.915, nll_loss=2.267, ppl=4.81, wps=463868, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=58109 epoch 036: 178 / 1689 loss=3.915, nll_loss=2.267, ppl=4.81, wps=463868, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=58109 epoch 036: 178 / 1689 loss=3.915, nll_loss=2.267, ppl=4.81, wps=463868, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=58109 epoch 036: 178 / 1689 loss=3.915, nll_loss=2.267, ppl=4.81, wps=463868, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=58109 epoch 036: 178 / 1689 loss=3.915, nll_loss=2.267, ppl=4.81, wps=463868, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=58109 epoch 036: 178 / 1689 loss=3.915, nll_loss=2.267, ppl=4.81, wps=463868, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=58109 epoch 036: 178 / 1689 loss=3.915, nll_loss=2.267, ppl=4.81, wps=463868, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=58109 epoch 036: 178 / 1689 loss=3.915, nll_loss=2.267, ppl=4.81, wps=463868, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=58109 epoch 036: 178 / 1689 loss=3.915, nll_loss=2.267, ppl=4.81, wps=463868, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=58109 epoch 036: 178 / 1689 loss=3.915, nll_loss=2.267, ppl=4.81, wps=463868, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=58109 epoch 036: 178 / 1689 loss=3.915, nll_loss=2.267, ppl=4.81, wps=463868, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=58109 epoch 036: 178 / 1689 loss=3.915, nll_loss=2.267, ppl=4.81, wps=463868, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=58109 epoch 036: 178 / 1689 loss=3.915, nll_loss=2.267, ppl=4.81, wps=463868, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=58109 epoch 036: 178 / 1689 loss=3.915, nll_loss=2.267, ppl=4.81, wps=463868, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=58109 epoch 036: 178 / 1689 loss=3.915, nll_loss=2.267, ppl=4.81, wps=463868, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=58109 epoch 036: 178 / 1689 loss=3.915, nll_loss=2.267, ppl=4.81, wps=463868, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=58109 epoch 036: 178 / 1689 loss=3.915, nll_loss=2.267, ppl=4.81, wps=463868, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=58109 epoch 036: 178 / 1689 loss=3.915, nll_loss=2.267, ppl=4.81, wps=463868, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=58109 epoch 036: 178 / 1689 loss=3.915, nll_loss=2.267, ppl=4.81, wps=463868, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=58109 epoch 036: 178 / 1689 loss=3.915, nll_loss=2.267, ppl=4.81, wps=463868, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=58109 epoch 036: 178 / 1689 loss=3.915, nll_loss=2.267, ppl=4.81, wps=463868, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=58109 epoch 036: 178 / 1689 loss=3.915, nll_loss=2.267, ppl=4.81, wps=463868, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=58109 epoch 036: 178 / 1689 loss=3.915, nll_loss=2.267, ppl=4.81, wps=463868, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=58109 epoch 036: 178 / 1689 loss=3.915, nll_loss=2.267, ppl=4.81, wps=463868, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=58109 epoch 036: 178 / 1689 loss=3.915, nll_loss=2.267, ppl=4.81, wps=463868, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=58109 epoch 036: 178 / 1689 loss=3.915, nll_loss=2.267, ppl=4.81, wps=463868, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=58109 epoch 036: 178 / 1689 loss=3.915, nll_loss=2.267, ppl=4.81, wps=463868, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=58109 epoch 036: 178 / 1689 loss=3.915, nll_loss=2.267, ppl=4.81, wps=463868, ups=1.07, wpb=434140, bsz=16561.8, num_updates=59200, lr=0.000259938, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.3, wall=58109 epoch 036: 278 / 1689 loss=3.92, nll_loss=2.272, ppl=4.83, wps=461690, ups=1.07, wpb=432429, bsz=16281.8, num_updates=59300, lr=0.000259718, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=58203 epoch 036: 278 / 1689 loss=3.92, nll_loss=2.272, ppl=4.83, wps=461690, ups=1.07, wpb=432429, bsz=16281.8, num_updates=59300, lr=0.000259718, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=58203 epoch 036: 278 / 1689 loss=3.92, nll_loss=2.272, ppl=4.83, wps=461690, ups=1.07, wpb=432429, bsz=16281.8, num_updates=59300, lr=0.000259718, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=58203 epoch 036: 278 / 1689 loss=3.92, nll_loss=2.272, ppl=4.83, wps=461690, ups=1.07, wpb=432429, bsz=16281.8, num_updates=59300, lr=0.000259718, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=58203 epoch 036: 278 / 1689 loss=3.92, nll_loss=2.272, ppl=4.83, wps=461690, ups=1.07, wpb=432429, bsz=16281.8, num_updates=59300, lr=0.000259718, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=58203 epoch 036: 278 / 1689 loss=3.92, nll_loss=2.272, ppl=4.83, wps=461690, ups=1.07, wpb=432429, bsz=16281.8, num_updates=59300, lr=0.000259718, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=58203 epoch 036: 278 / 1689 loss=3.92, nll_loss=2.272, ppl=4.83, wps=461690, ups=1.07, wpb=432429, bsz=16281.8, num_updates=59300, lr=0.000259718, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=58203 epoch 036: 278 / 1689 loss=3.92, nll_loss=2.272, ppl=4.83, wps=461690, ups=1.07, wpb=432429, bsz=16281.8, num_updates=59300, lr=0.000259718, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=58203 epoch 036: 278 / 1689 loss=3.92, nll_loss=2.272, ppl=4.83, wps=461690, ups=1.07, wpb=432429, bsz=16281.8, num_updates=59300, lr=0.000259718, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=58203 epoch 036: 278 / 1689 loss=3.92, nll_loss=2.272, ppl=4.83, wps=461690, ups=1.07, wpb=432429, bsz=16281.8, num_updates=59300, lr=0.000259718, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=58203 epoch 036: 278 / 1689 loss=3.92, nll_loss=2.272, ppl=4.83, wps=461690, ups=1.07, wpb=432429, bsz=16281.8, num_updates=59300, lr=0.000259718, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=58203 epoch 036: 278 / 1689 loss=3.92, nll_loss=2.272, ppl=4.83, wps=461690, ups=1.07, wpb=432429, bsz=16281.8, num_updates=59300, lr=0.000259718, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=58203 epoch 036: 278 / 1689 loss=3.92, nll_loss=2.272, ppl=4.83, wps=461690, ups=1.07, wpb=432429, bsz=16281.8, num_updates=59300, lr=0.000259718, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=58203 epoch 036: 278 / 1689 loss=3.92, nll_loss=2.272, ppl=4.83, wps=461690, ups=1.07, wpb=432429, bsz=16281.8, num_updates=59300, lr=0.000259718, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=58203 epoch 036: 278 / 1689 loss=3.92, nll_loss=2.272, ppl=4.83, wps=461690, ups=1.07, wpb=432429, bsz=16281.8, num_updates=59300, lr=0.000259718, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=58203 epoch 036: 278 / 1689 loss=3.92, nll_loss=2.272, ppl=4.83, wps=461690, ups=1.07, wpb=432429, bsz=16281.8, num_updates=59300, lr=0.000259718, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=58203 epoch 036: 278 / 1689 loss=3.92, nll_loss=2.272, ppl=4.83, wps=461690, ups=1.07, wpb=432429, bsz=16281.8, num_updates=59300, lr=0.000259718, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=58203 epoch 036: 278 / 1689 loss=3.92, nll_loss=2.272, ppl=4.83, wps=461690, ups=1.07, wpb=432429, bsz=16281.8, num_updates=59300, lr=0.000259718, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=58203 epoch 036: 278 / 1689 loss=3.92, nll_loss=2.272, ppl=4.83, wps=461690, ups=1.07, wpb=432429, bsz=16281.8, num_updates=59300, lr=0.000259718, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=58203 epoch 036: 278 / 1689 loss=3.92, nll_loss=2.272, ppl=4.83, wps=461690, ups=1.07, wpb=432429, bsz=16281.8, num_updates=59300, lr=0.000259718, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=58203 epoch 036: 278 / 1689 loss=3.92, nll_loss=2.272, ppl=4.83, wps=461690, ups=1.07, wpb=432429, bsz=16281.8, num_updates=59300, lr=0.000259718, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=58203 epoch 036: 278 / 1689 loss=3.92, nll_loss=2.272, ppl=4.83, wps=461690, ups=1.07, wpb=432429, bsz=16281.8, num_updates=59300, lr=0.000259718, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=58203 epoch 036: 278 / 1689 loss=3.92, nll_loss=2.272, ppl=4.83, wps=461690, ups=1.07, wpb=432429, bsz=16281.8, num_updates=59300, lr=0.000259718, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=58203 epoch 036: 278 / 1689 loss=3.92, nll_loss=2.272, ppl=4.83, wps=461690, ups=1.07, wpb=432429, bsz=16281.8, num_updates=59300, lr=0.000259718, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=58203 epoch 036: 278 / 1689 loss=3.92, nll_loss=2.272, ppl=4.83, wps=461690, ups=1.07, wpb=432429, bsz=16281.8, num_updates=59300, lr=0.000259718, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=58203 epoch 036: 278 / 1689 loss=3.92, nll_loss=2.272, ppl=4.83, wps=461690, ups=1.07, wpb=432429, bsz=16281.8, num_updates=59300, lr=0.000259718, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=58203 epoch 036: 278 / 1689 loss=3.92, nll_loss=2.272, ppl=4.83, wps=461690, ups=1.07, wpb=432429, bsz=16281.8, num_updates=59300, lr=0.000259718, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=58203 epoch 036: 278 / 1689 loss=3.92, nll_loss=2.272, ppl=4.83, wps=461690, ups=1.07, wpb=432429, bsz=16281.8, num_updates=59300, lr=0.000259718, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=58203 epoch 036: 278 / 1689 loss=3.92, nll_loss=2.272, ppl=4.83, wps=461690, ups=1.07, wpb=432429, bsz=16281.8, num_updates=59300, lr=0.000259718, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=58203 epoch 036: 278 / 1689 loss=3.92, nll_loss=2.272, ppl=4.83, wps=461690, ups=1.07, wpb=432429, bsz=16281.8, num_updates=59300, lr=0.000259718, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=58203 epoch 036: 278 / 1689 loss=3.92, nll_loss=2.272, ppl=4.83, wps=461690, ups=1.07, wpb=432429, bsz=16281.8, num_updates=59300, lr=0.000259718, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=58203 epoch 036: 278 / 1689 loss=3.92, nll_loss=2.272, ppl=4.83, wps=461690, ups=1.07, wpb=432429, bsz=16281.8, num_updates=59300, lr=0.000259718, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=58203 epoch 036: 278 / 1689 loss=3.92, nll_loss=2.272, ppl=4.83, wps=461690, ups=1.07, wpb=432429, bsz=16281.8, num_updates=59300, lr=0.000259718, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=58203 epoch 036: 278 / 1689 loss=3.92, nll_loss=2.272, ppl=4.83, wps=461690, ups=1.07, wpb=432429, bsz=16281.8, num_updates=59300, lr=0.000259718, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=58203 epoch 036: 278 / 1689 loss=3.92, nll_loss=2.272, ppl=4.83, wps=461690, ups=1.07, wpb=432429, bsz=16281.8, num_updates=59300, lr=0.000259718, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=58203 epoch 036: 278 / 1689 loss=3.92, nll_loss=2.272, ppl=4.83, wps=461690, ups=1.07, wpb=432429, bsz=16281.8, num_updates=59300, lr=0.000259718, gnorm=0.204, clip=0, loss_scale=1, train_wall=93, gb_free=18.8, wall=58203 epoch 036: 378 / 1689 loss=3.922, nll_loss=2.275, ppl=4.84, wps=457930, ups=1.06, wpb=433292, bsz=16626.4, num_updates=59400, lr=0.0002595, gnorm=0.216, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=58297 epoch 036: 378 / 1689 loss=3.922, nll_loss=2.275, ppl=4.84, wps=457930, ups=1.06, wpb=433292, bsz=16626.4, num_updates=59400, lr=0.0002595, gnorm=0.216, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=58297 epoch 036: 378 / 1689 loss=3.922, nll_loss=2.275, ppl=4.84, wps=457930, ups=1.06, wpb=433292, bsz=16626.4, num_updates=59400, lr=0.0002595, gnorm=0.216, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=58297 epoch 036: 378 / 1689 loss=3.922, nll_loss=2.275, ppl=4.84, wps=457930, ups=1.06, wpb=433292, bsz=16626.4, num_updates=59400, lr=0.0002595, gnorm=0.216, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=58297 epoch 036: 378 / 1689 loss=3.922, nll_loss=2.275, ppl=4.84, wps=457930, ups=1.06, wpb=433292, bsz=16626.4, num_updates=59400, lr=0.0002595, gnorm=0.216, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=58297 epoch 036: 378 / 1689 loss=3.922, nll_loss=2.275, ppl=4.84, wps=457930, ups=1.06, wpb=433292, bsz=16626.4, num_updates=59400, lr=0.0002595, gnorm=0.216, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=58297 epoch 036: 378 / 1689 loss=3.922, nll_loss=2.275, ppl=4.84, wps=457930, ups=1.06, wpb=433292, bsz=16626.4, num_updates=59400, lr=0.0002595, gnorm=0.216, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=58297 epoch 036: 378 / 1689 loss=3.922, nll_loss=2.275, ppl=4.84, wps=457930, ups=1.06, wpb=433292, bsz=16626.4, num_updates=59400, lr=0.0002595, gnorm=0.216, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=58297 epoch 036: 378 / 1689 loss=3.922, nll_loss=2.275, ppl=4.84, wps=457930, ups=1.06, wpb=433292, bsz=16626.4, num_updates=59400, lr=0.0002595, gnorm=0.216, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=58297 epoch 036: 378 / 1689 loss=3.922, nll_loss=2.275, ppl=4.84, wps=457930, ups=1.06, wpb=433292, bsz=16626.4, num_updates=59400, lr=0.0002595, gnorm=0.216, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=58297 epoch 036: 378 / 1689 loss=3.922, nll_loss=2.275, ppl=4.84, wps=457930, ups=1.06, wpb=433292, bsz=16626.4, num_updates=59400, lr=0.0002595, gnorm=0.216, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=58297 epoch 036: 378 / 1689 loss=3.922, nll_loss=2.275, ppl=4.84, wps=457930, ups=1.06, wpb=433292, bsz=16626.4, num_updates=59400, lr=0.0002595, gnorm=0.216, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=58297 epoch 036: 378 / 1689 loss=3.922, nll_loss=2.275, ppl=4.84, wps=457930, ups=1.06, wpb=433292, bsz=16626.4, num_updates=59400, lr=0.0002595, gnorm=0.216, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=58297 epoch 036: 378 / 1689 loss=3.922, nll_loss=2.275, ppl=4.84, wps=457930, ups=1.06, wpb=433292, bsz=16626.4, num_updates=59400, lr=0.0002595, gnorm=0.216, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=58297 epoch 036: 378 / 1689 loss=3.922, nll_loss=2.275, ppl=4.84, wps=457930, ups=1.06, wpb=433292, bsz=16626.4, num_updates=59400, lr=0.0002595, gnorm=0.216, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=58297 epoch 036: 378 / 1689 loss=3.922, nll_loss=2.275, ppl=4.84, wps=457930, ups=1.06, wpb=433292, bsz=16626.4, num_updates=59400, lr=0.0002595, gnorm=0.216, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=58297 epoch 036: 378 / 1689 loss=3.922, nll_loss=2.275, ppl=4.84, wps=457930, ups=1.06, wpb=433292, bsz=16626.4, num_updates=59400, lr=0.0002595, gnorm=0.216, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=58297 epoch 036: 378 / 1689 loss=3.922, nll_loss=2.275, ppl=4.84, wps=457930, ups=1.06, wpb=433292, bsz=16626.4, num_updates=59400, lr=0.0002595, gnorm=0.216, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=58297 epoch 036: 378 / 1689 loss=3.922, nll_loss=2.275, ppl=4.84, wps=457930, ups=1.06, wpb=433292, bsz=16626.4, num_updates=59400, lr=0.0002595, gnorm=0.216, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=58297 epoch 036: 378 / 1689 loss=3.922, nll_loss=2.275, ppl=4.84, wps=457930, ups=1.06, wpb=433292, bsz=16626.4, num_updates=59400, lr=0.0002595, gnorm=0.216, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=58297 epoch 036: 378 / 1689 loss=3.922, nll_loss=2.275, ppl=4.84, wps=457930, ups=1.06, wpb=433292, bsz=16626.4, num_updates=59400, lr=0.0002595, gnorm=0.216, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=58297 epoch 036: 378 / 1689 loss=3.922, nll_loss=2.275, ppl=4.84, wps=457930, ups=1.06, wpb=433292, bsz=16626.4, num_updates=59400, lr=0.0002595, gnorm=0.216, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=58297 epoch 036: 378 / 1689 loss=3.922, nll_loss=2.275, ppl=4.84, wps=457930, ups=1.06, wpb=433292, bsz=16626.4, num_updates=59400, lr=0.0002595, gnorm=0.216, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=58297 epoch 036: 378 / 1689 loss=3.922, nll_loss=2.275, ppl=4.84, wps=457930, ups=1.06, wpb=433292, bsz=16626.4, num_updates=59400, lr=0.0002595, gnorm=0.216, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=58297 epoch 036: 378 / 1689 loss=3.922, nll_loss=2.275, ppl=4.84, wps=457930, ups=1.06, wpb=433292, bsz=16626.4, num_updates=59400, lr=0.0002595, gnorm=0.216, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=58297 epoch 036: 378 / 1689 loss=3.922, nll_loss=2.275, ppl=4.84, wps=457930, ups=1.06, wpb=433292, bsz=16626.4, num_updates=59400, lr=0.0002595, gnorm=0.216, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=58297 epoch 036: 378 / 1689 loss=3.922, nll_loss=2.275, ppl=4.84, wps=457930, ups=1.06, wpb=433292, bsz=16626.4, num_updates=59400, lr=0.0002595, gnorm=0.216, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=58297 epoch 036: 378 / 1689 loss=3.922, nll_loss=2.275, ppl=4.84, wps=457930, ups=1.06, wpb=433292, bsz=16626.4, num_updates=59400, lr=0.0002595, gnorm=0.216, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=58297 epoch 036: 378 / 1689 loss=3.922, nll_loss=2.275, ppl=4.84, wps=457930, ups=1.06, wpb=433292, bsz=16626.4, num_updates=59400, lr=0.0002595, gnorm=0.216, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=58297 epoch 036: 378 / 1689 loss=3.922, nll_loss=2.275, ppl=4.84, wps=457930, ups=1.06, wpb=433292, bsz=16626.4, num_updates=59400, lr=0.0002595, gnorm=0.216, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=58297 epoch 036: 378 / 1689 loss=3.922, nll_loss=2.275, ppl=4.84, wps=457930, ups=1.06, wpb=433292, bsz=16626.4, num_updates=59400, lr=0.0002595, gnorm=0.216, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=58297 epoch 036: 378 / 1689 loss=3.922, nll_loss=2.275, ppl=4.84, wps=457930, ups=1.06, wpb=433292, bsz=16626.4, num_updates=59400, lr=0.0002595, gnorm=0.216, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=58297 epoch 036: 378 / 1689 loss=3.922, nll_loss=2.275, ppl=4.84, wps=457930, ups=1.06, wpb=433292, bsz=16626.4, num_updates=59400, lr=0.0002595, gnorm=0.216, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=58297 epoch 036: 378 / 1689 loss=3.922, nll_loss=2.275, ppl=4.84, wps=457930, ups=1.06, wpb=433292, bsz=16626.4, num_updates=59400, lr=0.0002595, gnorm=0.216, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=58297 epoch 036: 378 / 1689 loss=3.922, nll_loss=2.275, ppl=4.84, wps=457930, ups=1.06, wpb=433292, bsz=16626.4, num_updates=59400, lr=0.0002595, gnorm=0.216, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=58297 epoch 036: 378 / 1689 loss=3.922, nll_loss=2.275, ppl=4.84, wps=457930, ups=1.06, wpb=433292, bsz=16626.4, num_updates=59400, lr=0.0002595, gnorm=0.216, clip=0, loss_scale=1, train_wall=94, gb_free=19, wall=58297 epoch 036: 479 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=456925, ups=1.06, wpb=432156, bsz=16595.6, num_updates=59500, lr=0.000259281, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.4, wall=58392 epoch 036: 479 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=456925, ups=1.06, wpb=432156, bsz=16595.6, num_updates=59500, lr=0.000259281, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.4, wall=58392 epoch 036: 479 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=456925, ups=1.06, wpb=432156, bsz=16595.6, num_updates=59500, lr=0.000259281, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.4, wall=58392 epoch 036: 479 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=456925, ups=1.06, wpb=432156, bsz=16595.6, num_updates=59500, lr=0.000259281, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.4, wall=58392 epoch 036: 479 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=456925, ups=1.06, wpb=432156, bsz=16595.6, num_updates=59500, lr=0.000259281, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.4, wall=58392 epoch 036: 479 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=456925, ups=1.06, wpb=432156, bsz=16595.6, num_updates=59500, lr=0.000259281, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.4, wall=58392 epoch 036: 479 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=456925, ups=1.06, wpb=432156, bsz=16595.6, num_updates=59500, lr=0.000259281, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.4, wall=58392 epoch 036: 479 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=456925, ups=1.06, wpb=432156, bsz=16595.6, num_updates=59500, lr=0.000259281, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.4, wall=58392 epoch 036: 479 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=456925, ups=1.06, wpb=432156, bsz=16595.6, num_updates=59500, lr=0.000259281, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.4, wall=58392 epoch 036: 479 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=456925, ups=1.06, wpb=432156, bsz=16595.6, num_updates=59500, lr=0.000259281, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.4, wall=58392 epoch 036: 479 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=456925, ups=1.06, wpb=432156, bsz=16595.6, num_updates=59500, lr=0.000259281, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.4, wall=58392 epoch 036: 479 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=456925, ups=1.06, wpb=432156, bsz=16595.6, num_updates=59500, lr=0.000259281, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.4, wall=58392 epoch 036: 479 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=456925, ups=1.06, wpb=432156, bsz=16595.6, num_updates=59500, lr=0.000259281, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.4, wall=58392 epoch 036: 479 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=456925, ups=1.06, wpb=432156, bsz=16595.6, num_updates=59500, lr=0.000259281, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.4, wall=58392 epoch 036: 479 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=456925, ups=1.06, wpb=432156, bsz=16595.6, num_updates=59500, lr=0.000259281, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.4, wall=58392 epoch 036: 479 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=456925, ups=1.06, wpb=432156, bsz=16595.6, num_updates=59500, lr=0.000259281, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.4, wall=58392 epoch 036: 479 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=456925, ups=1.06, wpb=432156, bsz=16595.6, num_updates=59500, lr=0.000259281, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.4, wall=58392 epoch 036: 479 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=456925, ups=1.06, wpb=432156, bsz=16595.6, num_updates=59500, lr=0.000259281, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.4, wall=58392 epoch 036: 479 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=456925, ups=1.06, wpb=432156, bsz=16595.6, num_updates=59500, lr=0.000259281, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.4, wall=58392 epoch 036: 479 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=456925, ups=1.06, wpb=432156, bsz=16595.6, num_updates=59500, lr=0.000259281, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.4, wall=58392 epoch 036: 479 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=456925, ups=1.06, wpb=432156, bsz=16595.6, num_updates=59500, lr=0.000259281, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.4, wall=58392 epoch 036: 479 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=456925, ups=1.06, wpb=432156, bsz=16595.6, num_updates=59500, lr=0.000259281, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.4, wall=58392 epoch 036: 479 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=456925, ups=1.06, wpb=432156, bsz=16595.6, num_updates=59500, lr=0.000259281, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.4, wall=58392 epoch 036: 479 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=456925, ups=1.06, wpb=432156, bsz=16595.6, num_updates=59500, lr=0.000259281, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.4, wall=58392 epoch 036: 479 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=456925, ups=1.06, wpb=432156, bsz=16595.6, num_updates=59500, lr=0.000259281, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.4, wall=58392 epoch 036: 479 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=456925, ups=1.06, wpb=432156, bsz=16595.6, num_updates=59500, lr=0.000259281, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.4, wall=58392 epoch 036: 479 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=456925, ups=1.06, wpb=432156, bsz=16595.6, num_updates=59500, lr=0.000259281, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.4, wall=58392 epoch 036: 479 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=456925, ups=1.06, wpb=432156, bsz=16595.6, num_updates=59500, lr=0.000259281, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.4, wall=58392 epoch 036: 479 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=456925, ups=1.06, wpb=432156, bsz=16595.6, num_updates=59500, lr=0.000259281, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.4, wall=58392 epoch 036: 479 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=456925, ups=1.06, wpb=432156, bsz=16595.6, num_updates=59500, lr=0.000259281, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.4, wall=58392 epoch 036: 479 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=456925, ups=1.06, wpb=432156, bsz=16595.6, num_updates=59500, lr=0.000259281, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.4, wall=58392 epoch 036: 479 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=456925, ups=1.06, wpb=432156, bsz=16595.6, num_updates=59500, lr=0.000259281, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.4, wall=58392 epoch 036: 479 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=456925, ups=1.06, wpb=432156, bsz=16595.6, num_updates=59500, lr=0.000259281, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.4, wall=58392 epoch 036: 479 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=456925, ups=1.06, wpb=432156, bsz=16595.6, num_updates=59500, lr=0.000259281, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.4, wall=58392 epoch 036: 479 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=456925, ups=1.06, wpb=432156, bsz=16595.6, num_updates=59500, lr=0.000259281, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.4, wall=58392 epoch 036: 479 / 1689 loss=3.926, nll_loss=2.28, ppl=4.86, wps=456925, ups=1.06, wpb=432156, bsz=16595.6, num_updates=59500, lr=0.000259281, gnorm=0.223, clip=0, loss_scale=0.5, train_wall=94, gb_free=19.4, wall=58392 epoch 036: 579 / 1689 loss=3.925, nll_loss=2.278, ppl=4.85, wps=458999, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=58486 epoch 036: 579 / 1689 loss=3.925, nll_loss=2.278, ppl=4.85, wps=458999, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=58486 epoch 036: 579 / 1689 loss=3.925, nll_loss=2.278, ppl=4.85, wps=458999, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=58486 epoch 036: 579 / 1689 loss=3.925, nll_loss=2.278, ppl=4.85, wps=458999, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=58486 epoch 036: 579 / 1689 loss=3.925, nll_loss=2.278, ppl=4.85, wps=458999, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=58486 epoch 036: 579 / 1689 loss=3.925, nll_loss=2.278, ppl=4.85, wps=458999, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=58486 epoch 036: 579 / 1689 loss=3.925, nll_loss=2.278, ppl=4.85, wps=458999, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=58486 epoch 036: 579 / 1689 loss=3.925, nll_loss=2.278, ppl=4.85, wps=458999, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=58486 epoch 036: 579 / 1689 loss=3.925, nll_loss=2.278, ppl=4.85, wps=458999, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=58486 epoch 036: 579 / 1689 loss=3.925, nll_loss=2.278, ppl=4.85, wps=458999, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=58486 epoch 036: 579 / 1689 loss=3.925, nll_loss=2.278, ppl=4.85, wps=458999, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=58486 epoch 036: 579 / 1689 loss=3.925, nll_loss=2.278, ppl=4.85, wps=458999, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=58486 epoch 036: 579 / 1689 loss=3.925, nll_loss=2.278, ppl=4.85, wps=458999, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=58486 epoch 036: 579 / 1689 loss=3.925, nll_loss=2.278, ppl=4.85, wps=458999, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=58486 epoch 036: 579 / 1689 loss=3.925, nll_loss=2.278, ppl=4.85, wps=458999, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=58486 epoch 036: 579 / 1689 loss=3.925, nll_loss=2.278, ppl=4.85, wps=458999, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=58486 epoch 036: 579 / 1689 loss=3.925, nll_loss=2.278, ppl=4.85, wps=458999, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=58486 epoch 036: 579 / 1689 loss=3.925, nll_loss=2.278, ppl=4.85, wps=458999, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=58486 epoch 036: 579 / 1689 loss=3.925, nll_loss=2.278, ppl=4.85, wps=458999, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=58486 epoch 036: 579 / 1689 loss=3.925, nll_loss=2.278, ppl=4.85, wps=458999, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=58486 epoch 036: 579 / 1689 loss=3.925, nll_loss=2.278, ppl=4.85, wps=458999, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=58486 epoch 036: 579 / 1689 loss=3.925, nll_loss=2.278, ppl=4.85, wps=458999, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=58486 epoch 036: 579 / 1689 loss=3.925, nll_loss=2.278, ppl=4.85, wps=458999, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=58486 epoch 036: 579 / 1689 loss=3.925, nll_loss=2.278, ppl=4.85, wps=458999, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=58486 epoch 036: 579 / 1689 loss=3.925, nll_loss=2.278, ppl=4.85, wps=458999, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=58486 epoch 036: 579 / 1689 loss=3.925, nll_loss=2.278, ppl=4.85, wps=458999, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=58486 epoch 036: 579 / 1689 loss=3.925, nll_loss=2.278, ppl=4.85, wps=458999, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=58486 epoch 036: 579 / 1689 loss=3.925, nll_loss=2.278, ppl=4.85, wps=458999, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=58486 epoch 036: 579 / 1689 loss=3.925, nll_loss=2.278, ppl=4.85, wps=458999, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=58486 epoch 036: 579 / 1689 loss=3.925, nll_loss=2.278, ppl=4.85, wps=458999, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=58486 epoch 036: 579 / 1689 loss=3.925, nll_loss=2.278, ppl=4.85, wps=458999, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=58486 epoch 036: 579 / 1689 loss=3.925, nll_loss=2.278, ppl=4.85, wps=458999, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=58486 epoch 036: 579 / 1689 loss=3.925, nll_loss=2.278, ppl=4.85, wps=458999, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=58486 epoch 036: 579 / 1689 loss=3.925, nll_loss=2.278, ppl=4.85, wps=458999, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=58486 epoch 036: 579 / 1689 loss=3.925, nll_loss=2.278, ppl=4.85, wps=458999, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=58486 epoch 036: 579 / 1689 loss=3.925, nll_loss=2.278, ppl=4.85, wps=458999, ups=1.06, wpb=431311, bsz=16504.8, num_updates=59600, lr=0.000259064, gnorm=0.222, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.9, wall=58486 epoch 036: 679 / 1689 loss=3.932, nll_loss=2.286, ppl=4.88, wps=462070, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=58579 epoch 036: 679 / 1689 loss=3.932, nll_loss=2.286, ppl=4.88, wps=462070, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=58579 epoch 036: 679 / 1689 loss=3.932, nll_loss=2.286, ppl=4.88, wps=462070, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=58579 epoch 036: 679 / 1689 loss=3.932, nll_loss=2.286, ppl=4.88, wps=462070, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=58579 epoch 036: 679 / 1689 loss=3.932, nll_loss=2.286, ppl=4.88, wps=462070, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=58579 epoch 036: 679 / 1689 loss=3.932, nll_loss=2.286, ppl=4.88, wps=462070, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=58579 epoch 036: 679 / 1689 loss=3.932, nll_loss=2.286, ppl=4.88, wps=462070, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=58579 epoch 036: 679 / 1689 loss=3.932, nll_loss=2.286, ppl=4.88, wps=462070, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=58579 epoch 036: 679 / 1689 loss=3.932, nll_loss=2.286, ppl=4.88, wps=462070, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=58579 epoch 036: 679 / 1689 loss=3.932, nll_loss=2.286, ppl=4.88, wps=462070, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=58579 epoch 036: 679 / 1689 loss=3.932, nll_loss=2.286, ppl=4.88, wps=462070, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=58579 epoch 036: 679 / 1689 loss=3.932, nll_loss=2.286, ppl=4.88, wps=462070, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=58579 epoch 036: 679 / 1689 loss=3.932, nll_loss=2.286, ppl=4.88, wps=462070, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=58579 epoch 036: 679 / 1689 loss=3.932, nll_loss=2.286, ppl=4.88, wps=462070, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=58579 epoch 036: 679 / 1689 loss=3.932, nll_loss=2.286, ppl=4.88, wps=462070, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=58579 epoch 036: 679 / 1689 loss=3.932, nll_loss=2.286, ppl=4.88, wps=462070, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=58579 epoch 036: 679 / 1689 loss=3.932, nll_loss=2.286, ppl=4.88, wps=462070, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=58579 epoch 036: 679 / 1689 loss=3.932, nll_loss=2.286, ppl=4.88, wps=462070, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=58579 epoch 036: 679 / 1689 loss=3.932, nll_loss=2.286, ppl=4.88, wps=462070, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=58579 epoch 036: 679 / 1689 loss=3.932, nll_loss=2.286, ppl=4.88, wps=462070, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=58579 epoch 036: 679 / 1689 loss=3.932, nll_loss=2.286, ppl=4.88, wps=462070, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=58579 epoch 036: 679 / 1689 loss=3.932, nll_loss=2.286, ppl=4.88, wps=462070, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=58579 epoch 036: 679 / 1689 loss=3.932, nll_loss=2.286, ppl=4.88, wps=462070, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=58579 epoch 036: 679 / 1689 loss=3.932, nll_loss=2.286, ppl=4.88, wps=462070, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=58579 epoch 036: 679 / 1689 loss=3.932, nll_loss=2.286, ppl=4.88, wps=462070, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=58579 epoch 036: 679 / 1689 loss=3.932, nll_loss=2.286, ppl=4.88, wps=462070, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=58579 epoch 036: 679 / 1689 loss=3.932, nll_loss=2.286, ppl=4.88, wps=462070, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=58579 epoch 036: 679 / 1689 loss=3.932, nll_loss=2.286, ppl=4.88, wps=462070, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=58579 epoch 036: 679 / 1689 loss=3.932, nll_loss=2.286, ppl=4.88, wps=462070, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=58579 epoch 036: 679 / 1689 loss=3.932, nll_loss=2.286, ppl=4.88, wps=462070, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=58579 epoch 036: 679 / 1689 loss=3.932, nll_loss=2.286, ppl=4.88, wps=462070, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=58579 epoch 036: 679 / 1689 loss=3.932, nll_loss=2.286, ppl=4.88, wps=462070, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=58579 epoch 036: 679 / 1689 loss=3.932, nll_loss=2.286, ppl=4.88, wps=462070, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=58579 epoch 036: 679 / 1689 loss=3.932, nll_loss=2.286, ppl=4.88, wps=462070, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=58579 epoch 036: 679 / 1689 loss=3.932, nll_loss=2.286, ppl=4.88, wps=462070, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=58579 epoch 036: 679 / 1689 loss=3.932, nll_loss=2.286, ppl=4.88, wps=462070, ups=1.07, wpb=431892, bsz=16184.9, num_updates=59700, lr=0.000258847, gnorm=0.213, clip=0, loss_scale=0.5, train_wall=92, gb_free=19.8, wall=58579 epoch 036: 779 / 1689 loss=3.937, nll_loss=2.292, ppl=4.9, wps=460042, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=58674 epoch 036: 779 / 1689 loss=3.937, nll_loss=2.292, ppl=4.9, wps=460042, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=58674 epoch 036: 779 / 1689 loss=3.937, nll_loss=2.292, ppl=4.9, wps=460042, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=58674 epoch 036: 779 / 1689 loss=3.937, nll_loss=2.292, ppl=4.9, wps=460042, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=58674 epoch 036: 779 / 1689 loss=3.937, nll_loss=2.292, ppl=4.9, wps=460042, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=58674 epoch 036: 779 / 1689 loss=3.937, nll_loss=2.292, ppl=4.9, wps=460042, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=58674 epoch 036: 779 / 1689 loss=3.937, nll_loss=2.292, ppl=4.9, wps=460042, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=58674 epoch 036: 779 / 1689 loss=3.937, nll_loss=2.292, ppl=4.9, wps=460042, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=58674 epoch 036: 779 / 1689 loss=3.937, nll_loss=2.292, ppl=4.9, wps=460042, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=58674 epoch 036: 779 / 1689 loss=3.937, nll_loss=2.292, ppl=4.9, wps=460042, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=58674 epoch 036: 779 / 1689 loss=3.937, nll_loss=2.292, ppl=4.9, wps=460042, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=58674 epoch 036: 779 / 1689 loss=3.937, nll_loss=2.292, ppl=4.9, wps=460042, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=58674 epoch 036: 779 / 1689 loss=3.937, nll_loss=2.292, ppl=4.9, wps=460042, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=58674 epoch 036: 779 / 1689 loss=3.937, nll_loss=2.292, ppl=4.9, wps=460042, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=58674 epoch 036: 779 / 1689 loss=3.937, nll_loss=2.292, ppl=4.9, wps=460042, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=58674 epoch 036: 779 / 1689 loss=3.937, nll_loss=2.292, ppl=4.9, wps=460042, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=58674 epoch 036: 779 / 1689 loss=3.937, nll_loss=2.292, ppl=4.9, wps=460042, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=58674 epoch 036: 779 / 1689 loss=3.937, nll_loss=2.292, ppl=4.9, wps=460042, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=58674 epoch 036: 779 / 1689 loss=3.937, nll_loss=2.292, ppl=4.9, wps=460042, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=58674 epoch 036: 779 / 1689 loss=3.937, nll_loss=2.292, ppl=4.9, wps=460042, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=58674 epoch 036: 779 / 1689 loss=3.937, nll_loss=2.292, ppl=4.9, wps=460042, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=58674 epoch 036: 779 / 1689 loss=3.937, nll_loss=2.292, ppl=4.9, wps=460042, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=58674 epoch 036: 779 / 1689 loss=3.937, nll_loss=2.292, ppl=4.9, wps=460042, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=58674 epoch 036: 779 / 1689 loss=3.937, nll_loss=2.292, ppl=4.9, wps=460042, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=58674 epoch 036: 779 / 1689 loss=3.937, nll_loss=2.292, ppl=4.9, wps=460042, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=58674 epoch 036: 779 / 1689 loss=3.937, nll_loss=2.292, ppl=4.9, wps=460042, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=58674 epoch 036: 779 / 1689 loss=3.937, nll_loss=2.292, ppl=4.9, wps=460042, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=58674 epoch 036: 779 / 1689 loss=3.937, nll_loss=2.292, ppl=4.9, wps=460042, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=58674 epoch 036: 779 / 1689 loss=3.937, nll_loss=2.292, ppl=4.9, wps=460042, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=58674 epoch 036: 779 / 1689 loss=3.937, nll_loss=2.292, ppl=4.9, wps=460042, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=58674 epoch 036: 779 / 1689 loss=3.937, nll_loss=2.292, ppl=4.9, wps=460042, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=58674 epoch 036: 779 / 1689 loss=3.937, nll_loss=2.292, ppl=4.9, wps=460042, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=58674 epoch 036: 779 / 1689 loss=3.937, nll_loss=2.292, ppl=4.9, wps=460042, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=58674 epoch 036: 779 / 1689 loss=3.937, nll_loss=2.292, ppl=4.9, wps=460042, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=58674 epoch 036: 779 / 1689 loss=3.937, nll_loss=2.292, ppl=4.9, wps=460042, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=58674 epoch 036: 779 / 1689 loss=3.937, nll_loss=2.292, ppl=4.9, wps=460042, ups=1.06, wpb=433712, bsz=16166.6, num_updates=59800, lr=0.00025863, gnorm=0.208, clip=0, loss_scale=0.5, train_wall=93, gb_free=18.7, wall=58674 epoch 036: 879 / 1689 loss=3.933, nll_loss=2.288, ppl=4.88, wps=461734, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=58768 epoch 036: 879 / 1689 loss=3.933, nll_loss=2.288, ppl=4.88, wps=461734, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=58768 epoch 036: 879 / 1689 loss=3.933, nll_loss=2.288, ppl=4.88, wps=461734, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=58768 epoch 036: 879 / 1689 loss=3.933, nll_loss=2.288, ppl=4.88, wps=461734, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=58768 epoch 036: 879 / 1689 loss=3.933, nll_loss=2.288, ppl=4.88, wps=461734, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=58768 epoch 036: 879 / 1689 loss=3.933, nll_loss=2.288, ppl=4.88, wps=461734, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=58768 epoch 036: 879 / 1689 loss=3.933, nll_loss=2.288, ppl=4.88, wps=461734, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=58768 epoch 036: 879 / 1689 loss=3.933, nll_loss=2.288, ppl=4.88, wps=461734, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=58768 epoch 036: 879 / 1689 loss=3.933, nll_loss=2.288, ppl=4.88, wps=461734, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=58768 epoch 036: 879 / 1689 loss=3.933, nll_loss=2.288, ppl=4.88, wps=461734, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=58768 epoch 036: 879 / 1689 loss=3.933, nll_loss=2.288, ppl=4.88, wps=461734, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=58768 epoch 036: 879 / 1689 loss=3.933, nll_loss=2.288, ppl=4.88, wps=461734, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=58768 epoch 036: 879 / 1689 loss=3.933, nll_loss=2.288, ppl=4.88, wps=461734, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=58768 epoch 036: 879 / 1689 loss=3.933, nll_loss=2.288, ppl=4.88, wps=461734, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=58768 epoch 036: 879 / 1689 loss=3.933, nll_loss=2.288, ppl=4.88, wps=461734, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=58768 epoch 036: 879 / 1689 loss=3.933, nll_loss=2.288, ppl=4.88, wps=461734, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=58768 epoch 036: 879 / 1689 loss=3.933, nll_loss=2.288, ppl=4.88, wps=461734, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=58768 epoch 036: 879 / 1689 loss=3.933, nll_loss=2.288, ppl=4.88, wps=461734, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=58768 epoch 036: 879 / 1689 loss=3.933, nll_loss=2.288, ppl=4.88, wps=461734, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=58768 epoch 036: 879 / 1689 loss=3.933, nll_loss=2.288, ppl=4.88, wps=461734, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=58768 epoch 036: 879 / 1689 loss=3.933, nll_loss=2.288, ppl=4.88, wps=461734, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=58768 epoch 036: 879 / 1689 loss=3.933, nll_loss=2.288, ppl=4.88, wps=461734, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=58768 epoch 036: 879 / 1689 loss=3.933, nll_loss=2.288, ppl=4.88, wps=461734, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=58768 epoch 036: 879 / 1689 loss=3.933, nll_loss=2.288, ppl=4.88, wps=461734, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=58768 epoch 036: 879 / 1689 loss=3.933, nll_loss=2.288, ppl=4.88, wps=461734, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=58768 epoch 036: 879 / 1689 loss=3.933, nll_loss=2.288, ppl=4.88, wps=461734, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=58768 epoch 036: 879 / 1689 loss=3.933, nll_loss=2.288, ppl=4.88, wps=461734, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=58768 epoch 036: 879 / 1689 loss=3.933, nll_loss=2.288, ppl=4.88, wps=461734, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=58768 epoch 036: 879 / 1689 loss=3.933, nll_loss=2.288, ppl=4.88, wps=461734, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=58768 epoch 036: 879 / 1689 loss=3.933, nll_loss=2.288, ppl=4.88, wps=461734, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=58768 epoch 036: 879 / 1689 loss=3.933, nll_loss=2.288, ppl=4.88, wps=461734, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=58768 epoch 036: 879 / 1689 loss=3.933, nll_loss=2.288, ppl=4.88, wps=461734, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=58768 epoch 036: 879 / 1689 loss=3.933, nll_loss=2.288, ppl=4.88, wps=461734, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=58768 epoch 036: 879 / 1689 loss=3.933, nll_loss=2.288, ppl=4.88, wps=461734, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=58768 epoch 036: 879 / 1689 loss=3.933, nll_loss=2.288, ppl=4.88, wps=461734, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=58768 epoch 036: 879 / 1689 loss=3.933, nll_loss=2.288, ppl=4.88, wps=461734, ups=1.06, wpb=434389, bsz=16472.6, num_updates=59900, lr=0.000258414, gnorm=0.232, clip=0, loss_scale=0.5, train_wall=93, gb_free=19.2, wall=58768 epoch 036: 979 / 1689 loss=3.936, nll_loss=2.291, ppl=4.9, wps=460161, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=58862 epoch 036: 979 / 1689 loss=3.936, nll_loss=2.291, ppl=4.9, wps=460161, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=58862 epoch 036: 979 / 1689 loss=3.936, nll_loss=2.291, ppl=4.9, wps=460161, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=58862 epoch 036: 979 / 1689 loss=3.936, nll_loss=2.291, ppl=4.9, wps=460161, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=58862 epoch 036: 979 / 1689 loss=3.936, nll_loss=2.291, ppl=4.9, wps=460161, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=58862 epoch 036: 979 / 1689 loss=3.936, nll_loss=2.291, ppl=4.9, wps=460161, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=58862 epoch 036: 979 / 1689 loss=3.936, nll_loss=2.291, ppl=4.9, wps=460161, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=58862 epoch 036: 979 / 1689 loss=3.936, nll_loss=2.291, ppl=4.9, wps=460161, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=58862 epoch 036: 979 / 1689 loss=3.936, nll_loss=2.291, ppl=4.9, wps=460161, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=58862 epoch 036: 979 / 1689 loss=3.936, nll_loss=2.291, ppl=4.9, wps=460161, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=58862 epoch 036: 979 / 1689 loss=3.936, nll_loss=2.291, ppl=4.9, wps=460161, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=58862 epoch 036: 979 / 1689 loss=3.936, nll_loss=2.291, ppl=4.9, wps=460161, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=58862 epoch 036: 979 / 1689 loss=3.936, nll_loss=2.291, ppl=4.9, wps=460161, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=58862 epoch 036: 979 / 1689 loss=3.936, nll_loss=2.291, ppl=4.9, wps=460161, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=58862 epoch 036: 979 / 1689 loss=3.936, nll_loss=2.291, ppl=4.9, wps=460161, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=58862 epoch 036: 979 / 1689 loss=3.936, nll_loss=2.291, ppl=4.9, wps=460161, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=58862 epoch 036: 979 / 1689 loss=3.936, nll_loss=2.291, ppl=4.9, wps=460161, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=58862 epoch 036: 979 / 1689 loss=3.936, nll_loss=2.291, ppl=4.9, wps=460161, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=58862 epoch 036: 979 / 1689 loss=3.936, nll_loss=2.291, ppl=4.9, wps=460161, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=58862 epoch 036: 979 / 1689 loss=3.936, nll_loss=2.291, ppl=4.9, wps=460161, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=58862 epoch 036: 979 / 1689 loss=3.936, nll_loss=2.291, ppl=4.9, wps=460161, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=58862 epoch 036: 979 / 1689 loss=3.936, nll_loss=2.291, ppl=4.9, wps=460161, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=58862 epoch 036: 979 / 1689 loss=3.936, nll_loss=2.291, ppl=4.9, wps=460161, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=58862 epoch 036: 979 / 1689 loss=3.936, nll_loss=2.291, ppl=4.9, wps=460161, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=58862 epoch 036: 979 / 1689 loss=3.936, nll_loss=2.291, ppl=4.9, wps=460161, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=58862 epoch 036: 979 / 1689 loss=3.936, nll_loss=2.291, ppl=4.9, wps=460161, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=58862 epoch 036: 979 / 1689 loss=3.936, nll_loss=2.291, ppl=4.9, wps=460161, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=58862 epoch 036: 979 / 1689 loss=3.936, nll_loss=2.291, ppl=4.9, wps=460161, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=58862 epoch 036: 979 / 1689 loss=3.936, nll_loss=2.291, ppl=4.9, wps=460161, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=58862 epoch 036: 979 / 1689 loss=3.936, nll_loss=2.291, ppl=4.9, wps=460161, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=58862 epoch 036: 979 / 1689 loss=3.936, nll_loss=2.291, ppl=4.9, wps=460161, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=58862 epoch 036: 979 / 1689 loss=3.936, nll_loss=2.291, ppl=4.9, wps=460161, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=58862 epoch 036: 979 / 1689 loss=3.936, nll_loss=2.291, ppl=4.9, wps=460161, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=58862 epoch 036: 979 / 1689 loss=3.936, nll_loss=2.291, ppl=4.9, wps=460161, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=58862 epoch 036: 979 / 1689 loss=3.936, nll_loss=2.291, ppl=4.9, wps=460161, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=58862 epoch 036: 979 / 1689 loss=3.936, nll_loss=2.291, ppl=4.9, wps=460161, ups=1.06, wpb=434435, bsz=16454, num_updates=60000, lr=0.000258199, gnorm=0.21, clip=0, loss_scale=1, train_wall=93, gb_free=19, wall=58862 Stopping training due to num_updates: 60000 >= max_update: 60000 begin validation on "valid" subset epoch 036 | valid on 'valid' subset | loss 4.241 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.237 epoch 036 | valid on 'valid' subset | loss 4.241 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.237 epoch 036 | valid on 'valid' subset | loss 4.241 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.237 epoch 036 | valid on 'valid' subset | loss 4.241 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.237 epoch 036 | valid on 'valid' subset | loss 4.241 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.237 epoch 036 | valid on 'valid' subset | loss 4.241 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.237 epoch 036 | valid on 'valid' subset | loss 4.241 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.237 epoch 036 | valid on 'valid' subset | loss 4.241 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.237 epoch 036 | valid on 'valid' subset | loss 4.241 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.237 epoch 036 | valid on 'valid' subset | loss 4.241 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.237 epoch 036 | valid on 'valid' subset | loss 4.241 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.237 epoch 036 | valid on 'valid' subset | loss 4.241 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.237 epoch 036 | valid on 'valid' subset | loss 4.241 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.237 epoch 036 | valid on 'valid' subset | loss 4.241 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.237 epoch 036 | valid on 'valid' subset | loss 4.241 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.237 epoch 036 | valid on 'valid' subset | loss 4.241 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.237 epoch 036 | valid on 'valid' subset | loss 4.241 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.237 epoch 036 | valid on 'valid' subset | loss 4.241 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.237 epoch 036 | valid on 'valid' subset | loss 4.241 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.237 epoch 036 | valid on 'valid' subset | loss 4.241 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.237 epoch 036 | valid on 'valid' subset | loss 4.241 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.237 epoch 036 | valid on 'valid' subset | loss 4.241 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.237 epoch 036 | valid on 'valid' subset | loss 4.241 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.237 epoch 036 | valid on 'valid' subset | loss 4.241 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.237 epoch 036 | valid on 'valid' subset | loss 4.241 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.237 epoch 036 | valid on 'valid' subset | loss 4.241 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.237 epoch 036 | valid on 'valid' subset | loss 4.241 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.237 epoch 036 | valid on 'valid' subset | loss 4.241 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.237 epoch 036 | valid on 'valid' subset | loss 4.241 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.237 epoch 036 | valid on 'valid' subset | loss 4.241 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.237 epoch 036 | valid on 'valid' subset | loss 4.241 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.237 epoch 036 | valid on 'valid' subset | loss 4.241 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.237 epoch 036 | valid on 'valid' subset | loss 4.241 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.237 epoch 036 | valid on 'valid' subset | loss 4.241 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.237 epoch 036 | valid on 'valid' subset | loss 4.241 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.237 epoch 036 | valid on 'valid' subset | loss 4.241 | nll_loss 2.621 | ppl 6.15 | wps 0 | wpb 42662 | bsz 2032 | num_updates 60000 | best_loss 4.237 end of epoch 36 (average epoch stats below) epoch 036 | loss 3.927 | nll_loss 2.28 | ppl 4.86 | wps 449332 | ups 1.04 | wpb 433220 | bsz 16462.6 | num_updates 60000 | lr 0.000258199 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 909 | gb_free 19 | wall 58884 epoch 036 | loss 3.927 | nll_loss 2.28 | ppl 4.86 | wps 449332 | ups 1.04 | wpb 433220 | bsz 16462.6 | num_updates 60000 | lr 0.000258199 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 909 | gb_free 19 | wall 58884 epoch 036 | loss 3.927 | nll_loss 2.28 | ppl 4.86 | wps 449332 | ups 1.04 | wpb 433220 | bsz 16462.6 | num_updates 60000 | lr 0.000258199 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 909 | gb_free 19 | wall 58884 epoch 036 | loss 3.927 | nll_loss 2.28 | ppl 4.86 | wps 449332 | ups 1.04 | wpb 433220 | bsz 16462.6 | num_updates 60000 | lr 0.000258199 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 909 | gb_free 19 | wall 58884 epoch 036 | loss 3.927 | nll_loss 2.28 | ppl 4.86 | wps 449332 | ups 1.04 | wpb 433220 | bsz 16462.6 | num_updates 60000 | lr 0.000258199 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 909 | gb_free 19 | wall 58884 epoch 036 | loss 3.927 | nll_loss 2.28 | ppl 4.86 | wps 449332 | ups 1.04 | wpb 433220 | bsz 16462.6 | num_updates 60000 | lr 0.000258199 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 909 | gb_free 19 | wall 58884 epoch 036 | loss 3.927 | nll_loss 2.28 | ppl 4.86 | wps 449332 | ups 1.04 | wpb 433220 | bsz 16462.6 | num_updates 60000 | lr 0.000258199 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 909 | gb_free 19 | wall 58884 epoch 036 | loss 3.927 | nll_loss 2.28 | ppl 4.86 | wps 449332 | ups 1.04 | wpb 433220 | bsz 16462.6 | num_updates 60000 | lr 0.000258199 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 909 | gb_free 19 | wall 58884 epoch 036 | loss 3.927 | nll_loss 2.28 | ppl 4.86 | wps 449332 | ups 1.04 | wpb 433220 | bsz 16462.6 | num_updates 60000 | lr 0.000258199 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 909 | gb_free 19 | wall 58884 epoch 036 | loss 3.927 | nll_loss 2.28 | ppl 4.86 | wps 449332 | ups 1.04 | wpb 433220 | bsz 16462.6 | num_updates 60000 | lr 0.000258199 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 909 | gb_free 19 | wall 58884 epoch 036 | loss 3.927 | nll_loss 2.28 | ppl 4.86 | wps 449332 | ups 1.04 | wpb 433220 | bsz 16462.6 | num_updates 60000 | lr 0.000258199 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 909 | gb_free 19 | wall 58884 epoch 036 | loss 3.927 | nll_loss 2.28 | ppl 4.86 | wps 449332 | ups 1.04 | wpb 433220 | bsz 16462.6 | num_updates 60000 | lr 0.000258199 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 909 | gb_free 19 | wall 58884 epoch 036 | loss 3.927 | nll_loss 2.28 | ppl 4.86 | wps 449332 | ups 1.04 | wpb 433220 | bsz 16462.6 | num_updates 60000 | lr 0.000258199 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 909 | gb_free 19 | wall 58884 epoch 036 | loss 3.927 | nll_loss 2.28 | ppl 4.86 | wps 449332 | ups 1.04 | wpb 433220 | bsz 16462.6 | num_updates 60000 | lr 0.000258199 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 909 | gb_free 19 | wall 58884 epoch 036 | loss 3.927 | nll_loss 2.28 | ppl 4.86 | wps 449332 | ups 1.04 | wpb 433220 | bsz 16462.6 | num_updates 60000 | lr 0.000258199 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 909 | gb_free 19 | wall 58884 epoch 036 | loss 3.927 | nll_loss 2.28 | ppl 4.86 | wps 449332 | ups 1.04 | wpb 433220 | bsz 16462.6 | num_updates 60000 | lr 0.000258199 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 909 | gb_free 19 | wall 58884 epoch 036 | loss 3.927 | nll_loss 2.28 | ppl 4.86 | wps 449332 | ups 1.04 | wpb 433220 | bsz 16462.6 | num_updates 60000 | lr 0.000258199 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 909 | gb_free 19 | wall 58884 epoch 036 | loss 3.927 | nll_loss 2.28 | ppl 4.86 | wps 449332 | ups 1.04 | wpb 433220 | bsz 16462.6 | num_updates 60000 | lr 0.000258199 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 909 | gb_free 19 | wall 58884 epoch 036 | loss 3.927 | nll_loss 2.28 | ppl 4.86 | wps 449332 | ups 1.04 | wpb 433220 | bsz 16462.6 | num_updates 60000 | lr 0.000258199 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 909 | gb_free 19 | wall 58884 epoch 036 | loss 3.927 | nll_loss 2.28 | ppl 4.86 | wps 449332 | ups 1.04 | wpb 433220 | bsz 16462.6 | num_updates 60000 | lr 0.000258199 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 909 | gb_free 19 | wall 58884 epoch 036 | loss 3.927 | nll_loss 2.28 | ppl 4.86 | wps 449332 | ups 1.04 | wpb 433220 | bsz 16462.6 | num_updates 60000 | lr 0.000258199 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 909 | gb_free 19 | wall 58884 epoch 036 | loss 3.927 | nll_loss 2.28 | ppl 4.86 | wps 449332 | ups 1.04 | wpb 433220 | bsz 16462.6 | num_updates 60000 | lr 0.000258199 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 909 | gb_free 19 | wall 58884 epoch 036 | loss 3.927 | nll_loss 2.28 | ppl 4.86 | wps 449332 | ups 1.04 | wpb 433220 | bsz 16462.6 | num_updates 60000 | lr 0.000258199 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 909 | gb_free 19 | wall 58884 epoch 036 | loss 3.927 | nll_loss 2.28 | ppl 4.86 | wps 449332 | ups 1.04 | wpb 433220 | bsz 16462.6 | num_updates 60000 | lr 0.000258199 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 909 | gb_free 19 | wall 58884 epoch 036 | loss 3.927 | nll_loss 2.28 | ppl 4.86 | wps 449332 | ups 1.04 | wpb 433220 | bsz 16462.6 | num_updates 60000 | lr 0.000258199 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 909 | gb_free 19 | wall 58884 epoch 036 | loss 3.927 | nll_loss 2.28 | ppl 4.86 | wps 449332 | ups 1.04 | wpb 433220 | bsz 16462.6 | num_updates 60000 | lr 0.000258199 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 909 | gb_free 19 | wall 58884 epoch 036 | loss 3.927 | nll_loss 2.28 | ppl 4.86 | wps 449332 | ups 1.04 | wpb 433220 | bsz 16462.6 | num_updates 60000 | lr 0.000258199 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 909 | gb_free 19 | wall 58884 epoch 036 | loss 3.927 | nll_loss 2.28 | ppl 4.86 | wps 449332 | ups 1.04 | wpb 433220 | bsz 16462.6 | num_updates 60000 | lr 0.000258199 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 909 | gb_free 19 | wall 58884 epoch 036 | loss 3.927 | nll_loss 2.28 | ppl 4.86 | wps 449332 | ups 1.04 | wpb 433220 | bsz 16462.6 | num_updates 60000 | lr 0.000258199 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 909 | gb_free 19 | wall 58884 epoch 036 | loss 3.927 | nll_loss 2.28 | ppl 4.86 | wps 449332 | ups 1.04 | wpb 433220 | bsz 16462.6 | num_updates 60000 | lr 0.000258199 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 909 | gb_free 19 | wall 58884 epoch 036 | loss 3.927 | nll_loss 2.28 | ppl 4.86 | wps 449332 | ups 1.04 | wpb 433220 | bsz 16462.6 | num_updates 60000 | lr 0.000258199 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 909 | gb_free 19 | wall 58884 epoch 036 | loss 3.927 | nll_loss 2.28 | ppl 4.86 | wps 449332 | ups 1.04 | wpb 433220 | bsz 16462.6 | num_updates 60000 | lr 0.000258199 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 909 | gb_free 19 | wall 58884 epoch 036 | loss 3.927 | nll_loss 2.28 | ppl 4.86 | wps 449332 | ups 1.04 | wpb 433220 | bsz 16462.6 | num_updates 60000 | lr 0.000258199 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 909 | gb_free 19 | wall 58884 epoch 036 | loss 3.927 | nll_loss 2.28 | ppl 4.86 | wps 449332 | ups 1.04 | wpb 433220 | bsz 16462.6 | num_updates 60000 | lr 0.000258199 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 909 | gb_free 19 | wall 58884 epoch 036 | loss 3.927 | nll_loss 2.28 | ppl 4.86 | wps 449332 | ups 1.04 | wpb 433220 | bsz 16462.6 | num_updates 60000 | lr 0.000258199 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 909 | gb_free 19 | wall 58884 epoch 036 | loss 3.927 | nll_loss 2.28 | ppl 4.86 | wps 449332 | ups 1.04 | wpb 433220 | bsz 16462.6 | num_updates 60000 | lr 0.000258199 | gnorm 0.217 | clip 0 | loss_scale 1 | train_wall 909 | gb_free 19 | wall 58884 done training in 58871.0 seconds