{'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': 'simple', 'log_file': 'chkpt/ja-en.do03.ado01/train.log', 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': 'wmt23', 'azureml_logging': False, 'seed': 0, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': False, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 8, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppress_crashes': False, 'use_plasma_view': False, 'plasma_path': '/tmp/plasma'}, 'common_eval': {'_name': None, 'path': None, 'post_process': None, 'quiet': False, 'model_overrides': '{}', 'results_path': None}, 'distributed_training': {'_name': None, 'distributed_world_size': 8, 'distributed_num_procs': 8, 'distributed_rank': 0, 'distributed_backend': 'nccl', 'distributed_init_method': 'tcp://localhost:41372', 'distributed_port': 41372, 'device_id': 0, 'distributed_no_spawn': False, 'ddp_backend': 'pytorch_ddp', 'ddp_comm_hook': 'none', 'bucket_cap_mb': 25, 'fix_batches_to_gpus': False, 'find_unused_parameters': False, 'gradient_as_bucket_view': False, 'fast_stat_sync': False, 'heartbeat_timeout': -1, 'broadcast_buffers': False, 'slowmo_momentum': None, 'slowmo_base_algorithm': 'localsgd', 'localsgd_frequency': 3, 'nprocs_per_node': 8, 'pipeline_model_parallel': False, 'pipeline_balance': None, 'pipeline_devices': None, 'pipeline_chunks': 0, 'pipeline_encoder_balance': None, 'pipeline_encoder_devices': None, 'pipeline_decoder_balance': None, 'pipeline_decoder_devices': None, 'pipeline_checkpoint': 'never', 'zero_sharding': 'none', 'fp16': True, 'memory_efficient_fp16': False, 'tpu': False, 'no_reshard_after_forward': False, 'fp32_reduce_scatter': False, 'cpu_offload': False, 'use_sharded_state': False, 'not_fsdp_flatten_parameters': False}, 'dataset': {'_name': None, 'num_workers': 0, 'skip_invalid_size_inputs_valid_test': False, 'max_tokens': 16384, 'batch_size': None, 'required_batch_size_multiple': 8, 'required_seq_len_multiple': 1, 'dataset_impl': None, 'data_buffer_size': 10, 'train_subset': 'train', 'valid_subset': 'valid', 'combine_valid_subsets': None, 'ignore_unused_valid_subsets': False, 'validate_interval': 100000, 'validate_interval_updates': 0, 'validate_after_updates': 0, 'fixed_validation_seed': None, 'disable_validation': False, 'max_tokens_valid': 16384, 'batch_size_valid': None, 'max_valid_steps': None, 'curriculum': 0, 'gen_subset': 'test', 'num_shards': 1, 'shard_id': 0, 'grouped_shuffling': False, 'update_epoch_batch_itr': False, 'update_ordered_indices_seed': False}, 'optimization': {'_name': None, 'max_epoch': 0, 'max_update': 60000, 'stop_time_hours': 0.0, 'clip_norm': 1.0, 'sentence_avg': False, 'update_freq': [4], 'lr': [0.001], 'stop_min_lr': -1.0, 'use_bmuf': False, 'skip_remainder_batch': False, 'debug_param_names': False}, 'checkpoint': {'_name': None, 'save_dir': 'chkpt/ja-en.do03.ado01', 'restore_file': 'checkpoint_last.pt', 'continue_once': None, 'finetune_from_model': None, 'reset_dataloader': False, 'reset_lr_scheduler': False, 'reset_meters': False, 'reset_optimizer': False, 'optimizer_overrides': '{}', 'save_interval': 100000, 'save_interval_updates': 1000, 'keep_interval_updates': 10, 'keep_interval_updates_pattern': -1, 'keep_last_epochs': -1, 'keep_best_checkpoints': -1, 'no_save': False, 'no_epoch_checkpoints': True, 'no_last_checkpoints': False, 'no_save_optimizer_state': False, 'best_checkpoint_metric': 'loss', 'maximize_best_checkpoint_metric': False, 'patience': -1, 'checkpoint_suffix': '', 'checkpoint_shard_count': 1, 'load_checkpoint_on_all_dp_ranks': False, 'write_checkpoints_asynchronously': False, 'model_parallel_size': 1}, 'bmuf': {'_name': None, 'block_lr': 1.0, 'block_momentum': 0.875, 'global_sync_iter': 50, 'warmup_iterations': 500, 'use_nbm': False, 'average_sync': False, 'distributed_world_size': 8}, 'generation': {'_name': None, 'beam': 5, 'beam_mt': 0, 'nbest': 1, 'max_len_a': 0.0, 'max_len_b': 200, 'max_len_a_mt': 0.0, 'max_len_b_mt': 200, 'min_len': 1, 'match_source_len': False, 'unnormalized': False, 'no_early_stop': False, 'no_beamable_mm': False, 'lenpen': 1.0, 'lenpen_mt': 1.0, 'unkpen': 0.0, 'replace_unk': None, 'sacrebleu': False, 'score_reference': False, 'prefix_size': 0, 'no_repeat_ngram_size': 0, 'sampling': False, 'sampling_topk': -1, 'sampling_topp': -1.0, 'constraints': None, 'temperature': 1.0, 'diverse_beam_groups': -1, 'diverse_beam_strength': 0.5, 'diversity_rate': -1.0, 'print_alignment': None, 'print_step': False, 'lm_path': None, 'lm_weight': 0.0, 'iter_decode_eos_penalty': 0.0, 'iter_decode_max_iter': 10, 'iter_decode_force_max_iter': False, 'iter_decode_with_beam': 1, 'iter_decode_with_external_reranker': False, 'retain_iter_history': False, 'retain_dropout': False, 'retain_dropout_modules': None, 'decoding_format': None, 'no_seed_provided': False, 'eos_token': None}, 'eval_lm': {'_name': None, 'output_word_probs': False, 'output_word_stats': False, 'context_window': 0, 'softmax_batch': 9223372036854775807}, 'interactive': {'_name': None, 'buffer_size': 0, 'input': '-'}, 'model': Namespace(no_progress_bar=False, log_interval=100, log_format='simple', log_file='chkpt/ja-en.do03.ado01/train.log', aim_repo=None, aim_run_hash=None, tensorboard_logdir=None, wandb_project='wmt23', azureml_logging=False, seed=0, cpu=False, tpu=False, bf16=False, memory_efficient_bf16=False, fp16=True, memory_efficient_fp16=False, fp16_no_flatten_grads=False, fp16_init_scale=8, fp16_scale_window=None, fp16_scale_tolerance=0.0, on_cpu_convert_precision=False, min_loss_scale=0.0001, threshold_loss_scale=None, amp=False, amp_batch_retries=2, amp_init_scale=128, amp_scale_window=None, user_dir=None, empty_cache_freq=0, all_gather_list_size=16384, model_parallel_size=1, quantization_config_path=None, profile=False, reset_logging=False, suppress_crashes=False, use_plasma_view=False, plasma_path='/tmp/plasma', criterion='label_smoothed_cross_entropy', tokenizer=None, bpe=None, optimizer='adam', lr_scheduler='inverse_sqrt', scoring='bleu', task='translation', num_workers=0, skip_invalid_size_inputs_valid_test=False, max_tokens=16384, batch_size=None, required_batch_size_multiple=8, required_seq_len_multiple=1, dataset_impl=None, data_buffer_size=10, train_subset='train', valid_subset='valid', combine_valid_subsets=None, ignore_unused_valid_subsets=False, validate_interval=100000, validate_interval_updates=0, validate_after_updates=0, fixed_validation_seed=None, disable_validation=False, max_tokens_valid=16384, batch_size_valid=None, max_valid_steps=None, curriculum=0, gen_subset='test', num_shards=1, shard_id=0, grouped_shuffling=False, update_epoch_batch_itr=False, update_ordered_indices_seed=False, distributed_world_size=8, distributed_num_procs=8, distributed_rank=0, distributed_backend='nccl', distributed_init_method=None, distributed_port=-1, device_id=0, distributed_no_spawn=False, ddp_backend='pytorch_ddp', ddp_comm_hook='none', bucket_cap_mb=25, fix_batches_to_gpus=False, find_unused_parameters=False, gradient_as_bucket_view=False, fast_stat_sync=False, heartbeat_timeout=-1, broadcast_buffers=False, slowmo_momentum=None, slowmo_base_algorithm='localsgd', localsgd_frequency=3, nprocs_per_node=8, pipeline_model_parallel=False, pipeline_balance=None, pipeline_devices=None, pipeline_chunks=0, pipeline_encoder_balance=None, pipeline_encoder_devices=None, pipeline_decoder_balance=None, pipeline_decoder_devices=None, pipeline_checkpoint='never', zero_sharding='none', no_reshard_after_forward=False, fp32_reduce_scatter=False, cpu_offload=False, use_sharded_state=False, not_fsdp_flatten_parameters=False, arch='transformer_vaswani_wmt_en_de_big', max_epoch=0, max_update=60000, stop_time_hours=0, clip_norm=1.0, sentence_avg=False, update_freq=[4], lr=[0.001], stop_min_lr=-1.0, use_bmuf=False, skip_remainder_batch=False, debug_param_names=False, save_dir='chkpt/ja-en.do03.ado01', restore_file='checkpoint_last.pt', continue_once=None, finetune_from_model=None, reset_dataloader=False, reset_lr_scheduler=False, reset_meters=False, reset_optimizer=False, optimizer_overrides='{}', save_interval=100000, save_interval_updates=1000, keep_interval_updates=10, keep_interval_updates_pattern=-1, keep_last_epochs=-1, keep_best_checkpoints=-1, no_save=False, no_epoch_checkpoints=True, no_last_checkpoints=False, no_save_optimizer_state=False, best_checkpoint_metric='loss', maximize_best_checkpoint_metric=False, patience=-1, checkpoint_suffix='', checkpoint_shard_count=1, load_checkpoint_on_all_dp_ranks=False, write_checkpoints_asynchronously=False, store_ema=False, ema_decay=0.9999, ema_start_update=0, ema_seed_model=None, ema_update_freq=1, ema_fp32=False, data='binarized/ja-en/', source_lang=None, target_lang=None, load_alignments=False, left_pad_source=True, left_pad_target=False, upsample_primary=-1, truncate_source=False, num_batch_buckets=0, eval_bleu=False, eval_bleu_args='{}', eval_bleu_detok='space', eval_bleu_detok_args='{}', eval_tokenized_bleu=False, eval_bleu_remove_bpe=None, eval_bleu_print_samples=False, label_smoothing=0.1, report_accuracy=False, ignore_prefix_size=0, adam_betas='(0.9, 0.98)', adam_eps=1e-08, weight_decay=0.0, use_old_adam=False, fp16_adam_stats=False, warmup_updates=4000, warmup_init_lr=-1, pad=1, eos=2, unk=3, encoder_ffn_embed_dim=8192, decoder_ffn_embed_dim=8192, dropout=0.3, attention_dropout=0.1, share_decoder_input_output_embed=True, no_seed_provided=False, encoder_embed_dim=1024, encoder_attention_heads=16, encoder_normalize_before=False, decoder_embed_dim=1024, decoder_attention_heads=16, encoder_embed_path=None, encoder_layers=6, encoder_learned_pos=False, decoder_embed_path=None, decoder_layers=6, decoder_normalize_before=False, decoder_learned_pos=False, activation_dropout=0.0, activation_fn='relu', adaptive_softmax_cutoff=None, adaptive_softmax_dropout=0, share_all_embeddings=False, merge_src_tgt_embed=False, no_token_positional_embeddings=False, adaptive_input=False, no_cross_attention=False, cross_self_attention=False, decoder_output_dim=1024, decoder_input_dim=1024, no_scale_embedding=False, layernorm_embedding=False, tie_adaptive_weights=False, checkpoint_activations=False, offload_activations=False, encoder_layers_to_keep=None, decoder_layers_to_keep=None, encoder_layerdrop=0, decoder_layerdrop=0, quant_noise_pq=0, quant_noise_pq_block_size=8, quant_noise_scalar=0, _name='transformer_vaswani_wmt_en_de_big'), 'task': {'_name': 'translation', 'data': 'binarized/ja-en/', 'source_lang': None, 'target_lang': None, 'load_alignments': False, 'left_pad_source': True, 'left_pad_target': False, 'max_source_positions': 1024, 'max_target_positions': 1024, 'upsample_primary': -1, 'truncate_source': False, 'num_batch_buckets': 0, 'train_subset': 'train', 'dataset_impl': None, 'required_seq_len_multiple': 1, 'eval_bleu': False, 'eval_bleu_args': '{}', 'eval_bleu_detok': 'space', 'eval_bleu_detok_args': '{}', 'eval_tokenized_bleu': False, 'eval_bleu_remove_bpe': None, 'eval_bleu_print_samples': False}, 'criterion': {'_name': 'label_smoothed_cross_entropy', 'label_smoothing': 0.1, 'report_accuracy': False, 'ignore_prefix_size': 0, 'sentence_avg': False}, 'optimizer': {'_name': 'adam', 'adam_betas': '(0.9, 0.98)', 'adam_eps': 1e-08, 'weight_decay': 0.0, 'use_old_adam': False, 'fp16_adam_stats': False, 'tpu': False, 'lr': [0.001]}, 'lr_scheduler': {'_name': 'inverse_sqrt', 'warmup_updates': 4000, 'warmup_init_lr': -1.0, 'lr': [0.001]}, 'scoring': {'_name': 'bleu', 'pad': 1, 'eos': 2, 'unk': 3}, 'bpe': None, 'tokenizer': None, 'ema': {'_name': None, 'store_ema': False, 'ema_decay': 0.9999, 'ema_start_update': 0, 'ema_seed_model': None, 'ema_update_freq': 1, 'ema_fp32': False}} TransformerModel( (encoder): TransformerEncoderBase( (dropout_module): FairseqDropout() (embed_tokens): Embedding(32000, 1024, padding_idx=1) (embed_positions): SinusoidalPositionalEmbedding() (layers): ModuleList( (0-5): 6 x TransformerEncoderLayerBase( (self_attn): MultiheadAttention( (dropout_module): FairseqDropout() (k_proj): Linear(in_features=1024, out_features=1024, bias=True) (v_proj): Linear(in_features=1024, out_features=1024, bias=True) (q_proj): Linear(in_features=1024, out_features=1024, bias=True) (out_proj): Linear(in_features=1024, out_features=1024, bias=True) ) (self_attn_layer_norm): FusedLayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) (dropout_module): FairseqDropout() (activation_dropout_module): FairseqDropout() (fc1): Linear(in_features=1024, out_features=8192, bias=True) (fc2): Linear(in_features=8192, out_features=1024, bias=True) (final_layer_norm): FusedLayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) ) ) ) (decoder): TransformerDecoderBase( (dropout_module): FairseqDropout() (embed_tokens): Embedding(16000, 1024, padding_idx=1) (embed_positions): SinusoidalPositionalEmbedding() (layers): ModuleList( (0-5): 6 x TransformerDecoderLayerBase( (dropout_module): FairseqDropout() (self_attn): MultiheadAttention( (dropout_module): FairseqDropout() (k_proj): Linear(in_features=1024, out_features=1024, bias=True) (v_proj): Linear(in_features=1024, out_features=1024, bias=True) (q_proj): Linear(in_features=1024, out_features=1024, bias=True) (out_proj): Linear(in_features=1024, out_features=1024, bias=True) ) (activation_dropout_module): FairseqDropout() (self_attn_layer_norm): FusedLayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) (encoder_attn): MultiheadAttention( (dropout_module): FairseqDropout() (k_proj): Linear(in_features=1024, out_features=1024, bias=True) (v_proj): Linear(in_features=1024, out_features=1024, bias=True) (q_proj): Linear(in_features=1024, out_features=1024, bias=True) (out_proj): Linear(in_features=1024, out_features=1024, bias=True) ) (encoder_attn_layer_norm): FusedLayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) (fc1): Linear(in_features=1024, out_features=8192, bias=True) (fc2): Linear(in_features=8192, out_features=1024, bias=True) (final_layer_norm): FusedLayerNorm(torch.Size([1024]), eps=1e-05, elementwise_affine=True) ) ) (output_projection): Linear(in_features=1024, out_features=16000, bias=False) ) ) task: TranslationTask model: TransformerModel criterion: LabelSmoothedCrossEntropyCriterion num. shared model params: 326,221,824 (num. trained: 326,221,824) num. expert model params: 0 (num. trained: 0) training on 8 devices (GPUs/TPUs) max tokens per device = 16384 and max sentences per device = None begin dry-run validation on "valid" subset Start iterating over samples epoch 001: 101 / 1689 loss=12.175, nll_loss=11.872, ppl=3748.25, wps=543397, ups=1.1, wpb=495110, bsz=16550.1, num_updates=100, lr=2.5e-05, gnorm=2.351, clip=69, loss_scale=4, train_wall=92, gb_free=21, wall=111 epoch 001: 201 / 1689 loss=10.627, nll_loss=10.094, ppl=1093.11, wps=550479, ups=1.11, wpb=494772, bsz=16958.6, num_updates=200, lr=5e-05, gnorm=1.772, clip=92, loss_scale=4, train_wall=89, gb_free=21.1, wall=201 epoch 001: 301 / 1689 loss=10.022, nll_loss=9.374, ppl=663.58, wps=552085, ups=1.11, wpb=496328, bsz=16644.9, num_updates=300, lr=7.5e-05, gnorm=1.894, clip=99, loss_scale=4, train_wall=89, gb_free=21.7, wall=290 epoch 001: 401 / 1689 loss=9.424, nll_loss=8.668, ppl=406.86, wps=549608, ups=1.11, wpb=495021, bsz=16565.7, num_updates=400, lr=0.0001, gnorm=1.715, clip=98, loss_scale=4, train_wall=89, gb_free=21.7, wall=381 epoch 001: 501 / 1689 loss=8.992, nll_loss=8.155, ppl=285.05, wps=551373, ups=1.11, wpb=495038, bsz=16610.6, num_updates=500, lr=0.000125, gnorm=1.573, clip=99, loss_scale=4, train_wall=89, gb_free=21.5, wall=470 epoch 001: 601 / 1689 loss=8.659, nll_loss=7.761, ppl=216.86, wps=546032, ups=1.1, wpb=495492, bsz=16496.7, num_updates=600, lr=0.00015, gnorm=1.407, clip=97, loss_scale=8, train_wall=90, gb_free=17.5, wall=561 epoch 001: 701 / 1689 loss=8.328, nll_loss=7.374, ppl=165.93, wps=542612, ups=1.1, wpb=494852, bsz=16337.6, num_updates=700, lr=0.000175, gnorm=1.276, clip=93, loss_scale=8, train_wall=90, gb_free=22.2, wall=652 epoch 001: 801 / 1689 loss=8.012, nll_loss=7.007, ppl=128.64, wps=549560, ups=1.11, wpb=496124, bsz=16372.8, num_updates=800, lr=0.0002, gnorm=1.15, clip=81, loss_scale=8, train_wall=89, gb_free=21.7, wall=743 epoch 001: 901 / 1689 loss=7.705, nll_loss=6.653, ppl=100.63, wps=547443, ups=1.1, wpb=496813, bsz=16599.5, num_updates=900, lr=0.000225, gnorm=1.067, clip=69, loss_scale=8, train_wall=89, gb_free=21.7, wall=833 epoch 001: 1001 / 1689 loss=7.419, nll_loss=6.322, ppl=80.03, wps=551555, ups=1.11, wpb=496035, bsz=16688, num_updates=1000, lr=0.00025, gnorm=1.032, clip=50, loss_scale=8, train_wall=88, gb_free=21.8, wall=923 begin validation on "valid" subset epoch 001 | valid on 'valid' subset | loss 7.126 | nll_loss 5.938 | ppl 61.29 | wps 0 | wpb 44526 | bsz 2008 | num_updates 1000 epoch 001: 1102 / 1689 loss=7.145, nll_loss=6.005, ppl=64.24, wps=458560, ups=0.93, wpb=494391, bsz=16510, num_updates=1100, lr=0.000275, gnorm=0.942, clip=32, loss_scale=8, train_wall=89, gb_free=19.6, wall=1031 epoch 001: 1202 / 1689 loss=6.863, nll_loss=5.681, ppl=51.29, wps=553430, ups=1.12, wpb=494853, bsz=16237.2, num_updates=1200, lr=0.0003, gnorm=0.876, clip=18, loss_scale=8, train_wall=88, gb_free=21.9, wall=1120 epoch 001: 1302 / 1689 loss=6.562, nll_loss=5.336, ppl=40.4, wps=544407, ups=1.1, wpb=493588, bsz=16373.8, num_updates=1300, lr=0.000325, gnorm=0.887, clip=26, loss_scale=8, train_wall=89, gb_free=21, wall=1211 epoch 001: 1403 / 1689 loss=6.25, nll_loss=4.978, ppl=31.53, wps=544127, ups=1.09, wpb=497354, bsz=16288.3, num_updates=1400, lr=0.00035, gnorm=0.859, clip=20, loss_scale=4, train_wall=90, gb_free=21.7, wall=1303 epoch 001: 1503 / 1689 loss=5.942, nll_loss=4.628, ppl=24.73, wps=547061, ups=1.11, wpb=494465, bsz=16539.4, num_updates=1500, lr=0.000375, gnorm=0.804, clip=11, loss_scale=4, train_wall=89, gb_free=22, wall=1393 epoch 001: 1603 / 1689 loss=5.68, nll_loss=4.331, ppl=20.13, wps=544762, ups=1.1, wpb=495818, bsz=16567.8, num_updates=1600, lr=0.0004, gnorm=0.741, clip=10, loss_scale=4, train_wall=89, gb_free=21.1, wall=1484 end of epoch 1 (average epoch stats below) epoch 001 | loss 7.981 | nll_loss 6.988 | ppl 126.94 | wps 541326 | ups 1.09 | wpb 495115 | bsz 16504.7 | num_updates 1686 | lr 0.0004215 | gnorm 1.241 | clip 57.5 | loss_scale 4 | train_wall 1506 | gb_free 20.9 | wall 1562 Start iterating over samples epoch 002: 14 / 1689 loss=5.49, nll_loss=4.117, ppl=17.36, wps=538882, ups=1.1, wpb=490777, bsz=16322.4, num_updates=1700, lr=0.000425, gnorm=0.676, clip=6, loss_scale=4, train_wall=89, gb_free=21.6, wall=1575 epoch 002: 14 / 1689 loss=5.49, nll_loss=4.117, ppl=17.36, wps=538882, ups=1.1, wpb=490777, bsz=16322.4, num_updates=1700, lr=0.000425, gnorm=0.676, clip=6, loss_scale=4, train_wall=89, gb_free=21.6, wall=1575 epoch 002: 114 / 1689 loss=5.309, nll_loss=3.915, ppl=15.09, wps=542796, ups=1.09, wpb=496688, bsz=16760.9, num_updates=1800, lr=0.00045, gnorm=0.596, clip=1, loss_scale=4, train_wall=90, gb_free=21.5, wall=1667 epoch 002: 114 / 1689 loss=5.309, nll_loss=3.915, ppl=15.09, wps=542796, ups=1.09, wpb=496688, bsz=16760.9, num_updates=1800, lr=0.00045, gnorm=0.596, clip=1, loss_scale=4, train_wall=90, gb_free=21.5, wall=1667 epoch 002: 214 / 1689 loss=5.205, nll_loss=3.8, ppl=13.93, wps=546651, ups=1.11, wpb=494526, bsz=16524.7, num_updates=1900, lr=0.000475, gnorm=0.623, clip=4, loss_scale=8, train_wall=89, gb_free=21.8, wall=1757 epoch 002: 214 / 1689 loss=5.205, nll_loss=3.8, ppl=13.93, wps=546651, ups=1.11, wpb=494526, bsz=16524.7, num_updates=1900, lr=0.000475, gnorm=0.623, clip=4, loss_scale=8, train_wall=89, gb_free=21.8, wall=1757 epoch 002: 314 / 1689 loss=5.065, nll_loss=3.645, ppl=12.51, wps=544912, ups=1.1, wpb=494140, bsz=16833.4, num_updates=2000, lr=0.0005, gnorm=0.521, clip=0, loss_scale=8, train_wall=89, gb_free=20.9, wall=1848 epoch 002: 314 / 1689 loss=5.065, nll_loss=3.645, ppl=12.51, wps=544912, ups=1.1, wpb=494140, bsz=16833.4, num_updates=2000, lr=0.0005, gnorm=0.521, clip=0, loss_scale=8, train_wall=89, gb_free=20.9, wall=1848 begin validation on "valid" subset epoch 002 | valid on 'valid' subset | loss 5.021 | nll_loss 3.483 | ppl 11.18 | wps 0 | wpb 44526 | bsz 2008 | num_updates 2000 | best_loss 5.021 epoch 002 | valid on 'valid' subset | loss 5.021 | nll_loss 3.483 | ppl 11.18 | wps 0 | wpb 44526 | bsz 2008 | num_updates 2000 | best_loss 5.021 epoch 002: 414 / 1689 loss=4.969, nll_loss=3.539, ppl=11.62, wps=452652, ups=0.91, wpb=495220, bsz=16290.6, num_updates=2100, lr=0.000525, gnorm=0.533, clip=1, loss_scale=8, train_wall=89, gb_free=21.7, wall=1957 epoch 002: 414 / 1689 loss=4.969, nll_loss=3.539, ppl=11.62, wps=452652, ups=0.91, wpb=495220, bsz=16290.6, num_updates=2100, lr=0.000525, gnorm=0.533, clip=1, loss_scale=8, train_wall=89, gb_free=21.7, wall=1957 epoch 002: 514 / 1689 loss=4.887, nll_loss=3.449, ppl=10.92, wps=550055, ups=1.11, wpb=494814, bsz=16546.7, num_updates=2200, lr=0.00055, gnorm=0.503, clip=0, loss_scale=8, train_wall=88, gb_free=21.8, wall=2047 epoch 002: 514 / 1689 loss=4.887, nll_loss=3.449, ppl=10.92, wps=550055, ups=1.11, wpb=494814, bsz=16546.7, num_updates=2200, lr=0.00055, gnorm=0.503, clip=0, loss_scale=8, train_wall=88, gb_free=21.8, wall=2047 epoch 002: 615 / 1689 loss=4.797, nll_loss=3.351, ppl=10.2, wps=548502, ups=1.11, wpb=495671, bsz=16609.1, num_updates=2300, lr=0.000575, gnorm=0.441, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=2137 epoch 002: 615 / 1689 loss=4.797, nll_loss=3.351, ppl=10.2, wps=548502, ups=1.11, wpb=495671, bsz=16609.1, num_updates=2300, lr=0.000575, gnorm=0.441, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=2137 epoch 002: 715 / 1689 loss=4.753, nll_loss=3.303, ppl=9.87, wps=555350, ups=1.12, wpb=495264, bsz=16162, num_updates=2400, lr=0.0006, gnorm=0.467, clip=1, loss_scale=4, train_wall=88, gb_free=21, wall=2227 epoch 002: 715 / 1689 loss=4.753, nll_loss=3.303, ppl=9.87, wps=555350, ups=1.12, wpb=495264, bsz=16162, num_updates=2400, lr=0.0006, gnorm=0.467, clip=1, loss_scale=4, train_wall=88, gb_free=21, wall=2227 epoch 002: 815 / 1689 loss=4.675, nll_loss=3.218, ppl=9.3, wps=548966, ups=1.11, wpb=494583, bsz=16776.5, num_updates=2500, lr=0.000625, gnorm=0.43, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=2317 epoch 002: 815 / 1689 loss=4.675, nll_loss=3.218, ppl=9.3, wps=548966, ups=1.11, wpb=494583, bsz=16776.5, num_updates=2500, lr=0.000625, gnorm=0.43, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=2317 epoch 002: 915 / 1689 loss=4.634, nll_loss=3.173, ppl=9.02, wps=549416, ups=1.11, wpb=495507, bsz=16601.8, num_updates=2600, lr=0.00065, gnorm=0.449, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=2407 epoch 002: 915 / 1689 loss=4.634, nll_loss=3.173, ppl=9.02, wps=549416, ups=1.11, wpb=495507, bsz=16601.8, num_updates=2600, lr=0.00065, gnorm=0.449, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=2407 epoch 002: 1015 / 1689 loss=4.582, nll_loss=3.116, ppl=8.67, wps=547890, ups=1.11, wpb=495244, bsz=16349.9, num_updates=2700, lr=0.000675, gnorm=0.427, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=2497 epoch 002: 1015 / 1689 loss=4.582, nll_loss=3.116, ppl=8.67, wps=547890, ups=1.11, wpb=495244, bsz=16349.9, num_updates=2700, lr=0.000675, gnorm=0.427, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=2497 epoch 002: 1115 / 1689 loss=4.546, nll_loss=3.078, ppl=8.44, wps=546581, ups=1.1, wpb=495113, bsz=16385.7, num_updates=2800, lr=0.0007, gnorm=0.419, clip=0, loss_scale=8, train_wall=89, gb_free=21.6, wall=2588 epoch 002: 1115 / 1689 loss=4.546, nll_loss=3.078, ppl=8.44, wps=546581, ups=1.1, wpb=495113, bsz=16385.7, num_updates=2800, lr=0.0007, gnorm=0.419, clip=0, loss_scale=8, train_wall=89, gb_free=21.6, wall=2588 epoch 002: 1216 / 1689 loss=4.508, nll_loss=3.037, ppl=8.21, wps=537554, ups=1.08, wpb=495908, bsz=16500.9, num_updates=2900, lr=0.000725, gnorm=0.396, clip=0, loss_scale=4, train_wall=91, gb_free=21.5, wall=2680 epoch 002: 1216 / 1689 loss=4.508, nll_loss=3.037, ppl=8.21, wps=537554, ups=1.08, wpb=495908, bsz=16500.9, num_updates=2900, lr=0.000725, gnorm=0.396, clip=0, loss_scale=4, train_wall=91, gb_free=21.5, wall=2680 epoch 002: 1316 / 1689 loss=4.471, nll_loss=2.997, ppl=7.98, wps=551352, ups=1.11, wpb=496192, bsz=16642.6, num_updates=3000, lr=0.00075, gnorm=0.391, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=2770 epoch 002: 1316 / 1689 loss=4.471, nll_loss=2.997, ppl=7.98, wps=551352, ups=1.11, wpb=496192, bsz=16642.6, num_updates=3000, lr=0.00075, gnorm=0.391, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=2770 begin validation on "valid" subset epoch 002 | valid on 'valid' subset | loss 4.422 | nll_loss 2.846 | ppl 7.19 | wps 0 | wpb 44526 | bsz 2008 | num_updates 3000 | best_loss 4.422 epoch 002 | valid on 'valid' subset | loss 4.422 | nll_loss 2.846 | ppl 7.19 | wps 0 | wpb 44526 | bsz 2008 | num_updates 3000 | best_loss 4.422 epoch 002: 1416 / 1689 loss=4.439, nll_loss=2.961, ppl=7.79, wps=448842, ups=0.91, wpb=494703, bsz=16277.2, num_updates=3100, lr=0.000775, gnorm=0.4, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=2880 epoch 002: 1416 / 1689 loss=4.439, nll_loss=2.961, ppl=7.79, wps=448842, ups=0.91, wpb=494703, bsz=16277.2, num_updates=3100, lr=0.000775, gnorm=0.4, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=2880 epoch 002: 1516 / 1689 loss=4.42, nll_loss=2.941, ppl=7.68, wps=553938, ups=1.12, wpb=495594, bsz=16465.1, num_updates=3200, lr=0.0008, gnorm=0.389, clip=0, loss_scale=4, train_wall=88, gb_free=22.9, wall=2970 epoch 002: 1516 / 1689 loss=4.42, nll_loss=2.941, ppl=7.68, wps=553938, ups=1.12, wpb=495594, bsz=16465.1, num_updates=3200, lr=0.0008, gnorm=0.389, clip=0, loss_scale=4, train_wall=88, gb_free=22.9, wall=2970 epoch 002: 1616 / 1689 loss=4.39, nll_loss=2.91, ppl=7.52, wps=546279, ups=1.1, wpb=497131, bsz=16340.2, num_updates=3300, lr=0.000825, gnorm=0.385, clip=0, loss_scale=4, train_wall=89, gb_free=22.2, wall=3061 epoch 002: 1616 / 1689 loss=4.39, nll_loss=2.91, ppl=7.52, wps=546279, ups=1.1, wpb=497131, bsz=16340.2, num_updates=3300, lr=0.000825, gnorm=0.385, clip=0, loss_scale=4, train_wall=89, gb_free=22.2, wall=3061 end of epoch 2 (average epoch stats below) epoch 002 | loss 4.719 | nll_loss 3.267 | ppl 9.63 | wps 533898 | ups 1.08 | wpb 495122 | bsz 16504.6 | num_updates 3373 | lr 0.00084325 | gnorm 0.46 | clip 0.4 | loss_scale 4 | train_wall 1499 | gb_free 25.6 | wall 3126 epoch 002 | loss 4.719 | nll_loss 3.267 | ppl 9.63 | wps 533898 | ups 1.08 | wpb 495122 | bsz 16504.6 | num_updates 3373 | lr 0.00084325 | gnorm 0.46 | clip 0.4 | loss_scale 4 | train_wall 1499 | gb_free 25.6 | wall 3126 Start iterating over samples epoch 003: 27 / 1689 loss=4.383, nll_loss=2.902, ppl=7.47, wps=543332, ups=1.11, wpb=490484, bsz=16481, num_updates=3400, lr=0.00085, gnorm=0.4, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=3151 epoch 003: 27 / 1689 loss=4.383, nll_loss=2.902, ppl=7.47, wps=543332, ups=1.11, wpb=490484, bsz=16481, num_updates=3400, lr=0.00085, gnorm=0.4, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=3151 epoch 003: 27 / 1689 loss=4.383, nll_loss=2.902, ppl=7.47, wps=543332, ups=1.11, wpb=490484, bsz=16481, num_updates=3400, lr=0.00085, gnorm=0.4, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=3151 epoch 003: 128 / 1689 loss=4.342, nll_loss=2.858, ppl=7.25, wps=542660, ups=1.1, wpb=495054, bsz=16812.7, num_updates=3500, lr=0.000875, gnorm=0.375, clip=0, loss_scale=4, train_wall=90, gb_free=21.7, wall=3242 epoch 003: 128 / 1689 loss=4.342, nll_loss=2.858, ppl=7.25, wps=542660, ups=1.1, wpb=495054, bsz=16812.7, num_updates=3500, lr=0.000875, gnorm=0.375, clip=0, loss_scale=4, train_wall=90, gb_free=21.7, wall=3242 epoch 003: 128 / 1689 loss=4.342, nll_loss=2.858, ppl=7.25, wps=542660, ups=1.1, wpb=495054, bsz=16812.7, num_updates=3500, lr=0.000875, gnorm=0.375, clip=0, loss_scale=4, train_wall=90, gb_free=21.7, wall=3242 epoch 003: 228 / 1689 loss=4.335, nll_loss=2.85, ppl=7.21, wps=545175, ups=1.1, wpb=495438, bsz=16848.2, num_updates=3600, lr=0.0009, gnorm=0.38, clip=0, loss_scale=4, train_wall=89, gb_free=22.8, wall=3333 epoch 003: 228 / 1689 loss=4.335, nll_loss=2.85, ppl=7.21, wps=545175, ups=1.1, wpb=495438, bsz=16848.2, num_updates=3600, lr=0.0009, gnorm=0.38, clip=0, loss_scale=4, train_wall=89, gb_free=22.8, wall=3333 epoch 003: 228 / 1689 loss=4.335, nll_loss=2.85, ppl=7.21, wps=545175, ups=1.1, wpb=495438, bsz=16848.2, num_updates=3600, lr=0.0009, gnorm=0.38, clip=0, loss_scale=4, train_wall=89, gb_free=22.8, wall=3333 epoch 003: 328 / 1689 loss=4.321, nll_loss=2.835, ppl=7.14, wps=543036, ups=1.1, wpb=494814, bsz=16504.7, num_updates=3700, lr=0.000925, gnorm=0.38, clip=0, loss_scale=4, train_wall=89, gb_free=21.3, wall=3424 epoch 003: 328 / 1689 loss=4.321, nll_loss=2.835, ppl=7.14, wps=543036, ups=1.1, wpb=494814, bsz=16504.7, num_updates=3700, lr=0.000925, gnorm=0.38, clip=0, loss_scale=4, train_wall=89, gb_free=21.3, wall=3424 epoch 003: 328 / 1689 loss=4.321, nll_loss=2.835, ppl=7.14, wps=543036, ups=1.1, wpb=494814, bsz=16504.7, num_updates=3700, lr=0.000925, gnorm=0.38, clip=0, loss_scale=4, train_wall=89, gb_free=21.3, wall=3424 epoch 003: 428 / 1689 loss=4.313, nll_loss=2.827, ppl=7.1, wps=547688, ups=1.11, wpb=494677, bsz=16399.4, num_updates=3800, lr=0.00095, gnorm=0.385, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=3515 epoch 003: 428 / 1689 loss=4.313, nll_loss=2.827, ppl=7.1, wps=547688, ups=1.11, wpb=494677, bsz=16399.4, num_updates=3800, lr=0.00095, gnorm=0.385, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=3515 epoch 003: 428 / 1689 loss=4.313, nll_loss=2.827, ppl=7.1, wps=547688, ups=1.11, wpb=494677, bsz=16399.4, num_updates=3800, lr=0.00095, gnorm=0.385, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=3515 epoch 003: 528 / 1689 loss=4.293, nll_loss=2.805, ppl=6.99, wps=550150, ups=1.11, wpb=496106, bsz=16563.3, num_updates=3900, lr=0.000975, gnorm=0.385, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=3605 epoch 003: 528 / 1689 loss=4.293, nll_loss=2.805, ppl=6.99, wps=550150, ups=1.11, wpb=496106, bsz=16563.3, num_updates=3900, lr=0.000975, gnorm=0.385, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=3605 epoch 003: 528 / 1689 loss=4.293, nll_loss=2.805, ppl=6.99, wps=550150, ups=1.11, wpb=496106, bsz=16563.3, num_updates=3900, lr=0.000975, gnorm=0.385, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=3605 epoch 003: 628 / 1689 loss=4.283, nll_loss=2.795, ppl=6.94, wps=552866, ups=1.11, wpb=495915, bsz=16475.4, num_updates=4000, lr=0.001, gnorm=0.372, clip=0, loss_scale=8, train_wall=88, gb_free=22, wall=3695 epoch 003: 628 / 1689 loss=4.283, nll_loss=2.795, ppl=6.94, wps=552866, ups=1.11, wpb=495915, bsz=16475.4, num_updates=4000, lr=0.001, gnorm=0.372, clip=0, loss_scale=8, train_wall=88, gb_free=22, wall=3695 epoch 003: 628 / 1689 loss=4.283, nll_loss=2.795, ppl=6.94, wps=552866, ups=1.11, wpb=495915, bsz=16475.4, num_updates=4000, lr=0.001, gnorm=0.372, clip=0, loss_scale=8, train_wall=88, gb_free=22, wall=3695 begin validation on "valid" subset epoch 003 | valid on 'valid' subset | loss 4.291 | nll_loss 2.71 | ppl 6.54 | wps 0 | wpb 44526 | bsz 2008 | num_updates 4000 | best_loss 4.291 epoch 003 | valid on 'valid' subset | loss 4.291 | nll_loss 2.71 | ppl 6.54 | wps 0 | wpb 44526 | bsz 2008 | num_updates 4000 | best_loss 4.291 epoch 003 | valid on 'valid' subset | loss 4.291 | nll_loss 2.71 | ppl 6.54 | wps 0 | wpb 44526 | bsz 2008 | num_updates 4000 | best_loss 4.291 epoch 003: 729 / 1689 loss=4.278, nll_loss=2.789, ppl=6.91, wps=455573, ups=0.92, wpb=496074, bsz=16277.8, num_updates=4100, lr=0.00098773, gnorm=0.369, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=3803 epoch 003: 729 / 1689 loss=4.278, nll_loss=2.789, ppl=6.91, wps=455573, ups=0.92, wpb=496074, bsz=16277.8, num_updates=4100, lr=0.00098773, gnorm=0.369, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=3803 epoch 003: 729 / 1689 loss=4.278, nll_loss=2.789, ppl=6.91, wps=455573, ups=0.92, wpb=496074, bsz=16277.8, num_updates=4100, lr=0.00098773, gnorm=0.369, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=3803 epoch 003: 829 / 1689 loss=4.264, nll_loss=2.775, ppl=6.84, wps=546132, ups=1.11, wpb=493044, bsz=16311.8, num_updates=4200, lr=0.0009759, gnorm=0.362, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=3894 epoch 003: 829 / 1689 loss=4.264, nll_loss=2.775, ppl=6.84, wps=546132, ups=1.11, wpb=493044, bsz=16311.8, num_updates=4200, lr=0.0009759, gnorm=0.362, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=3894 epoch 003: 829 / 1689 loss=4.264, nll_loss=2.775, ppl=6.84, wps=546132, ups=1.11, wpb=493044, bsz=16311.8, num_updates=4200, lr=0.0009759, gnorm=0.362, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=3894 epoch 003: 929 / 1689 loss=4.248, nll_loss=2.757, ppl=6.76, wps=545829, ups=1.1, wpb=494194, bsz=16361.3, num_updates=4300, lr=0.000964486, gnorm=0.351, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=3984 epoch 003: 929 / 1689 loss=4.248, nll_loss=2.757, ppl=6.76, wps=545829, ups=1.1, wpb=494194, bsz=16361.3, num_updates=4300, lr=0.000964486, gnorm=0.351, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=3984 epoch 003: 929 / 1689 loss=4.248, nll_loss=2.757, ppl=6.76, wps=545829, ups=1.1, wpb=494194, bsz=16361.3, num_updates=4300, lr=0.000964486, gnorm=0.351, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=3984 epoch 003: 1029 / 1689 loss=4.239, nll_loss=2.748, ppl=6.72, wps=551020, ups=1.11, wpb=496554, bsz=16440.3, num_updates=4400, lr=0.000953463, gnorm=0.346, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=4074 epoch 003: 1029 / 1689 loss=4.239, nll_loss=2.748, ppl=6.72, wps=551020, ups=1.11, wpb=496554, bsz=16440.3, num_updates=4400, lr=0.000953463, gnorm=0.346, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=4074 epoch 003: 1029 / 1689 loss=4.239, nll_loss=2.748, ppl=6.72, wps=551020, ups=1.11, wpb=496554, bsz=16440.3, num_updates=4400, lr=0.000953463, gnorm=0.346, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=4074 epoch 003: 1129 / 1689 loss=4.215, nll_loss=2.722, ppl=6.6, wps=541243, ups=1.09, wpb=496113, bsz=16938.9, num_updates=4500, lr=0.000942809, gnorm=0.338, clip=0, loss_scale=4, train_wall=90, gb_free=22, wall=4166 epoch 003: 1129 / 1689 loss=4.215, nll_loss=2.722, ppl=6.6, wps=541243, ups=1.09, wpb=496113, bsz=16938.9, num_updates=4500, lr=0.000942809, gnorm=0.338, clip=0, loss_scale=4, train_wall=90, gb_free=22, wall=4166 epoch 003: 1129 / 1689 loss=4.215, nll_loss=2.722, ppl=6.6, wps=541243, ups=1.09, wpb=496113, bsz=16938.9, num_updates=4500, lr=0.000942809, gnorm=0.338, clip=0, loss_scale=4, train_wall=90, gb_free=22, wall=4166 epoch 003: 1229 / 1689 loss=4.211, nll_loss=2.718, ppl=6.58, wps=547713, ups=1.11, wpb=494048, bsz=16474.2, num_updates=4600, lr=0.000932505, gnorm=0.341, clip=0, loss_scale=8, train_wall=88, gb_free=21.9, wall=4256 epoch 003: 1229 / 1689 loss=4.211, nll_loss=2.718, ppl=6.58, wps=547713, ups=1.11, wpb=494048, bsz=16474.2, num_updates=4600, lr=0.000932505, gnorm=0.341, clip=0, loss_scale=8, train_wall=88, gb_free=21.9, wall=4256 epoch 003: 1229 / 1689 loss=4.211, nll_loss=2.718, ppl=6.58, wps=547713, ups=1.11, wpb=494048, bsz=16474.2, num_updates=4600, lr=0.000932505, gnorm=0.341, clip=0, loss_scale=8, train_wall=88, gb_free=21.9, wall=4256 epoch 003: 1329 / 1689 loss=4.198, nll_loss=2.704, ppl=6.52, wps=551914, ups=1.11, wpb=495822, bsz=16321, num_updates=4700, lr=0.000922531, gnorm=0.327, clip=0, loss_scale=8, train_wall=88, gb_free=21.4, wall=4346 epoch 003: 1329 / 1689 loss=4.198, nll_loss=2.704, ppl=6.52, wps=551914, ups=1.11, wpb=495822, bsz=16321, num_updates=4700, lr=0.000922531, gnorm=0.327, clip=0, loss_scale=8, train_wall=88, gb_free=21.4, wall=4346 epoch 003: 1329 / 1689 loss=4.198, nll_loss=2.704, ppl=6.52, wps=551914, ups=1.11, wpb=495822, bsz=16321, num_updates=4700, lr=0.000922531, gnorm=0.327, clip=0, loss_scale=8, train_wall=88, gb_free=21.4, wall=4346 epoch 003: 1430 / 1689 loss=4.183, nll_loss=2.688, ppl=6.44, wps=549779, ups=1.11, wpb=496349, bsz=16693, num_updates=4800, lr=0.000912871, gnorm=0.322, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=4436 epoch 003: 1430 / 1689 loss=4.183, nll_loss=2.688, ppl=6.44, wps=549779, ups=1.11, wpb=496349, bsz=16693, num_updates=4800, lr=0.000912871, gnorm=0.322, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=4436 epoch 003: 1430 / 1689 loss=4.183, nll_loss=2.688, ppl=6.44, wps=549779, ups=1.11, wpb=496349, bsz=16693, num_updates=4800, lr=0.000912871, gnorm=0.322, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=4436 epoch 003: 1530 / 1689 loss=4.172, nll_loss=2.676, ppl=6.39, wps=551674, ups=1.11, wpb=495935, bsz=16441.5, num_updates=4900, lr=0.000903508, gnorm=0.329, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=4526 epoch 003: 1530 / 1689 loss=4.172, nll_loss=2.676, ppl=6.39, wps=551674, ups=1.11, wpb=495935, bsz=16441.5, num_updates=4900, lr=0.000903508, gnorm=0.329, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=4526 epoch 003: 1530 / 1689 loss=4.172, nll_loss=2.676, ppl=6.39, wps=551674, ups=1.11, wpb=495935, bsz=16441.5, num_updates=4900, lr=0.000903508, gnorm=0.329, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=4526 epoch 003: 1630 / 1689 loss=4.167, nll_loss=2.671, ppl=6.37, wps=548968, ups=1.11, wpb=495488, bsz=16111.9, num_updates=5000, lr=0.000894427, gnorm=0.317, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=4616 epoch 003: 1630 / 1689 loss=4.167, nll_loss=2.671, ppl=6.37, wps=548968, ups=1.11, wpb=495488, bsz=16111.9, num_updates=5000, lr=0.000894427, gnorm=0.317, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=4616 epoch 003: 1630 / 1689 loss=4.167, nll_loss=2.671, ppl=6.37, wps=548968, ups=1.11, wpb=495488, bsz=16111.9, num_updates=5000, lr=0.000894427, gnorm=0.317, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=4616 begin validation on "valid" subset epoch 003 | valid on 'valid' subset | loss 4.104 | nll_loss 2.519 | ppl 5.73 | wps 0 | wpb 44526 | bsz 2008 | num_updates 5000 | best_loss 4.104 epoch 003 | valid on 'valid' subset | loss 4.104 | nll_loss 2.519 | ppl 5.73 | wps 0 | wpb 44526 | bsz 2008 | num_updates 5000 | best_loss 4.104 epoch 003 | valid on 'valid' subset | loss 4.104 | nll_loss 2.519 | ppl 5.73 | wps 0 | wpb 44526 | bsz 2008 | num_updates 5000 | best_loss 4.104 end of epoch 3 (average epoch stats below) epoch 003 | loss 4.253 | nll_loss 2.762 | ppl 6.78 | wps 526900 | ups 1.06 | wpb 495115 | bsz 16502.1 | num_updates 5059 | lr 0.000889196 | gnorm 0.355 | clip 0 | loss_scale 4 | train_wall 1513 | gb_free 23.2 | wall 4710 epoch 003 | loss 4.253 | nll_loss 2.762 | ppl 6.78 | wps 526900 | ups 1.06 | wpb 495115 | bsz 16502.1 | num_updates 5059 | lr 0.000889196 | gnorm 0.355 | clip 0 | loss_scale 4 | train_wall 1513 | gb_free 23.2 | wall 4710 epoch 003 | loss 4.253 | nll_loss 2.762 | ppl 6.78 | wps 526900 | ups 1.06 | wpb 495115 | bsz 16502.1 | num_updates 5059 | lr 0.000889196 | gnorm 0.355 | clip 0 | loss_scale 4 | train_wall 1513 | gb_free 23.2 | wall 4710 Start iterating over samples epoch 004: 41 / 1689 loss=4.146, nll_loss=2.647, ppl=6.26, wps=373444, ups=0.76, wpb=491687, bsz=16828.6, num_updates=5100, lr=0.000885615, gnorm=0.318, clip=0, loss_scale=4, train_wall=105, gb_free=21.7, wall=4748 epoch 004: 41 / 1689 loss=4.146, nll_loss=2.647, ppl=6.26, wps=373444, ups=0.76, wpb=491687, bsz=16828.6, num_updates=5100, lr=0.000885615, gnorm=0.318, clip=0, loss_scale=4, train_wall=105, gb_free=21.7, wall=4748 epoch 004: 41 / 1689 loss=4.146, nll_loss=2.647, ppl=6.26, wps=373444, ups=0.76, wpb=491687, bsz=16828.6, num_updates=5100, lr=0.000885615, gnorm=0.318, clip=0, loss_scale=4, train_wall=105, gb_free=21.7, wall=4748 epoch 004: 41 / 1689 loss=4.146, nll_loss=2.647, ppl=6.26, wps=373444, ups=0.76, wpb=491687, bsz=16828.6, num_updates=5100, lr=0.000885615, gnorm=0.318, clip=0, loss_scale=4, train_wall=105, gb_free=21.7, wall=4748 epoch 004: 141 / 1689 loss=4.137, nll_loss=2.638, ppl=6.22, wps=549532, ups=1.11, wpb=496024, bsz=16396.1, num_updates=5200, lr=0.000877058, gnorm=0.313, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=4838 epoch 004: 141 / 1689 loss=4.137, nll_loss=2.638, ppl=6.22, wps=549532, ups=1.11, wpb=496024, bsz=16396.1, num_updates=5200, lr=0.000877058, gnorm=0.313, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=4838 epoch 004: 141 / 1689 loss=4.137, nll_loss=2.638, ppl=6.22, wps=549532, ups=1.11, wpb=496024, bsz=16396.1, num_updates=5200, lr=0.000877058, gnorm=0.313, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=4838 epoch 004: 141 / 1689 loss=4.137, nll_loss=2.638, ppl=6.22, wps=549532, ups=1.11, wpb=496024, bsz=16396.1, num_updates=5200, lr=0.000877058, gnorm=0.313, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=4838 epoch 004: 242 / 1689 loss=4.127, nll_loss=2.626, ppl=6.17, wps=539949, ups=1.09, wpb=496338, bsz=16307.9, num_updates=5300, lr=0.000868744, gnorm=0.305, clip=0, loss_scale=2, train_wall=90, gb_free=21.8, wall=4930 epoch 004: 242 / 1689 loss=4.127, nll_loss=2.626, ppl=6.17, wps=539949, ups=1.09, wpb=496338, bsz=16307.9, num_updates=5300, lr=0.000868744, gnorm=0.305, clip=0, loss_scale=2, train_wall=90, gb_free=21.8, wall=4930 epoch 004: 242 / 1689 loss=4.127, nll_loss=2.626, ppl=6.17, wps=539949, ups=1.09, wpb=496338, bsz=16307.9, num_updates=5300, lr=0.000868744, gnorm=0.305, clip=0, loss_scale=2, train_wall=90, gb_free=21.8, wall=4930 epoch 004: 242 / 1689 loss=4.127, nll_loss=2.626, ppl=6.17, wps=539949, ups=1.09, wpb=496338, bsz=16307.9, num_updates=5300, lr=0.000868744, gnorm=0.305, clip=0, loss_scale=2, train_wall=90, gb_free=21.8, wall=4930 epoch 004: 342 / 1689 loss=4.118, nll_loss=2.618, ppl=6.14, wps=543228, ups=1.1, wpb=493975, bsz=16735.7, num_updates=5400, lr=0.000860663, gnorm=0.315, clip=0, loss_scale=2, train_wall=90, gb_free=21.8, wall=5021 epoch 004: 342 / 1689 loss=4.118, nll_loss=2.618, ppl=6.14, wps=543228, ups=1.1, wpb=493975, bsz=16735.7, num_updates=5400, lr=0.000860663, gnorm=0.315, clip=0, loss_scale=2, train_wall=90, gb_free=21.8, wall=5021 epoch 004: 342 / 1689 loss=4.118, nll_loss=2.618, ppl=6.14, wps=543228, ups=1.1, wpb=493975, bsz=16735.7, num_updates=5400, lr=0.000860663, gnorm=0.315, clip=0, loss_scale=2, train_wall=90, gb_free=21.8, wall=5021 epoch 004: 342 / 1689 loss=4.118, nll_loss=2.618, ppl=6.14, wps=543228, ups=1.1, wpb=493975, bsz=16735.7, num_updates=5400, lr=0.000860663, gnorm=0.315, clip=0, loss_scale=2, train_wall=90, gb_free=21.8, wall=5021 epoch 004: 442 / 1689 loss=4.104, nll_loss=2.603, ppl=6.07, wps=552132, ups=1.11, wpb=496824, bsz=16678.8, num_updates=5500, lr=0.000852803, gnorm=0.305, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=5111 epoch 004: 442 / 1689 loss=4.104, nll_loss=2.603, ppl=6.07, wps=552132, ups=1.11, wpb=496824, bsz=16678.8, num_updates=5500, lr=0.000852803, gnorm=0.305, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=5111 epoch 004: 442 / 1689 loss=4.104, nll_loss=2.603, ppl=6.07, wps=552132, ups=1.11, wpb=496824, bsz=16678.8, num_updates=5500, lr=0.000852803, gnorm=0.305, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=5111 epoch 004: 442 / 1689 loss=4.104, nll_loss=2.603, ppl=6.07, wps=552132, ups=1.11, wpb=496824, bsz=16678.8, num_updates=5500, lr=0.000852803, gnorm=0.305, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=5111 epoch 004: 542 / 1689 loss=4.115, nll_loss=2.615, ppl=6.13, wps=550290, ups=1.11, wpb=495916, bsz=16423.7, num_updates=5600, lr=0.000845154, gnorm=0.299, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=5201 epoch 004: 542 / 1689 loss=4.115, nll_loss=2.615, ppl=6.13, wps=550290, ups=1.11, wpb=495916, bsz=16423.7, num_updates=5600, lr=0.000845154, gnorm=0.299, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=5201 epoch 004: 542 / 1689 loss=4.115, nll_loss=2.615, ppl=6.13, wps=550290, ups=1.11, wpb=495916, bsz=16423.7, num_updates=5600, lr=0.000845154, gnorm=0.299, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=5201 epoch 004: 542 / 1689 loss=4.115, nll_loss=2.615, ppl=6.13, wps=550290, ups=1.11, wpb=495916, bsz=16423.7, num_updates=5600, lr=0.000845154, gnorm=0.299, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=5201 epoch 004: 642 / 1689 loss=4.1, nll_loss=2.599, ppl=6.06, wps=550809, ups=1.11, wpb=494379, bsz=16563, num_updates=5700, lr=0.000837708, gnorm=0.303, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=5291 epoch 004: 642 / 1689 loss=4.1, nll_loss=2.599, ppl=6.06, wps=550809, ups=1.11, wpb=494379, bsz=16563, num_updates=5700, lr=0.000837708, gnorm=0.303, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=5291 epoch 004: 642 / 1689 loss=4.1, nll_loss=2.599, ppl=6.06, wps=550809, ups=1.11, wpb=494379, bsz=16563, num_updates=5700, lr=0.000837708, gnorm=0.303, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=5291 epoch 004: 642 / 1689 loss=4.1, nll_loss=2.599, ppl=6.06, wps=550809, ups=1.11, wpb=494379, bsz=16563, num_updates=5700, lr=0.000837708, gnorm=0.303, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=5291 epoch 004: 742 / 1689 loss=4.094, nll_loss=2.592, ppl=6.03, wps=551567, ups=1.11, wpb=495835, bsz=16531, num_updates=5800, lr=0.000830455, gnorm=0.293, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=5381 epoch 004: 742 / 1689 loss=4.094, nll_loss=2.592, ppl=6.03, wps=551567, ups=1.11, wpb=495835, bsz=16531, num_updates=5800, lr=0.000830455, gnorm=0.293, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=5381 epoch 004: 742 / 1689 loss=4.094, nll_loss=2.592, ppl=6.03, wps=551567, ups=1.11, wpb=495835, bsz=16531, num_updates=5800, lr=0.000830455, gnorm=0.293, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=5381 epoch 004: 742 / 1689 loss=4.094, nll_loss=2.592, ppl=6.03, wps=551567, ups=1.11, wpb=495835, bsz=16531, num_updates=5800, lr=0.000830455, gnorm=0.293, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=5381 epoch 004: 842 / 1689 loss=4.099, nll_loss=2.598, ppl=6.05, wps=549786, ups=1.11, wpb=495047, bsz=16309.8, num_updates=5900, lr=0.000823387, gnorm=0.292, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=5471 epoch 004: 842 / 1689 loss=4.099, nll_loss=2.598, ppl=6.05, wps=549786, ups=1.11, wpb=495047, bsz=16309.8, num_updates=5900, lr=0.000823387, gnorm=0.292, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=5471 epoch 004: 842 / 1689 loss=4.099, nll_loss=2.598, ppl=6.05, wps=549786, ups=1.11, wpb=495047, bsz=16309.8, num_updates=5900, lr=0.000823387, gnorm=0.292, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=5471 epoch 004: 842 / 1689 loss=4.099, nll_loss=2.598, ppl=6.05, wps=549786, ups=1.11, wpb=495047, bsz=16309.8, num_updates=5900, lr=0.000823387, gnorm=0.292, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=5471 epoch 004: 942 / 1689 loss=4.079, nll_loss=2.576, ppl=5.96, wps=551033, ups=1.11, wpb=495867, bsz=16303.1, num_updates=6000, lr=0.000816497, gnorm=0.301, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=5561 epoch 004: 942 / 1689 loss=4.079, nll_loss=2.576, ppl=5.96, wps=551033, ups=1.11, wpb=495867, bsz=16303.1, num_updates=6000, lr=0.000816497, gnorm=0.301, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=5561 epoch 004: 942 / 1689 loss=4.079, nll_loss=2.576, ppl=5.96, wps=551033, ups=1.11, wpb=495867, bsz=16303.1, num_updates=6000, lr=0.000816497, gnorm=0.301, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=5561 epoch 004: 942 / 1689 loss=4.079, nll_loss=2.576, ppl=5.96, wps=551033, ups=1.11, wpb=495867, bsz=16303.1, num_updates=6000, lr=0.000816497, gnorm=0.301, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=5561 begin validation on "valid" subset epoch 004 | valid on 'valid' subset | loss 4.019 | nll_loss 2.432 | ppl 5.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 6000 | best_loss 4.019 epoch 004 | valid on 'valid' subset | loss 4.019 | nll_loss 2.432 | ppl 5.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 6000 | best_loss 4.019 epoch 004 | valid on 'valid' subset | loss 4.019 | nll_loss 2.432 | ppl 5.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 6000 | best_loss 4.019 epoch 004 | valid on 'valid' subset | loss 4.019 | nll_loss 2.432 | ppl 5.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 6000 | best_loss 4.019 epoch 004: 1042 / 1689 loss=4.072, nll_loss=2.569, ppl=5.93, wps=423538, ups=0.86, wpb=495105, bsz=16619.3, num_updates=6100, lr=0.000809776, gnorm=0.285, clip=0, loss_scale=4, train_wall=89, gb_free=20.9, wall=5678 epoch 004: 1042 / 1689 loss=4.072, nll_loss=2.569, ppl=5.93, wps=423538, ups=0.86, wpb=495105, bsz=16619.3, num_updates=6100, lr=0.000809776, gnorm=0.285, clip=0, loss_scale=4, train_wall=89, gb_free=20.9, wall=5678 epoch 004: 1042 / 1689 loss=4.072, nll_loss=2.569, ppl=5.93, wps=423538, ups=0.86, wpb=495105, bsz=16619.3, num_updates=6100, lr=0.000809776, gnorm=0.285, clip=0, loss_scale=4, train_wall=89, gb_free=20.9, wall=5678 epoch 004: 1042 / 1689 loss=4.072, nll_loss=2.569, ppl=5.93, wps=423538, ups=0.86, wpb=495105, bsz=16619.3, num_updates=6100, lr=0.000809776, gnorm=0.285, clip=0, loss_scale=4, train_wall=89, gb_free=20.9, wall=5678 epoch 004: 1142 / 1689 loss=4.072, nll_loss=2.569, ppl=5.94, wps=553965, ups=1.12, wpb=494956, bsz=16745.6, num_updates=6200, lr=0.000803219, gnorm=0.286, clip=0, loss_scale=4, train_wall=89, gb_free=20.4, wall=5767 epoch 004: 1142 / 1689 loss=4.072, nll_loss=2.569, ppl=5.94, wps=553965, ups=1.12, wpb=494956, bsz=16745.6, num_updates=6200, lr=0.000803219, gnorm=0.286, clip=0, loss_scale=4, train_wall=89, gb_free=20.4, wall=5767 epoch 004: 1142 / 1689 loss=4.072, nll_loss=2.569, ppl=5.94, wps=553965, ups=1.12, wpb=494956, bsz=16745.6, num_updates=6200, lr=0.000803219, gnorm=0.286, clip=0, loss_scale=4, train_wall=89, gb_free=20.4, wall=5767 epoch 004: 1142 / 1689 loss=4.072, nll_loss=2.569, ppl=5.94, wps=553965, ups=1.12, wpb=494956, bsz=16745.6, num_updates=6200, lr=0.000803219, gnorm=0.286, clip=0, loss_scale=4, train_wall=89, gb_free=20.4, wall=5767 epoch 004: 1242 / 1689 loss=4.066, nll_loss=2.563, ppl=5.91, wps=551410, ups=1.11, wpb=494927, bsz=16264.9, num_updates=6300, lr=0.000796819, gnorm=0.282, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=5857 epoch 004: 1242 / 1689 loss=4.066, nll_loss=2.563, ppl=5.91, wps=551410, ups=1.11, wpb=494927, bsz=16264.9, num_updates=6300, lr=0.000796819, gnorm=0.282, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=5857 epoch 004: 1242 / 1689 loss=4.066, nll_loss=2.563, ppl=5.91, wps=551410, ups=1.11, wpb=494927, bsz=16264.9, num_updates=6300, lr=0.000796819, gnorm=0.282, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=5857 epoch 004: 1242 / 1689 loss=4.066, nll_loss=2.563, ppl=5.91, wps=551410, ups=1.11, wpb=494927, bsz=16264.9, num_updates=6300, lr=0.000796819, gnorm=0.282, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=5857 epoch 004: 1343 / 1689 loss=4.062, nll_loss=2.558, ppl=5.89, wps=541800, ups=1.1, wpb=494640, bsz=16639.3, num_updates=6400, lr=0.000790569, gnorm=0.277, clip=0, loss_scale=4, train_wall=91, gb_free=21.6, wall=5948 epoch 004: 1343 / 1689 loss=4.062, nll_loss=2.558, ppl=5.89, wps=541800, ups=1.1, wpb=494640, bsz=16639.3, num_updates=6400, lr=0.000790569, gnorm=0.277, clip=0, loss_scale=4, train_wall=91, gb_free=21.6, wall=5948 epoch 004: 1343 / 1689 loss=4.062, nll_loss=2.558, ppl=5.89, wps=541800, ups=1.1, wpb=494640, bsz=16639.3, num_updates=6400, lr=0.000790569, gnorm=0.277, clip=0, loss_scale=4, train_wall=91, gb_free=21.6, wall=5948 epoch 004: 1343 / 1689 loss=4.062, nll_loss=2.558, ppl=5.89, wps=541800, ups=1.1, wpb=494640, bsz=16639.3, num_updates=6400, lr=0.000790569, gnorm=0.277, clip=0, loss_scale=4, train_wall=91, gb_free=21.6, wall=5948 epoch 004: 1443 / 1689 loss=4.055, nll_loss=2.551, ppl=5.86, wps=545970, ups=1.1, wpb=494315, bsz=16775.5, num_updates=6500, lr=0.000784465, gnorm=0.296, clip=0, loss_scale=4, train_wall=90, gb_free=22.6, wall=6039 epoch 004: 1443 / 1689 loss=4.055, nll_loss=2.551, ppl=5.86, wps=545970, ups=1.1, wpb=494315, bsz=16775.5, num_updates=6500, lr=0.000784465, gnorm=0.296, clip=0, loss_scale=4, train_wall=90, gb_free=22.6, wall=6039 epoch 004: 1443 / 1689 loss=4.055, nll_loss=2.551, ppl=5.86, wps=545970, ups=1.1, wpb=494315, bsz=16775.5, num_updates=6500, lr=0.000784465, gnorm=0.296, clip=0, loss_scale=4, train_wall=90, gb_free=22.6, wall=6039 epoch 004: 1443 / 1689 loss=4.055, nll_loss=2.551, ppl=5.86, wps=545970, ups=1.1, wpb=494315, bsz=16775.5, num_updates=6500, lr=0.000784465, gnorm=0.296, clip=0, loss_scale=4, train_wall=90, gb_free=22.6, wall=6039 epoch 004: 1543 / 1689 loss=4.051, nll_loss=2.546, ppl=5.84, wps=546926, ups=1.11, wpb=494708, bsz=16402, num_updates=6600, lr=0.000778499, gnorm=0.282, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=6129 epoch 004: 1543 / 1689 loss=4.051, nll_loss=2.546, ppl=5.84, wps=546926, ups=1.11, wpb=494708, bsz=16402, num_updates=6600, lr=0.000778499, gnorm=0.282, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=6129 epoch 004: 1543 / 1689 loss=4.051, nll_loss=2.546, ppl=5.84, wps=546926, ups=1.11, wpb=494708, bsz=16402, num_updates=6600, lr=0.000778499, gnorm=0.282, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=6129 epoch 004: 1543 / 1689 loss=4.051, nll_loss=2.546, ppl=5.84, wps=546926, ups=1.11, wpb=494708, bsz=16402, num_updates=6600, lr=0.000778499, gnorm=0.282, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=6129 epoch 004: 1643 / 1689 loss=4.039, nll_loss=2.533, ppl=5.79, wps=553321, ups=1.11, wpb=496980, bsz=16207, num_updates=6700, lr=0.000772667, gnorm=0.267, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=6219 epoch 004: 1643 / 1689 loss=4.039, nll_loss=2.533, ppl=5.79, wps=553321, ups=1.11, wpb=496980, bsz=16207, num_updates=6700, lr=0.000772667, gnorm=0.267, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=6219 epoch 004: 1643 / 1689 loss=4.039, nll_loss=2.533, ppl=5.79, wps=553321, ups=1.11, wpb=496980, bsz=16207, num_updates=6700, lr=0.000772667, gnorm=0.267, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=6219 epoch 004: 1643 / 1689 loss=4.039, nll_loss=2.533, ppl=5.79, wps=553321, ups=1.11, wpb=496980, bsz=16207, num_updates=6700, lr=0.000772667, gnorm=0.267, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=6219 end of epoch 4 (average epoch stats below) epoch 004 | loss 4.087 | nll_loss 2.585 | ppl 6 | wps 539022 | ups 1.09 | wpb 495127 | bsz 16501.5 | num_updates 6746 | lr 0.000770029 | gnorm 0.294 | clip 0 | loss_scale 4 | train_wall 1500 | gb_free 23.3 | wall 6260 epoch 004 | loss 4.087 | nll_loss 2.585 | ppl 6 | wps 539022 | ups 1.09 | wpb 495127 | bsz 16501.5 | num_updates 6746 | lr 0.000770029 | gnorm 0.294 | clip 0 | loss_scale 4 | train_wall 1500 | gb_free 23.3 | wall 6260 epoch 004 | loss 4.087 | nll_loss 2.585 | ppl 6 | wps 539022 | ups 1.09 | wpb 495127 | bsz 16501.5 | num_updates 6746 | lr 0.000770029 | gnorm 0.294 | clip 0 | loss_scale 4 | train_wall 1500 | gb_free 23.3 | wall 6260 epoch 004 | loss 4.087 | nll_loss 2.585 | ppl 6 | wps 539022 | ups 1.09 | wpb 495127 | bsz 16501.5 | num_updates 6746 | lr 0.000770029 | gnorm 0.294 | clip 0 | loss_scale 4 | train_wall 1500 | gb_free 23.3 | wall 6260 Start iterating over samples epoch 005: 54 / 1689 loss=4.034, nll_loss=2.527, ppl=5.76, wps=541229, ups=1.1, wpb=491068, bsz=16227.2, num_updates=6800, lr=0.000766965, gnorm=0.284, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=6310 epoch 005: 54 / 1689 loss=4.034, nll_loss=2.527, ppl=5.76, wps=541229, ups=1.1, wpb=491068, bsz=16227.2, num_updates=6800, lr=0.000766965, gnorm=0.284, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=6310 epoch 005: 54 / 1689 loss=4.034, nll_loss=2.527, ppl=5.76, wps=541229, ups=1.1, wpb=491068, bsz=16227.2, num_updates=6800, lr=0.000766965, gnorm=0.284, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=6310 epoch 005: 54 / 1689 loss=4.034, nll_loss=2.527, ppl=5.76, wps=541229, ups=1.1, wpb=491068, bsz=16227.2, num_updates=6800, lr=0.000766965, gnorm=0.284, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=6310 epoch 005: 54 / 1689 loss=4.034, nll_loss=2.527, ppl=5.76, wps=541229, ups=1.1, wpb=491068, bsz=16227.2, num_updates=6800, lr=0.000766965, gnorm=0.284, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=6310 epoch 005: 154 / 1689 loss=4.018, nll_loss=2.51, ppl=5.69, wps=551774, ups=1.11, wpb=494999, bsz=16482.5, num_updates=6900, lr=0.000761387, gnorm=0.272, clip=0, loss_scale=8, train_wall=88, gb_free=22.1, wall=6400 epoch 005: 154 / 1689 loss=4.018, nll_loss=2.51, ppl=5.69, wps=551774, ups=1.11, wpb=494999, bsz=16482.5, num_updates=6900, lr=0.000761387, gnorm=0.272, clip=0, loss_scale=8, train_wall=88, gb_free=22.1, wall=6400 epoch 005: 154 / 1689 loss=4.018, nll_loss=2.51, ppl=5.69, wps=551774, ups=1.11, wpb=494999, bsz=16482.5, num_updates=6900, lr=0.000761387, gnorm=0.272, clip=0, loss_scale=8, train_wall=88, gb_free=22.1, wall=6400 epoch 005: 154 / 1689 loss=4.018, nll_loss=2.51, ppl=5.69, wps=551774, ups=1.11, wpb=494999, bsz=16482.5, num_updates=6900, lr=0.000761387, gnorm=0.272, clip=0, loss_scale=8, train_wall=88, gb_free=22.1, wall=6400 epoch 005: 154 / 1689 loss=4.018, nll_loss=2.51, ppl=5.69, wps=551774, ups=1.11, wpb=494999, bsz=16482.5, num_updates=6900, lr=0.000761387, gnorm=0.272, clip=0, loss_scale=8, train_wall=88, gb_free=22.1, wall=6400 epoch 005: 254 / 1689 loss=4.012, nll_loss=2.502, ppl=5.67, wps=546876, ups=1.11, wpb=494600, bsz=16395, num_updates=7000, lr=0.000755929, gnorm=0.284, clip=0, loss_scale=8, train_wall=89, gb_free=20.1, wall=6490 epoch 005: 254 / 1689 loss=4.012, nll_loss=2.502, ppl=5.67, wps=546876, ups=1.11, wpb=494600, bsz=16395, num_updates=7000, lr=0.000755929, gnorm=0.284, clip=0, loss_scale=8, train_wall=89, gb_free=20.1, wall=6490 epoch 005: 254 / 1689 loss=4.012, nll_loss=2.502, ppl=5.67, wps=546876, ups=1.11, wpb=494600, bsz=16395, num_updates=7000, lr=0.000755929, gnorm=0.284, clip=0, loss_scale=8, train_wall=89, gb_free=20.1, wall=6490 epoch 005: 254 / 1689 loss=4.012, nll_loss=2.502, ppl=5.67, wps=546876, ups=1.11, wpb=494600, bsz=16395, num_updates=7000, lr=0.000755929, gnorm=0.284, clip=0, loss_scale=8, train_wall=89, gb_free=20.1, wall=6490 epoch 005: 254 / 1689 loss=4.012, nll_loss=2.502, ppl=5.67, wps=546876, ups=1.11, wpb=494600, bsz=16395, num_updates=7000, lr=0.000755929, gnorm=0.284, clip=0, loss_scale=8, train_wall=89, gb_free=20.1, wall=6490 begin validation on "valid" subset epoch 005 | valid on 'valid' subset | loss 3.972 | nll_loss 2.393 | ppl 5.25 | wps 0 | wpb 44526 | bsz 2008 | num_updates 7000 | best_loss 3.972 epoch 005 | valid on 'valid' subset | loss 3.972 | nll_loss 2.393 | ppl 5.25 | wps 0 | wpb 44526 | bsz 2008 | num_updates 7000 | best_loss 3.972 epoch 005 | valid on 'valid' subset | loss 3.972 | nll_loss 2.393 | ppl 5.25 | wps 0 | wpb 44526 | bsz 2008 | num_updates 7000 | best_loss 3.972 epoch 005 | valid on 'valid' subset | loss 3.972 | nll_loss 2.393 | ppl 5.25 | wps 0 | wpb 44526 | bsz 2008 | num_updates 7000 | best_loss 3.972 epoch 005 | valid on 'valid' subset | loss 3.972 | nll_loss 2.393 | ppl 5.25 | wps 0 | wpb 44526 | bsz 2008 | num_updates 7000 | best_loss 3.972 epoch 005: 355 / 1689 loss=4.019, nll_loss=2.511, ppl=5.7, wps=451045, ups=0.91, wpb=494943, bsz=16471.8, num_updates=7100, lr=0.000750587, gnorm=0.282, clip=0, loss_scale=4, train_wall=90, gb_free=21.8, wall=6600 epoch 005: 355 / 1689 loss=4.019, nll_loss=2.511, ppl=5.7, wps=451045, ups=0.91, wpb=494943, bsz=16471.8, num_updates=7100, lr=0.000750587, gnorm=0.282, clip=0, loss_scale=4, train_wall=90, gb_free=21.8, wall=6600 epoch 005: 355 / 1689 loss=4.019, nll_loss=2.511, ppl=5.7, wps=451045, ups=0.91, wpb=494943, bsz=16471.8, num_updates=7100, lr=0.000750587, gnorm=0.282, clip=0, loss_scale=4, train_wall=90, gb_free=21.8, wall=6600 epoch 005: 355 / 1689 loss=4.019, nll_loss=2.511, ppl=5.7, wps=451045, ups=0.91, wpb=494943, bsz=16471.8, num_updates=7100, lr=0.000750587, gnorm=0.282, clip=0, loss_scale=4, train_wall=90, gb_free=21.8, wall=6600 epoch 005: 355 / 1689 loss=4.019, nll_loss=2.511, ppl=5.7, wps=451045, ups=0.91, wpb=494943, bsz=16471.8, num_updates=7100, lr=0.000750587, gnorm=0.282, clip=0, loss_scale=4, train_wall=90, gb_free=21.8, wall=6600 epoch 005: 455 / 1689 loss=4.005, nll_loss=2.496, ppl=5.64, wps=549319, ups=1.11, wpb=496196, bsz=16668.8, num_updates=7200, lr=0.000745356, gnorm=0.258, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=6690 epoch 005: 455 / 1689 loss=4.005, nll_loss=2.496, ppl=5.64, wps=549319, ups=1.11, wpb=496196, bsz=16668.8, num_updates=7200, lr=0.000745356, gnorm=0.258, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=6690 epoch 005: 455 / 1689 loss=4.005, nll_loss=2.496, ppl=5.64, wps=549319, ups=1.11, wpb=496196, bsz=16668.8, num_updates=7200, lr=0.000745356, gnorm=0.258, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=6690 epoch 005: 455 / 1689 loss=4.005, nll_loss=2.496, ppl=5.64, wps=549319, ups=1.11, wpb=496196, bsz=16668.8, num_updates=7200, lr=0.000745356, gnorm=0.258, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=6690 epoch 005: 455 / 1689 loss=4.005, nll_loss=2.496, ppl=5.64, wps=549319, ups=1.11, wpb=496196, bsz=16668.8, num_updates=7200, lr=0.000745356, gnorm=0.258, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=6690 epoch 005: 555 / 1689 loss=4.003, nll_loss=2.494, ppl=5.63, wps=551372, ups=1.11, wpb=495952, bsz=16605.6, num_updates=7300, lr=0.000740233, gnorm=0.267, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=6780 epoch 005: 555 / 1689 loss=4.003, nll_loss=2.494, ppl=5.63, wps=551372, ups=1.11, wpb=495952, bsz=16605.6, num_updates=7300, lr=0.000740233, gnorm=0.267, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=6780 epoch 005: 555 / 1689 loss=4.003, nll_loss=2.494, ppl=5.63, wps=551372, ups=1.11, wpb=495952, bsz=16605.6, num_updates=7300, lr=0.000740233, gnorm=0.267, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=6780 epoch 005: 555 / 1689 loss=4.003, nll_loss=2.494, ppl=5.63, wps=551372, ups=1.11, wpb=495952, bsz=16605.6, num_updates=7300, lr=0.000740233, gnorm=0.267, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=6780 epoch 005: 555 / 1689 loss=4.003, nll_loss=2.494, ppl=5.63, wps=551372, ups=1.11, wpb=495952, bsz=16605.6, num_updates=7300, lr=0.000740233, gnorm=0.267, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=6780 epoch 005: 655 / 1689 loss=4.007, nll_loss=2.498, ppl=5.65, wps=551304, ups=1.11, wpb=495698, bsz=16362.2, num_updates=7400, lr=0.000735215, gnorm=0.273, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=6870 epoch 005: 655 / 1689 loss=4.007, nll_loss=2.498, ppl=5.65, wps=551304, ups=1.11, wpb=495698, bsz=16362.2, num_updates=7400, lr=0.000735215, gnorm=0.273, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=6870 epoch 005: 655 / 1689 loss=4.007, nll_loss=2.498, ppl=5.65, wps=551304, ups=1.11, wpb=495698, bsz=16362.2, num_updates=7400, lr=0.000735215, gnorm=0.273, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=6870 epoch 005: 655 / 1689 loss=4.007, nll_loss=2.498, ppl=5.65, wps=551304, ups=1.11, wpb=495698, bsz=16362.2, num_updates=7400, lr=0.000735215, gnorm=0.273, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=6870 epoch 005: 655 / 1689 loss=4.007, nll_loss=2.498, ppl=5.65, wps=551304, ups=1.11, wpb=495698, bsz=16362.2, num_updates=7400, lr=0.000735215, gnorm=0.273, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=6870 epoch 005: 755 / 1689 loss=4.01, nll_loss=2.502, ppl=5.66, wps=553241, ups=1.12, wpb=494323, bsz=16388.2, num_updates=7500, lr=0.000730297, gnorm=0.283, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=6959 epoch 005: 755 / 1689 loss=4.01, nll_loss=2.502, ppl=5.66, wps=553241, ups=1.12, wpb=494323, bsz=16388.2, num_updates=7500, lr=0.000730297, gnorm=0.283, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=6959 epoch 005: 755 / 1689 loss=4.01, nll_loss=2.502, ppl=5.66, wps=553241, ups=1.12, wpb=494323, bsz=16388.2, num_updates=7500, lr=0.000730297, gnorm=0.283, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=6959 epoch 005: 755 / 1689 loss=4.01, nll_loss=2.502, ppl=5.66, wps=553241, ups=1.12, wpb=494323, bsz=16388.2, num_updates=7500, lr=0.000730297, gnorm=0.283, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=6959 epoch 005: 755 / 1689 loss=4.01, nll_loss=2.502, ppl=5.66, wps=553241, ups=1.12, wpb=494323, bsz=16388.2, num_updates=7500, lr=0.000730297, gnorm=0.283, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=6959 epoch 005: 856 / 1689 loss=4.001, nll_loss=2.492, ppl=5.63, wps=544928, ups=1.1, wpb=495387, bsz=16593, num_updates=7600, lr=0.000725476, gnorm=0.266, clip=0, loss_scale=4, train_wall=90, gb_free=21.4, wall=7050 epoch 005: 856 / 1689 loss=4.001, nll_loss=2.492, ppl=5.63, wps=544928, ups=1.1, wpb=495387, bsz=16593, num_updates=7600, lr=0.000725476, gnorm=0.266, clip=0, loss_scale=4, train_wall=90, gb_free=21.4, wall=7050 epoch 005: 856 / 1689 loss=4.001, nll_loss=2.492, ppl=5.63, wps=544928, ups=1.1, wpb=495387, bsz=16593, num_updates=7600, lr=0.000725476, gnorm=0.266, clip=0, loss_scale=4, train_wall=90, gb_free=21.4, wall=7050 epoch 005: 856 / 1689 loss=4.001, nll_loss=2.492, ppl=5.63, wps=544928, ups=1.1, wpb=495387, bsz=16593, num_updates=7600, lr=0.000725476, gnorm=0.266, clip=0, loss_scale=4, train_wall=90, gb_free=21.4, wall=7050 epoch 005: 856 / 1689 loss=4.001, nll_loss=2.492, ppl=5.63, wps=544928, ups=1.1, wpb=495387, bsz=16593, num_updates=7600, lr=0.000725476, gnorm=0.266, clip=0, loss_scale=4, train_wall=90, gb_free=21.4, wall=7050 epoch 005: 956 / 1689 loss=4.006, nll_loss=2.498, ppl=5.65, wps=548453, ups=1.11, wpb=495898, bsz=16465, num_updates=7700, lr=0.00072075, gnorm=0.265, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=7141 epoch 005: 956 / 1689 loss=4.006, nll_loss=2.498, ppl=5.65, wps=548453, ups=1.11, wpb=495898, bsz=16465, num_updates=7700, lr=0.00072075, gnorm=0.265, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=7141 epoch 005: 956 / 1689 loss=4.006, nll_loss=2.498, ppl=5.65, wps=548453, ups=1.11, wpb=495898, bsz=16465, num_updates=7700, lr=0.00072075, gnorm=0.265, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=7141 epoch 005: 956 / 1689 loss=4.006, nll_loss=2.498, ppl=5.65, wps=548453, ups=1.11, wpb=495898, bsz=16465, num_updates=7700, lr=0.00072075, gnorm=0.265, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=7141 epoch 005: 956 / 1689 loss=4.006, nll_loss=2.498, ppl=5.65, wps=548453, ups=1.11, wpb=495898, bsz=16465, num_updates=7700, lr=0.00072075, gnorm=0.265, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=7141 epoch 005: 1056 / 1689 loss=3.991, nll_loss=2.482, ppl=5.59, wps=547580, ups=1.1, wpb=497649, bsz=16667.8, num_updates=7800, lr=0.000716115, gnorm=0.264, clip=0, loss_scale=4, train_wall=89, gb_free=22.5, wall=7232 epoch 005: 1056 / 1689 loss=3.991, nll_loss=2.482, ppl=5.59, wps=547580, ups=1.1, wpb=497649, bsz=16667.8, num_updates=7800, lr=0.000716115, gnorm=0.264, clip=0, loss_scale=4, train_wall=89, gb_free=22.5, wall=7232 epoch 005: 1056 / 1689 loss=3.991, nll_loss=2.482, ppl=5.59, wps=547580, ups=1.1, wpb=497649, bsz=16667.8, num_updates=7800, lr=0.000716115, gnorm=0.264, clip=0, loss_scale=4, train_wall=89, gb_free=22.5, wall=7232 epoch 005: 1056 / 1689 loss=3.991, nll_loss=2.482, ppl=5.59, wps=547580, ups=1.1, wpb=497649, bsz=16667.8, num_updates=7800, lr=0.000716115, gnorm=0.264, clip=0, loss_scale=4, train_wall=89, gb_free=22.5, wall=7232 epoch 005: 1056 / 1689 loss=3.991, nll_loss=2.482, ppl=5.59, wps=547580, ups=1.1, wpb=497649, bsz=16667.8, num_updates=7800, lr=0.000716115, gnorm=0.264, clip=0, loss_scale=4, train_wall=89, gb_free=22.5, wall=7232 epoch 005: 1156 / 1689 loss=3.986, nll_loss=2.476, ppl=5.56, wps=545416, ups=1.1, wpb=494045, bsz=16677.1, num_updates=7900, lr=0.000711568, gnorm=0.267, clip=0, loss_scale=4, train_wall=89, gb_free=22.7, wall=7322 epoch 005: 1156 / 1689 loss=3.986, nll_loss=2.476, ppl=5.56, wps=545416, ups=1.1, wpb=494045, bsz=16677.1, num_updates=7900, lr=0.000711568, gnorm=0.267, clip=0, loss_scale=4, train_wall=89, gb_free=22.7, wall=7322 epoch 005: 1156 / 1689 loss=3.986, nll_loss=2.476, ppl=5.56, wps=545416, ups=1.1, wpb=494045, bsz=16677.1, num_updates=7900, lr=0.000711568, gnorm=0.267, clip=0, loss_scale=4, train_wall=89, gb_free=22.7, wall=7322 epoch 005: 1156 / 1689 loss=3.986, nll_loss=2.476, ppl=5.56, wps=545416, ups=1.1, wpb=494045, bsz=16677.1, num_updates=7900, lr=0.000711568, gnorm=0.267, clip=0, loss_scale=4, train_wall=89, gb_free=22.7, wall=7322 epoch 005: 1156 / 1689 loss=3.986, nll_loss=2.476, ppl=5.56, wps=545416, ups=1.1, wpb=494045, bsz=16677.1, num_updates=7900, lr=0.000711568, gnorm=0.267, clip=0, loss_scale=4, train_wall=89, gb_free=22.7, wall=7322 epoch 005: 1257 / 1689 loss=3.988, nll_loss=2.478, ppl=5.57, wps=539864, ups=1.09, wpb=494066, bsz=16369, num_updates=8000, lr=0.000707107, gnorm=0.265, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=7414 epoch 005: 1257 / 1689 loss=3.988, nll_loss=2.478, ppl=5.57, wps=539864, ups=1.09, wpb=494066, bsz=16369, num_updates=8000, lr=0.000707107, gnorm=0.265, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=7414 epoch 005: 1257 / 1689 loss=3.988, nll_loss=2.478, ppl=5.57, wps=539864, ups=1.09, wpb=494066, bsz=16369, num_updates=8000, lr=0.000707107, gnorm=0.265, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=7414 epoch 005: 1257 / 1689 loss=3.988, nll_loss=2.478, ppl=5.57, wps=539864, ups=1.09, wpb=494066, bsz=16369, num_updates=8000, lr=0.000707107, gnorm=0.265, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=7414 epoch 005: 1257 / 1689 loss=3.988, nll_loss=2.478, ppl=5.57, wps=539864, ups=1.09, wpb=494066, bsz=16369, num_updates=8000, lr=0.000707107, gnorm=0.265, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=7414 begin validation on "valid" subset epoch 005 | valid on 'valid' subset | loss 3.931 | nll_loss 2.354 | ppl 5.11 | wps 0 | wpb 44526 | bsz 2008 | num_updates 8000 | best_loss 3.931 epoch 005 | valid on 'valid' subset | loss 3.931 | nll_loss 2.354 | ppl 5.11 | wps 0 | wpb 44526 | bsz 2008 | num_updates 8000 | best_loss 3.931 epoch 005 | valid on 'valid' subset | loss 3.931 | nll_loss 2.354 | ppl 5.11 | wps 0 | wpb 44526 | bsz 2008 | num_updates 8000 | best_loss 3.931 epoch 005 | valid on 'valid' subset | loss 3.931 | nll_loss 2.354 | ppl 5.11 | wps 0 | wpb 44526 | bsz 2008 | num_updates 8000 | best_loss 3.931 epoch 005 | valid on 'valid' subset | loss 3.931 | nll_loss 2.354 | ppl 5.11 | wps 0 | wpb 44526 | bsz 2008 | num_updates 8000 | best_loss 3.931 epoch 005: 1357 / 1689 loss=3.985, nll_loss=2.475, ppl=5.56, wps=447030, ups=0.9, wpb=494962, bsz=16785.4, num_updates=8100, lr=0.000702728, gnorm=0.27, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=7524 epoch 005: 1357 / 1689 loss=3.985, nll_loss=2.475, ppl=5.56, wps=447030, ups=0.9, wpb=494962, bsz=16785.4, num_updates=8100, lr=0.000702728, gnorm=0.27, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=7524 epoch 005: 1357 / 1689 loss=3.985, nll_loss=2.475, ppl=5.56, wps=447030, ups=0.9, wpb=494962, bsz=16785.4, num_updates=8100, lr=0.000702728, gnorm=0.27, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=7524 epoch 005: 1357 / 1689 loss=3.985, nll_loss=2.475, ppl=5.56, wps=447030, ups=0.9, wpb=494962, bsz=16785.4, num_updates=8100, lr=0.000702728, gnorm=0.27, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=7524 epoch 005: 1357 / 1689 loss=3.985, nll_loss=2.475, ppl=5.56, wps=447030, ups=0.9, wpb=494962, bsz=16785.4, num_updates=8100, lr=0.000702728, gnorm=0.27, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=7524 epoch 005: 1457 / 1689 loss=3.979, nll_loss=2.469, ppl=5.54, wps=547986, ups=1.11, wpb=495674, bsz=16477.2, num_updates=8200, lr=0.00069843, gnorm=0.256, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=7615 epoch 005: 1457 / 1689 loss=3.979, nll_loss=2.469, ppl=5.54, wps=547986, ups=1.11, wpb=495674, bsz=16477.2, num_updates=8200, lr=0.00069843, gnorm=0.256, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=7615 epoch 005: 1457 / 1689 loss=3.979, nll_loss=2.469, ppl=5.54, wps=547986, ups=1.11, wpb=495674, bsz=16477.2, num_updates=8200, lr=0.00069843, gnorm=0.256, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=7615 epoch 005: 1457 / 1689 loss=3.979, nll_loss=2.469, ppl=5.54, wps=547986, ups=1.11, wpb=495674, bsz=16477.2, num_updates=8200, lr=0.00069843, gnorm=0.256, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=7615 epoch 005: 1457 / 1689 loss=3.979, nll_loss=2.469, ppl=5.54, wps=547986, ups=1.11, wpb=495674, bsz=16477.2, num_updates=8200, lr=0.00069843, gnorm=0.256, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=7615 epoch 005: 1557 / 1689 loss=3.979, nll_loss=2.469, ppl=5.54, wps=554946, ups=1.12, wpb=495932, bsz=16143.1, num_updates=8300, lr=0.00069421, gnorm=0.249, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=7704 epoch 005: 1557 / 1689 loss=3.979, nll_loss=2.469, ppl=5.54, wps=554946, ups=1.12, wpb=495932, bsz=16143.1, num_updates=8300, lr=0.00069421, gnorm=0.249, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=7704 epoch 005: 1557 / 1689 loss=3.979, nll_loss=2.469, ppl=5.54, wps=554946, ups=1.12, wpb=495932, bsz=16143.1, num_updates=8300, lr=0.00069421, gnorm=0.249, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=7704 epoch 005: 1557 / 1689 loss=3.979, nll_loss=2.469, ppl=5.54, wps=554946, ups=1.12, wpb=495932, bsz=16143.1, num_updates=8300, lr=0.00069421, gnorm=0.249, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=7704 epoch 005: 1557 / 1689 loss=3.979, nll_loss=2.469, ppl=5.54, wps=554946, ups=1.12, wpb=495932, bsz=16143.1, num_updates=8300, lr=0.00069421, gnorm=0.249, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=7704 epoch 005: 1657 / 1689 loss=3.975, nll_loss=2.465, ppl=5.52, wps=548215, ups=1.11, wpb=495486, bsz=16830.4, num_updates=8400, lr=0.000690066, gnorm=0.269, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=7795 epoch 005: 1657 / 1689 loss=3.975, nll_loss=2.465, ppl=5.52, wps=548215, ups=1.11, wpb=495486, bsz=16830.4, num_updates=8400, lr=0.000690066, gnorm=0.269, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=7795 epoch 005: 1657 / 1689 loss=3.975, nll_loss=2.465, ppl=5.52, wps=548215, ups=1.11, wpb=495486, bsz=16830.4, num_updates=8400, lr=0.000690066, gnorm=0.269, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=7795 epoch 005: 1657 / 1689 loss=3.975, nll_loss=2.465, ppl=5.52, wps=548215, ups=1.11, wpb=495486, bsz=16830.4, num_updates=8400, lr=0.000690066, gnorm=0.269, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=7795 epoch 005: 1657 / 1689 loss=3.975, nll_loss=2.465, ppl=5.52, wps=548215, ups=1.11, wpb=495486, bsz=16830.4, num_updates=8400, lr=0.000690066, gnorm=0.269, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=7795 end of epoch 5 (average epoch stats below) epoch 005 | loss 3.998 | nll_loss 2.489 | ppl 5.61 | wps 534266 | ups 1.08 | wpb 495116 | bsz 16504.6 | num_updates 8432 | lr 0.000688755 | gnorm 0.268 | clip 0 | loss_scale 2 | train_wall 1499 | gb_free 22.3 | wall 7822 epoch 005 | loss 3.998 | nll_loss 2.489 | ppl 5.61 | wps 534266 | ups 1.08 | wpb 495116 | bsz 16504.6 | num_updates 8432 | lr 0.000688755 | gnorm 0.268 | clip 0 | loss_scale 2 | train_wall 1499 | gb_free 22.3 | wall 7822 epoch 005 | loss 3.998 | nll_loss 2.489 | ppl 5.61 | wps 534266 | ups 1.08 | wpb 495116 | bsz 16504.6 | num_updates 8432 | lr 0.000688755 | gnorm 0.268 | clip 0 | loss_scale 2 | train_wall 1499 | gb_free 22.3 | wall 7822 epoch 005 | loss 3.998 | nll_loss 2.489 | ppl 5.61 | wps 534266 | ups 1.08 | wpb 495116 | bsz 16504.6 | num_updates 8432 | lr 0.000688755 | gnorm 0.268 | clip 0 | loss_scale 2 | train_wall 1499 | gb_free 22.3 | wall 7822 epoch 005 | loss 3.998 | nll_loss 2.489 | ppl 5.61 | wps 534266 | ups 1.08 | wpb 495116 | bsz 16504.6 | num_updates 8432 | lr 0.000688755 | gnorm 0.268 | clip 0 | loss_scale 2 | train_wall 1499 | gb_free 22.3 | wall 7822 Start iterating over samples epoch 006: 68 / 1689 loss=3.955, nll_loss=2.441, ppl=5.43, wps=533329, ups=1.08, wpb=492620, bsz=16013.7, num_updates=8500, lr=0.000685994, gnorm=0.258, clip=0, loss_scale=4, train_wall=91, gb_free=21.2, wall=7887 epoch 006: 68 / 1689 loss=3.955, nll_loss=2.441, ppl=5.43, wps=533329, ups=1.08, wpb=492620, bsz=16013.7, num_updates=8500, lr=0.000685994, gnorm=0.258, clip=0, loss_scale=4, train_wall=91, gb_free=21.2, wall=7887 epoch 006: 68 / 1689 loss=3.955, nll_loss=2.441, ppl=5.43, wps=533329, ups=1.08, wpb=492620, bsz=16013.7, num_updates=8500, lr=0.000685994, gnorm=0.258, clip=0, loss_scale=4, train_wall=91, gb_free=21.2, wall=7887 epoch 006: 68 / 1689 loss=3.955, nll_loss=2.441, ppl=5.43, wps=533329, ups=1.08, wpb=492620, bsz=16013.7, num_updates=8500, lr=0.000685994, gnorm=0.258, clip=0, loss_scale=4, train_wall=91, gb_free=21.2, wall=7887 epoch 006: 68 / 1689 loss=3.955, nll_loss=2.441, ppl=5.43, wps=533329, ups=1.08, wpb=492620, bsz=16013.7, num_updates=8500, lr=0.000685994, gnorm=0.258, clip=0, loss_scale=4, train_wall=91, gb_free=21.2, wall=7887 epoch 006: 68 / 1689 loss=3.955, nll_loss=2.441, ppl=5.43, wps=533329, ups=1.08, wpb=492620, bsz=16013.7, num_updates=8500, lr=0.000685994, gnorm=0.258, clip=0, loss_scale=4, train_wall=91, gb_free=21.2, wall=7887 epoch 006: 168 / 1689 loss=3.953, nll_loss=2.44, ppl=5.43, wps=551783, ups=1.11, wpb=495533, bsz=16295.3, num_updates=8600, lr=0.000681994, gnorm=0.256, clip=0, loss_scale=4, train_wall=89, gb_free=20.8, wall=7977 epoch 006: 168 / 1689 loss=3.953, nll_loss=2.44, ppl=5.43, wps=551783, ups=1.11, wpb=495533, bsz=16295.3, num_updates=8600, lr=0.000681994, gnorm=0.256, clip=0, loss_scale=4, train_wall=89, gb_free=20.8, wall=7977 epoch 006: 168 / 1689 loss=3.953, nll_loss=2.44, ppl=5.43, wps=551783, ups=1.11, wpb=495533, bsz=16295.3, num_updates=8600, lr=0.000681994, gnorm=0.256, clip=0, loss_scale=4, train_wall=89, gb_free=20.8, wall=7977 epoch 006: 168 / 1689 loss=3.953, nll_loss=2.44, ppl=5.43, wps=551783, ups=1.11, wpb=495533, bsz=16295.3, num_updates=8600, lr=0.000681994, gnorm=0.256, clip=0, loss_scale=4, train_wall=89, gb_free=20.8, wall=7977 epoch 006: 168 / 1689 loss=3.953, nll_loss=2.44, ppl=5.43, wps=551783, ups=1.11, wpb=495533, bsz=16295.3, num_updates=8600, lr=0.000681994, gnorm=0.256, clip=0, loss_scale=4, train_wall=89, gb_free=20.8, wall=7977 epoch 006: 168 / 1689 loss=3.953, nll_loss=2.44, ppl=5.43, wps=551783, ups=1.11, wpb=495533, bsz=16295.3, num_updates=8600, lr=0.000681994, gnorm=0.256, clip=0, loss_scale=4, train_wall=89, gb_free=20.8, wall=7977 epoch 006: 268 / 1689 loss=3.944, nll_loss=2.43, ppl=5.39, wps=552566, ups=1.11, wpb=495847, bsz=16619.6, num_updates=8700, lr=0.000678064, gnorm=0.249, clip=0, loss_scale=4, train_wall=89, gb_free=22.3, wall=8067 epoch 006: 268 / 1689 loss=3.944, nll_loss=2.43, ppl=5.39, wps=552566, ups=1.11, wpb=495847, bsz=16619.6, num_updates=8700, lr=0.000678064, gnorm=0.249, clip=0, loss_scale=4, train_wall=89, gb_free=22.3, wall=8067 epoch 006: 268 / 1689 loss=3.944, nll_loss=2.43, ppl=5.39, wps=552566, ups=1.11, wpb=495847, bsz=16619.6, num_updates=8700, lr=0.000678064, gnorm=0.249, clip=0, loss_scale=4, train_wall=89, gb_free=22.3, wall=8067 epoch 006: 268 / 1689 loss=3.944, nll_loss=2.43, ppl=5.39, wps=552566, ups=1.11, wpb=495847, bsz=16619.6, num_updates=8700, lr=0.000678064, gnorm=0.249, clip=0, loss_scale=4, train_wall=89, gb_free=22.3, wall=8067 epoch 006: 268 / 1689 loss=3.944, nll_loss=2.43, ppl=5.39, wps=552566, ups=1.11, wpb=495847, bsz=16619.6, num_updates=8700, lr=0.000678064, gnorm=0.249, clip=0, loss_scale=4, train_wall=89, gb_free=22.3, wall=8067 epoch 006: 268 / 1689 loss=3.944, nll_loss=2.43, ppl=5.39, wps=552566, ups=1.11, wpb=495847, bsz=16619.6, num_updates=8700, lr=0.000678064, gnorm=0.249, clip=0, loss_scale=4, train_wall=89, gb_free=22.3, wall=8067 epoch 006: 368 / 1689 loss=3.953, nll_loss=2.441, ppl=5.43, wps=550357, ups=1.11, wpb=494280, bsz=16324.2, num_updates=8800, lr=0.0006742, gnorm=0.26, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=8156 epoch 006: 368 / 1689 loss=3.953, nll_loss=2.441, ppl=5.43, wps=550357, ups=1.11, wpb=494280, bsz=16324.2, num_updates=8800, lr=0.0006742, gnorm=0.26, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=8156 epoch 006: 368 / 1689 loss=3.953, nll_loss=2.441, ppl=5.43, wps=550357, ups=1.11, wpb=494280, bsz=16324.2, num_updates=8800, lr=0.0006742, gnorm=0.26, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=8156 epoch 006: 368 / 1689 loss=3.953, nll_loss=2.441, ppl=5.43, wps=550357, ups=1.11, wpb=494280, bsz=16324.2, num_updates=8800, lr=0.0006742, gnorm=0.26, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=8156 epoch 006: 368 / 1689 loss=3.953, nll_loss=2.441, ppl=5.43, wps=550357, ups=1.11, wpb=494280, bsz=16324.2, num_updates=8800, lr=0.0006742, gnorm=0.26, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=8156 epoch 006: 368 / 1689 loss=3.953, nll_loss=2.441, ppl=5.43, wps=550357, ups=1.11, wpb=494280, bsz=16324.2, num_updates=8800, lr=0.0006742, gnorm=0.26, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=8156 epoch 006: 468 / 1689 loss=3.945, nll_loss=2.431, ppl=5.39, wps=550872, ups=1.11, wpb=496041, bsz=16477.8, num_updates=8900, lr=0.000670402, gnorm=0.257, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=8246 epoch 006: 468 / 1689 loss=3.945, nll_loss=2.431, ppl=5.39, wps=550872, ups=1.11, wpb=496041, bsz=16477.8, num_updates=8900, lr=0.000670402, gnorm=0.257, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=8246 epoch 006: 468 / 1689 loss=3.945, nll_loss=2.431, ppl=5.39, wps=550872, ups=1.11, wpb=496041, bsz=16477.8, num_updates=8900, lr=0.000670402, gnorm=0.257, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=8246 epoch 006: 468 / 1689 loss=3.945, nll_loss=2.431, ppl=5.39, wps=550872, ups=1.11, wpb=496041, bsz=16477.8, num_updates=8900, lr=0.000670402, gnorm=0.257, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=8246 epoch 006: 468 / 1689 loss=3.945, nll_loss=2.431, ppl=5.39, wps=550872, ups=1.11, wpb=496041, bsz=16477.8, num_updates=8900, lr=0.000670402, gnorm=0.257, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=8246 epoch 006: 468 / 1689 loss=3.945, nll_loss=2.431, ppl=5.39, wps=550872, ups=1.11, wpb=496041, bsz=16477.8, num_updates=8900, lr=0.000670402, gnorm=0.257, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=8246 epoch 006: 568 / 1689 loss=3.947, nll_loss=2.433, ppl=5.4, wps=550250, ups=1.11, wpb=495655, bsz=16606.7, num_updates=9000, lr=0.000666667, gnorm=0.251, clip=0, loss_scale=8, train_wall=89, gb_free=21.2, wall=8336 epoch 006: 568 / 1689 loss=3.947, nll_loss=2.433, ppl=5.4, wps=550250, ups=1.11, wpb=495655, bsz=16606.7, num_updates=9000, lr=0.000666667, gnorm=0.251, clip=0, loss_scale=8, train_wall=89, gb_free=21.2, wall=8336 epoch 006: 568 / 1689 loss=3.947, nll_loss=2.433, ppl=5.4, wps=550250, ups=1.11, wpb=495655, bsz=16606.7, num_updates=9000, lr=0.000666667, gnorm=0.251, clip=0, loss_scale=8, train_wall=89, gb_free=21.2, wall=8336 epoch 006: 568 / 1689 loss=3.947, nll_loss=2.433, ppl=5.4, wps=550250, ups=1.11, wpb=495655, bsz=16606.7, num_updates=9000, lr=0.000666667, gnorm=0.251, clip=0, loss_scale=8, train_wall=89, gb_free=21.2, wall=8336 epoch 006: 568 / 1689 loss=3.947, nll_loss=2.433, ppl=5.4, wps=550250, ups=1.11, wpb=495655, bsz=16606.7, num_updates=9000, lr=0.000666667, gnorm=0.251, clip=0, loss_scale=8, train_wall=89, gb_free=21.2, wall=8336 epoch 006: 568 / 1689 loss=3.947, nll_loss=2.433, ppl=5.4, wps=550250, ups=1.11, wpb=495655, bsz=16606.7, num_updates=9000, lr=0.000666667, gnorm=0.251, clip=0, loss_scale=8, train_wall=89, gb_free=21.2, wall=8336 begin validation on "valid" subset epoch 006 | valid on 'valid' subset | loss 3.893 | nll_loss 2.317 | ppl 4.98 | wps 0 | wpb 44526 | bsz 2008 | num_updates 9000 | best_loss 3.893 epoch 006 | valid on 'valid' subset | loss 3.893 | nll_loss 2.317 | ppl 4.98 | wps 0 | wpb 44526 | bsz 2008 | num_updates 9000 | best_loss 3.893 epoch 006 | valid on 'valid' subset | loss 3.893 | nll_loss 2.317 | ppl 4.98 | wps 0 | wpb 44526 | bsz 2008 | num_updates 9000 | best_loss 3.893 epoch 006 | valid on 'valid' subset | loss 3.893 | nll_loss 2.317 | ppl 4.98 | wps 0 | wpb 44526 | bsz 2008 | num_updates 9000 | best_loss 3.893 epoch 006 | valid on 'valid' subset | loss 3.893 | nll_loss 2.317 | ppl 4.98 | wps 0 | wpb 44526 | bsz 2008 | num_updates 9000 | best_loss 3.893 epoch 006 | valid on 'valid' subset | loss 3.893 | nll_loss 2.317 | ppl 4.98 | wps 0 | wpb 44526 | bsz 2008 | num_updates 9000 | best_loss 3.893 epoch 006: 669 / 1689 loss=3.944, nll_loss=2.43, ppl=5.39, wps=450463, ups=0.91, wpb=494182, bsz=16803.5, num_updates=9100, lr=0.000662994, gnorm=0.254, clip=0, loss_scale=4, train_wall=90, gb_free=20.9, wall=8446 epoch 006: 669 / 1689 loss=3.944, nll_loss=2.43, ppl=5.39, wps=450463, ups=0.91, wpb=494182, bsz=16803.5, num_updates=9100, lr=0.000662994, gnorm=0.254, clip=0, loss_scale=4, train_wall=90, gb_free=20.9, wall=8446 epoch 006: 669 / 1689 loss=3.944, nll_loss=2.43, ppl=5.39, wps=450463, ups=0.91, wpb=494182, bsz=16803.5, num_updates=9100, lr=0.000662994, gnorm=0.254, clip=0, loss_scale=4, train_wall=90, gb_free=20.9, wall=8446 epoch 006: 669 / 1689 loss=3.944, nll_loss=2.43, ppl=5.39, wps=450463, ups=0.91, wpb=494182, bsz=16803.5, num_updates=9100, lr=0.000662994, gnorm=0.254, clip=0, loss_scale=4, train_wall=90, gb_free=20.9, wall=8446 epoch 006: 669 / 1689 loss=3.944, nll_loss=2.43, ppl=5.39, wps=450463, ups=0.91, wpb=494182, bsz=16803.5, num_updates=9100, lr=0.000662994, gnorm=0.254, clip=0, loss_scale=4, train_wall=90, gb_free=20.9, wall=8446 epoch 006: 669 / 1689 loss=3.944, nll_loss=2.43, ppl=5.39, wps=450463, ups=0.91, wpb=494182, bsz=16803.5, num_updates=9100, lr=0.000662994, gnorm=0.254, clip=0, loss_scale=4, train_wall=90, gb_free=20.9, wall=8446 epoch 006: 769 / 1689 loss=3.945, nll_loss=2.432, ppl=5.4, wps=554315, ups=1.12, wpb=495923, bsz=16416.4, num_updates=9200, lr=0.00065938, gnorm=0.249, clip=0, loss_scale=4, train_wall=88, gb_free=20.2, wall=8536 epoch 006: 769 / 1689 loss=3.945, nll_loss=2.432, ppl=5.4, wps=554315, ups=1.12, wpb=495923, bsz=16416.4, num_updates=9200, lr=0.00065938, gnorm=0.249, clip=0, loss_scale=4, train_wall=88, gb_free=20.2, wall=8536 epoch 006: 769 / 1689 loss=3.945, nll_loss=2.432, ppl=5.4, wps=554315, ups=1.12, wpb=495923, bsz=16416.4, num_updates=9200, lr=0.00065938, gnorm=0.249, clip=0, loss_scale=4, train_wall=88, gb_free=20.2, wall=8536 epoch 006: 769 / 1689 loss=3.945, nll_loss=2.432, ppl=5.4, wps=554315, ups=1.12, wpb=495923, bsz=16416.4, num_updates=9200, lr=0.00065938, gnorm=0.249, clip=0, loss_scale=4, train_wall=88, gb_free=20.2, wall=8536 epoch 006: 769 / 1689 loss=3.945, nll_loss=2.432, ppl=5.4, wps=554315, ups=1.12, wpb=495923, bsz=16416.4, num_updates=9200, lr=0.00065938, gnorm=0.249, clip=0, loss_scale=4, train_wall=88, gb_free=20.2, wall=8536 epoch 006: 769 / 1689 loss=3.945, nll_loss=2.432, ppl=5.4, wps=554315, ups=1.12, wpb=495923, bsz=16416.4, num_updates=9200, lr=0.00065938, gnorm=0.249, clip=0, loss_scale=4, train_wall=88, gb_free=20.2, wall=8536 epoch 006: 870 / 1689 loss=3.945, nll_loss=2.432, ppl=5.4, wps=545023, ups=1.1, wpb=495075, bsz=16930.9, num_updates=9300, lr=0.000655826, gnorm=0.27, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=8627 epoch 006: 870 / 1689 loss=3.945, nll_loss=2.432, ppl=5.4, wps=545023, ups=1.1, wpb=495075, bsz=16930.9, num_updates=9300, lr=0.000655826, gnorm=0.27, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=8627 epoch 006: 870 / 1689 loss=3.945, nll_loss=2.432, ppl=5.4, wps=545023, ups=1.1, wpb=495075, bsz=16930.9, num_updates=9300, lr=0.000655826, gnorm=0.27, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=8627 epoch 006: 870 / 1689 loss=3.945, nll_loss=2.432, ppl=5.4, wps=545023, ups=1.1, wpb=495075, bsz=16930.9, num_updates=9300, lr=0.000655826, gnorm=0.27, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=8627 epoch 006: 870 / 1689 loss=3.945, nll_loss=2.432, ppl=5.4, wps=545023, ups=1.1, wpb=495075, bsz=16930.9, num_updates=9300, lr=0.000655826, gnorm=0.27, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=8627 epoch 006: 870 / 1689 loss=3.945, nll_loss=2.432, ppl=5.4, wps=545023, ups=1.1, wpb=495075, bsz=16930.9, num_updates=9300, lr=0.000655826, gnorm=0.27, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=8627 epoch 006: 970 / 1689 loss=3.948, nll_loss=2.435, ppl=5.41, wps=547946, ups=1.11, wpb=493156, bsz=16292.8, num_updates=9400, lr=0.000652328, gnorm=0.254, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=8717 epoch 006: 970 / 1689 loss=3.948, nll_loss=2.435, ppl=5.41, wps=547946, ups=1.11, wpb=493156, bsz=16292.8, num_updates=9400, lr=0.000652328, gnorm=0.254, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=8717 epoch 006: 970 / 1689 loss=3.948, nll_loss=2.435, ppl=5.41, wps=547946, ups=1.11, wpb=493156, bsz=16292.8, num_updates=9400, lr=0.000652328, gnorm=0.254, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=8717 epoch 006: 970 / 1689 loss=3.948, nll_loss=2.435, ppl=5.41, wps=547946, ups=1.11, wpb=493156, bsz=16292.8, num_updates=9400, lr=0.000652328, gnorm=0.254, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=8717 epoch 006: 970 / 1689 loss=3.948, nll_loss=2.435, ppl=5.41, wps=547946, ups=1.11, wpb=493156, bsz=16292.8, num_updates=9400, lr=0.000652328, gnorm=0.254, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=8717 epoch 006: 970 / 1689 loss=3.948, nll_loss=2.435, ppl=5.41, wps=547946, ups=1.11, wpb=493156, bsz=16292.8, num_updates=9400, lr=0.000652328, gnorm=0.254, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=8717 epoch 006: 1070 / 1689 loss=3.939, nll_loss=2.425, ppl=5.37, wps=550128, ups=1.11, wpb=494574, bsz=16490.6, num_updates=9500, lr=0.000648886, gnorm=0.247, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=8806 epoch 006: 1070 / 1689 loss=3.939, nll_loss=2.425, ppl=5.37, wps=550128, ups=1.11, wpb=494574, bsz=16490.6, num_updates=9500, lr=0.000648886, gnorm=0.247, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=8806 epoch 006: 1070 / 1689 loss=3.939, nll_loss=2.425, ppl=5.37, wps=550128, ups=1.11, wpb=494574, bsz=16490.6, num_updates=9500, lr=0.000648886, gnorm=0.247, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=8806 epoch 006: 1070 / 1689 loss=3.939, nll_loss=2.425, ppl=5.37, wps=550128, ups=1.11, wpb=494574, bsz=16490.6, num_updates=9500, lr=0.000648886, gnorm=0.247, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=8806 epoch 006: 1070 / 1689 loss=3.939, nll_loss=2.425, ppl=5.37, wps=550128, ups=1.11, wpb=494574, bsz=16490.6, num_updates=9500, lr=0.000648886, gnorm=0.247, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=8806 epoch 006: 1070 / 1689 loss=3.939, nll_loss=2.425, ppl=5.37, wps=550128, ups=1.11, wpb=494574, bsz=16490.6, num_updates=9500, lr=0.000648886, gnorm=0.247, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=8806 epoch 006: 1170 / 1689 loss=3.937, nll_loss=2.424, ppl=5.37, wps=552561, ups=1.12, wpb=494476, bsz=16754.3, num_updates=9600, lr=0.000645497, gnorm=0.254, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=8896 epoch 006: 1170 / 1689 loss=3.937, nll_loss=2.424, ppl=5.37, wps=552561, ups=1.12, wpb=494476, bsz=16754.3, num_updates=9600, lr=0.000645497, gnorm=0.254, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=8896 epoch 006: 1170 / 1689 loss=3.937, nll_loss=2.424, ppl=5.37, wps=552561, ups=1.12, wpb=494476, bsz=16754.3, num_updates=9600, lr=0.000645497, gnorm=0.254, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=8896 epoch 006: 1170 / 1689 loss=3.937, nll_loss=2.424, ppl=5.37, wps=552561, ups=1.12, wpb=494476, bsz=16754.3, num_updates=9600, lr=0.000645497, gnorm=0.254, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=8896 epoch 006: 1170 / 1689 loss=3.937, nll_loss=2.424, ppl=5.37, wps=552561, ups=1.12, wpb=494476, bsz=16754.3, num_updates=9600, lr=0.000645497, gnorm=0.254, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=8896 epoch 006: 1170 / 1689 loss=3.937, nll_loss=2.424, ppl=5.37, wps=552561, ups=1.12, wpb=494476, bsz=16754.3, num_updates=9600, lr=0.000645497, gnorm=0.254, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=8896 epoch 006: 1270 / 1689 loss=3.942, nll_loss=2.43, ppl=5.39, wps=553956, ups=1.12, wpb=495553, bsz=16170.1, num_updates=9700, lr=0.000642161, gnorm=0.252, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=8985 epoch 006: 1270 / 1689 loss=3.942, nll_loss=2.43, ppl=5.39, wps=553956, ups=1.12, wpb=495553, bsz=16170.1, num_updates=9700, lr=0.000642161, gnorm=0.252, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=8985 epoch 006: 1270 / 1689 loss=3.942, nll_loss=2.43, ppl=5.39, wps=553956, ups=1.12, wpb=495553, bsz=16170.1, num_updates=9700, lr=0.000642161, gnorm=0.252, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=8985 epoch 006: 1270 / 1689 loss=3.942, nll_loss=2.43, ppl=5.39, wps=553956, ups=1.12, wpb=495553, bsz=16170.1, num_updates=9700, lr=0.000642161, gnorm=0.252, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=8985 epoch 006: 1270 / 1689 loss=3.942, nll_loss=2.43, ppl=5.39, wps=553956, ups=1.12, wpb=495553, bsz=16170.1, num_updates=9700, lr=0.000642161, gnorm=0.252, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=8985 epoch 006: 1270 / 1689 loss=3.942, nll_loss=2.43, ppl=5.39, wps=553956, ups=1.12, wpb=495553, bsz=16170.1, num_updates=9700, lr=0.000642161, gnorm=0.252, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=8985 epoch 006: 1370 / 1689 loss=3.931, nll_loss=2.417, ppl=5.34, wps=552688, ups=1.11, wpb=497314, bsz=16725.4, num_updates=9800, lr=0.000638877, gnorm=0.252, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=9075 epoch 006: 1370 / 1689 loss=3.931, nll_loss=2.417, ppl=5.34, wps=552688, ups=1.11, wpb=497314, bsz=16725.4, num_updates=9800, lr=0.000638877, gnorm=0.252, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=9075 epoch 006: 1370 / 1689 loss=3.931, nll_loss=2.417, ppl=5.34, wps=552688, ups=1.11, wpb=497314, bsz=16725.4, num_updates=9800, lr=0.000638877, gnorm=0.252, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=9075 epoch 006: 1370 / 1689 loss=3.931, nll_loss=2.417, ppl=5.34, wps=552688, ups=1.11, wpb=497314, bsz=16725.4, num_updates=9800, lr=0.000638877, gnorm=0.252, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=9075 epoch 006: 1370 / 1689 loss=3.931, nll_loss=2.417, ppl=5.34, wps=552688, ups=1.11, wpb=497314, bsz=16725.4, num_updates=9800, lr=0.000638877, gnorm=0.252, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=9075 epoch 006: 1370 / 1689 loss=3.931, nll_loss=2.417, ppl=5.34, wps=552688, ups=1.11, wpb=497314, bsz=16725.4, num_updates=9800, lr=0.000638877, gnorm=0.252, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=9075 epoch 006: 1470 / 1689 loss=3.938, nll_loss=2.425, ppl=5.37, wps=549533, ups=1.11, wpb=494850, bsz=16358.3, num_updates=9900, lr=0.000635642, gnorm=0.242, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=9165 epoch 006: 1470 / 1689 loss=3.938, nll_loss=2.425, ppl=5.37, wps=549533, ups=1.11, wpb=494850, bsz=16358.3, num_updates=9900, lr=0.000635642, gnorm=0.242, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=9165 epoch 006: 1470 / 1689 loss=3.938, nll_loss=2.425, ppl=5.37, wps=549533, ups=1.11, wpb=494850, bsz=16358.3, num_updates=9900, lr=0.000635642, gnorm=0.242, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=9165 epoch 006: 1470 / 1689 loss=3.938, nll_loss=2.425, ppl=5.37, wps=549533, ups=1.11, wpb=494850, bsz=16358.3, num_updates=9900, lr=0.000635642, gnorm=0.242, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=9165 epoch 006: 1470 / 1689 loss=3.938, nll_loss=2.425, ppl=5.37, wps=549533, ups=1.11, wpb=494850, bsz=16358.3, num_updates=9900, lr=0.000635642, gnorm=0.242, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=9165 epoch 006: 1470 / 1689 loss=3.938, nll_loss=2.425, ppl=5.37, wps=549533, ups=1.11, wpb=494850, bsz=16358.3, num_updates=9900, lr=0.000635642, gnorm=0.242, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=9165 epoch 006: 1570 / 1689 loss=3.926, nll_loss=2.412, ppl=5.32, wps=546225, ups=1.1, wpb=495856, bsz=16750.3, num_updates=10000, lr=0.000632456, gnorm=0.238, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=9256 epoch 006: 1570 / 1689 loss=3.926, nll_loss=2.412, ppl=5.32, wps=546225, ups=1.1, wpb=495856, bsz=16750.3, num_updates=10000, lr=0.000632456, gnorm=0.238, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=9256 epoch 006: 1570 / 1689 loss=3.926, nll_loss=2.412, ppl=5.32, wps=546225, ups=1.1, wpb=495856, bsz=16750.3, num_updates=10000, lr=0.000632456, gnorm=0.238, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=9256 epoch 006: 1570 / 1689 loss=3.926, nll_loss=2.412, ppl=5.32, wps=546225, ups=1.1, wpb=495856, bsz=16750.3, num_updates=10000, lr=0.000632456, gnorm=0.238, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=9256 epoch 006: 1570 / 1689 loss=3.926, nll_loss=2.412, ppl=5.32, wps=546225, ups=1.1, wpb=495856, bsz=16750.3, num_updates=10000, lr=0.000632456, gnorm=0.238, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=9256 epoch 006: 1570 / 1689 loss=3.926, nll_loss=2.412, ppl=5.32, wps=546225, ups=1.1, wpb=495856, bsz=16750.3, num_updates=10000, lr=0.000632456, gnorm=0.238, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=9256 begin validation on "valid" subset epoch 006 | valid on 'valid' subset | loss 3.876 | nll_loss 2.293 | ppl 4.9 | wps 0 | wpb 44526 | bsz 2008 | num_updates 10000 | best_loss 3.876 epoch 006 | valid on 'valid' subset | loss 3.876 | nll_loss 2.293 | ppl 4.9 | wps 0 | wpb 44526 | bsz 2008 | num_updates 10000 | best_loss 3.876 epoch 006 | valid on 'valid' subset | loss 3.876 | nll_loss 2.293 | ppl 4.9 | wps 0 | wpb 44526 | bsz 2008 | num_updates 10000 | best_loss 3.876 epoch 006 | valid on 'valid' subset | loss 3.876 | nll_loss 2.293 | ppl 4.9 | wps 0 | wpb 44526 | bsz 2008 | num_updates 10000 | best_loss 3.876 epoch 006 | valid on 'valid' subset | loss 3.876 | nll_loss 2.293 | ppl 4.9 | wps 0 | wpb 44526 | bsz 2008 | num_updates 10000 | best_loss 3.876 epoch 006 | valid on 'valid' subset | loss 3.876 | nll_loss 2.293 | ppl 4.9 | wps 0 | wpb 44526 | bsz 2008 | num_updates 10000 | best_loss 3.876 epoch 006: 1670 / 1689 loss=3.929, nll_loss=2.415, ppl=5.33, wps=417183, ups=0.84, wpb=495675, bsz=16481.7, num_updates=10100, lr=0.000629317, gnorm=0.243, clip=0, loss_scale=4, train_wall=93, gb_free=21.3, wall=9375 epoch 006: 1670 / 1689 loss=3.929, nll_loss=2.415, ppl=5.33, wps=417183, ups=0.84, wpb=495675, bsz=16481.7, num_updates=10100, lr=0.000629317, gnorm=0.243, clip=0, loss_scale=4, train_wall=93, gb_free=21.3, wall=9375 epoch 006: 1670 / 1689 loss=3.929, nll_loss=2.415, ppl=5.33, wps=417183, ups=0.84, wpb=495675, bsz=16481.7, num_updates=10100, lr=0.000629317, gnorm=0.243, clip=0, loss_scale=4, train_wall=93, gb_free=21.3, wall=9375 epoch 006: 1670 / 1689 loss=3.929, nll_loss=2.415, ppl=5.33, wps=417183, ups=0.84, wpb=495675, bsz=16481.7, num_updates=10100, lr=0.000629317, gnorm=0.243, clip=0, loss_scale=4, train_wall=93, gb_free=21.3, wall=9375 epoch 006: 1670 / 1689 loss=3.929, nll_loss=2.415, ppl=5.33, wps=417183, ups=0.84, wpb=495675, bsz=16481.7, num_updates=10100, lr=0.000629317, gnorm=0.243, clip=0, loss_scale=4, train_wall=93, gb_free=21.3, wall=9375 epoch 006: 1670 / 1689 loss=3.929, nll_loss=2.415, ppl=5.33, wps=417183, ups=0.84, wpb=495675, bsz=16481.7, num_updates=10100, lr=0.000629317, gnorm=0.243, clip=0, loss_scale=4, train_wall=93, gb_free=21.3, wall=9375 end of epoch 6 (average epoch stats below) epoch 006 | loss 3.942 | nll_loss 2.428 | ppl 5.38 | wps 532521 | ups 1.08 | wpb 495123 | bsz 16505.9 | num_updates 10119 | lr 0.000628726 | gnorm 0.252 | clip 0 | loss_scale 4 | train_wall 1503 | gb_free 22.8 | wall 9391 epoch 006 | loss 3.942 | nll_loss 2.428 | ppl 5.38 | wps 532521 | ups 1.08 | wpb 495123 | bsz 16505.9 | num_updates 10119 | lr 0.000628726 | gnorm 0.252 | clip 0 | loss_scale 4 | train_wall 1503 | gb_free 22.8 | wall 9391 epoch 006 | loss 3.942 | nll_loss 2.428 | ppl 5.38 | wps 532521 | ups 1.08 | wpb 495123 | bsz 16505.9 | num_updates 10119 | lr 0.000628726 | gnorm 0.252 | clip 0 | loss_scale 4 | train_wall 1503 | gb_free 22.8 | wall 9391 epoch 006 | loss 3.942 | nll_loss 2.428 | ppl 5.38 | wps 532521 | ups 1.08 | wpb 495123 | bsz 16505.9 | num_updates 10119 | lr 0.000628726 | gnorm 0.252 | clip 0 | loss_scale 4 | train_wall 1503 | gb_free 22.8 | wall 9391 epoch 006 | loss 3.942 | nll_loss 2.428 | ppl 5.38 | wps 532521 | ups 1.08 | wpb 495123 | bsz 16505.9 | num_updates 10119 | lr 0.000628726 | gnorm 0.252 | clip 0 | loss_scale 4 | train_wall 1503 | gb_free 22.8 | wall 9391 epoch 006 | loss 3.942 | nll_loss 2.428 | ppl 5.38 | wps 532521 | ups 1.08 | wpb 495123 | bsz 16505.9 | num_updates 10119 | lr 0.000628726 | gnorm 0.252 | clip 0 | loss_scale 4 | train_wall 1503 | gb_free 22.8 | wall 9391 Start iterating over samples epoch 007: 81 / 1689 loss=3.913, nll_loss=2.396, ppl=5.26, wps=543713, ups=1.11, wpb=490479, bsz=16313.8, num_updates=10200, lr=0.000626224, gnorm=0.248, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=9465 epoch 007: 81 / 1689 loss=3.913, nll_loss=2.396, ppl=5.26, wps=543713, ups=1.11, wpb=490479, bsz=16313.8, num_updates=10200, lr=0.000626224, gnorm=0.248, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=9465 epoch 007: 81 / 1689 loss=3.913, nll_loss=2.396, ppl=5.26, wps=543713, ups=1.11, wpb=490479, bsz=16313.8, num_updates=10200, lr=0.000626224, gnorm=0.248, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=9465 epoch 007: 81 / 1689 loss=3.913, nll_loss=2.396, ppl=5.26, wps=543713, ups=1.11, wpb=490479, bsz=16313.8, num_updates=10200, lr=0.000626224, gnorm=0.248, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=9465 epoch 007: 81 / 1689 loss=3.913, nll_loss=2.396, ppl=5.26, wps=543713, ups=1.11, wpb=490479, bsz=16313.8, num_updates=10200, lr=0.000626224, gnorm=0.248, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=9465 epoch 007: 81 / 1689 loss=3.913, nll_loss=2.396, ppl=5.26, wps=543713, ups=1.11, wpb=490479, bsz=16313.8, num_updates=10200, lr=0.000626224, gnorm=0.248, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=9465 epoch 007: 81 / 1689 loss=3.913, nll_loss=2.396, ppl=5.26, wps=543713, ups=1.11, wpb=490479, bsz=16313.8, num_updates=10200, lr=0.000626224, gnorm=0.248, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=9465 epoch 007: 181 / 1689 loss=3.898, nll_loss=2.38, ppl=5.21, wps=546554, ups=1.1, wpb=495900, bsz=16682.6, num_updates=10300, lr=0.000623177, gnorm=0.236, clip=0, loss_scale=8, train_wall=89, gb_free=19.8, wall=9556 epoch 007: 181 / 1689 loss=3.898, nll_loss=2.38, ppl=5.21, wps=546554, ups=1.1, wpb=495900, bsz=16682.6, num_updates=10300, lr=0.000623177, gnorm=0.236, clip=0, loss_scale=8, train_wall=89, gb_free=19.8, wall=9556 epoch 007: 181 / 1689 loss=3.898, nll_loss=2.38, ppl=5.21, wps=546554, ups=1.1, wpb=495900, bsz=16682.6, num_updates=10300, lr=0.000623177, gnorm=0.236, clip=0, loss_scale=8, train_wall=89, gb_free=19.8, wall=9556 epoch 007: 181 / 1689 loss=3.898, nll_loss=2.38, ppl=5.21, wps=546554, ups=1.1, wpb=495900, bsz=16682.6, num_updates=10300, lr=0.000623177, gnorm=0.236, clip=0, loss_scale=8, train_wall=89, gb_free=19.8, wall=9556 epoch 007: 181 / 1689 loss=3.898, nll_loss=2.38, ppl=5.21, wps=546554, ups=1.1, wpb=495900, bsz=16682.6, num_updates=10300, lr=0.000623177, gnorm=0.236, clip=0, loss_scale=8, train_wall=89, gb_free=19.8, wall=9556 epoch 007: 181 / 1689 loss=3.898, nll_loss=2.38, ppl=5.21, wps=546554, ups=1.1, wpb=495900, bsz=16682.6, num_updates=10300, lr=0.000623177, gnorm=0.236, clip=0, loss_scale=8, train_wall=89, gb_free=19.8, wall=9556 epoch 007: 181 / 1689 loss=3.898, nll_loss=2.38, ppl=5.21, wps=546554, ups=1.1, wpb=495900, bsz=16682.6, num_updates=10300, lr=0.000623177, gnorm=0.236, clip=0, loss_scale=8, train_wall=89, gb_free=19.8, wall=9556 epoch 007: 282 / 1689 loss=3.904, nll_loss=2.386, ppl=5.23, wps=539614, ups=1.09, wpb=495332, bsz=16567, num_updates=10400, lr=0.000620174, gnorm=0.242, clip=0, loss_scale=4, train_wall=90, gb_free=22, wall=9648 epoch 007: 282 / 1689 loss=3.904, nll_loss=2.386, ppl=5.23, wps=539614, ups=1.09, wpb=495332, bsz=16567, num_updates=10400, lr=0.000620174, gnorm=0.242, clip=0, loss_scale=4, train_wall=90, gb_free=22, wall=9648 epoch 007: 282 / 1689 loss=3.904, nll_loss=2.386, ppl=5.23, wps=539614, ups=1.09, wpb=495332, bsz=16567, num_updates=10400, lr=0.000620174, gnorm=0.242, clip=0, loss_scale=4, train_wall=90, gb_free=22, wall=9648 epoch 007: 282 / 1689 loss=3.904, nll_loss=2.386, ppl=5.23, wps=539614, ups=1.09, wpb=495332, bsz=16567, num_updates=10400, lr=0.000620174, gnorm=0.242, clip=0, loss_scale=4, train_wall=90, gb_free=22, wall=9648 epoch 007: 282 / 1689 loss=3.904, nll_loss=2.386, ppl=5.23, wps=539614, ups=1.09, wpb=495332, bsz=16567, num_updates=10400, lr=0.000620174, gnorm=0.242, clip=0, loss_scale=4, train_wall=90, gb_free=22, wall=9648 epoch 007: 282 / 1689 loss=3.904, nll_loss=2.386, ppl=5.23, wps=539614, ups=1.09, wpb=495332, bsz=16567, num_updates=10400, lr=0.000620174, gnorm=0.242, clip=0, loss_scale=4, train_wall=90, gb_free=22, wall=9648 epoch 007: 282 / 1689 loss=3.904, nll_loss=2.386, ppl=5.23, wps=539614, ups=1.09, wpb=495332, bsz=16567, num_updates=10400, lr=0.000620174, gnorm=0.242, clip=0, loss_scale=4, train_wall=90, gb_free=22, wall=9648 epoch 007: 382 / 1689 loss=3.909, nll_loss=2.393, ppl=5.25, wps=551244, ups=1.11, wpb=495481, bsz=16103.1, num_updates=10500, lr=0.000617213, gnorm=0.242, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=9738 epoch 007: 382 / 1689 loss=3.909, nll_loss=2.393, ppl=5.25, wps=551244, ups=1.11, wpb=495481, bsz=16103.1, num_updates=10500, lr=0.000617213, gnorm=0.242, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=9738 epoch 007: 382 / 1689 loss=3.909, nll_loss=2.393, ppl=5.25, wps=551244, ups=1.11, wpb=495481, bsz=16103.1, num_updates=10500, lr=0.000617213, gnorm=0.242, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=9738 epoch 007: 382 / 1689 loss=3.909, nll_loss=2.393, ppl=5.25, wps=551244, ups=1.11, wpb=495481, bsz=16103.1, num_updates=10500, lr=0.000617213, gnorm=0.242, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=9738 epoch 007: 382 / 1689 loss=3.909, nll_loss=2.393, ppl=5.25, wps=551244, ups=1.11, wpb=495481, bsz=16103.1, num_updates=10500, lr=0.000617213, gnorm=0.242, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=9738 epoch 007: 382 / 1689 loss=3.909, nll_loss=2.393, ppl=5.25, wps=551244, ups=1.11, wpb=495481, bsz=16103.1, num_updates=10500, lr=0.000617213, gnorm=0.242, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=9738 epoch 007: 382 / 1689 loss=3.909, nll_loss=2.393, ppl=5.25, wps=551244, ups=1.11, wpb=495481, bsz=16103.1, num_updates=10500, lr=0.000617213, gnorm=0.242, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=9738 epoch 007: 482 / 1689 loss=3.912, nll_loss=2.396, ppl=5.26, wps=547305, ups=1.11, wpb=494968, bsz=16600.6, num_updates=10600, lr=0.000614295, gnorm=0.245, clip=0, loss_scale=4, train_wall=89, gb_free=20.4, wall=9828 epoch 007: 482 / 1689 loss=3.912, nll_loss=2.396, ppl=5.26, wps=547305, ups=1.11, wpb=494968, bsz=16600.6, num_updates=10600, lr=0.000614295, gnorm=0.245, clip=0, loss_scale=4, train_wall=89, gb_free=20.4, wall=9828 epoch 007: 482 / 1689 loss=3.912, nll_loss=2.396, ppl=5.26, wps=547305, ups=1.11, wpb=494968, bsz=16600.6, num_updates=10600, lr=0.000614295, gnorm=0.245, clip=0, loss_scale=4, train_wall=89, gb_free=20.4, wall=9828 epoch 007: 482 / 1689 loss=3.912, nll_loss=2.396, ppl=5.26, wps=547305, ups=1.11, wpb=494968, bsz=16600.6, num_updates=10600, lr=0.000614295, gnorm=0.245, clip=0, loss_scale=4, train_wall=89, gb_free=20.4, wall=9828 epoch 007: 482 / 1689 loss=3.912, nll_loss=2.396, ppl=5.26, wps=547305, ups=1.11, wpb=494968, bsz=16600.6, num_updates=10600, lr=0.000614295, gnorm=0.245, clip=0, loss_scale=4, train_wall=89, gb_free=20.4, wall=9828 epoch 007: 482 / 1689 loss=3.912, nll_loss=2.396, ppl=5.26, wps=547305, ups=1.11, wpb=494968, bsz=16600.6, num_updates=10600, lr=0.000614295, gnorm=0.245, clip=0, loss_scale=4, train_wall=89, gb_free=20.4, wall=9828 epoch 007: 482 / 1689 loss=3.912, nll_loss=2.396, ppl=5.26, wps=547305, ups=1.11, wpb=494968, bsz=16600.6, num_updates=10600, lr=0.000614295, gnorm=0.245, clip=0, loss_scale=4, train_wall=89, gb_free=20.4, wall=9828 epoch 007: 582 / 1689 loss=3.904, nll_loss=2.388, ppl=5.23, wps=549990, ups=1.11, wpb=496079, bsz=16431.6, num_updates=10700, lr=0.000611418, gnorm=0.243, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=9918 epoch 007: 582 / 1689 loss=3.904, nll_loss=2.388, ppl=5.23, wps=549990, ups=1.11, wpb=496079, bsz=16431.6, num_updates=10700, lr=0.000611418, gnorm=0.243, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=9918 epoch 007: 582 / 1689 loss=3.904, nll_loss=2.388, ppl=5.23, wps=549990, ups=1.11, wpb=496079, bsz=16431.6, num_updates=10700, lr=0.000611418, gnorm=0.243, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=9918 epoch 007: 582 / 1689 loss=3.904, nll_loss=2.388, ppl=5.23, wps=549990, ups=1.11, wpb=496079, bsz=16431.6, num_updates=10700, lr=0.000611418, gnorm=0.243, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=9918 epoch 007: 582 / 1689 loss=3.904, nll_loss=2.388, ppl=5.23, wps=549990, ups=1.11, wpb=496079, bsz=16431.6, num_updates=10700, lr=0.000611418, gnorm=0.243, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=9918 epoch 007: 582 / 1689 loss=3.904, nll_loss=2.388, ppl=5.23, wps=549990, ups=1.11, wpb=496079, bsz=16431.6, num_updates=10700, lr=0.000611418, gnorm=0.243, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=9918 epoch 007: 582 / 1689 loss=3.904, nll_loss=2.388, ppl=5.23, wps=549990, ups=1.11, wpb=496079, bsz=16431.6, num_updates=10700, lr=0.000611418, gnorm=0.243, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=9918 epoch 007: 682 / 1689 loss=3.907, nll_loss=2.39, ppl=5.24, wps=551530, ups=1.11, wpb=494892, bsz=16297.5, num_updates=10800, lr=0.000608581, gnorm=0.238, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=10008 epoch 007: 682 / 1689 loss=3.907, nll_loss=2.39, ppl=5.24, wps=551530, ups=1.11, wpb=494892, bsz=16297.5, num_updates=10800, lr=0.000608581, gnorm=0.238, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=10008 epoch 007: 682 / 1689 loss=3.907, nll_loss=2.39, ppl=5.24, wps=551530, ups=1.11, wpb=494892, bsz=16297.5, num_updates=10800, lr=0.000608581, gnorm=0.238, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=10008 epoch 007: 682 / 1689 loss=3.907, nll_loss=2.39, ppl=5.24, wps=551530, ups=1.11, wpb=494892, bsz=16297.5, num_updates=10800, lr=0.000608581, gnorm=0.238, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=10008 epoch 007: 682 / 1689 loss=3.907, nll_loss=2.39, ppl=5.24, wps=551530, ups=1.11, wpb=494892, bsz=16297.5, num_updates=10800, lr=0.000608581, gnorm=0.238, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=10008 epoch 007: 682 / 1689 loss=3.907, nll_loss=2.39, ppl=5.24, wps=551530, ups=1.11, wpb=494892, bsz=16297.5, num_updates=10800, lr=0.000608581, gnorm=0.238, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=10008 epoch 007: 682 / 1689 loss=3.907, nll_loss=2.39, ppl=5.24, wps=551530, ups=1.11, wpb=494892, bsz=16297.5, num_updates=10800, lr=0.000608581, gnorm=0.238, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=10008 epoch 007: 782 / 1689 loss=3.905, nll_loss=2.389, ppl=5.24, wps=547475, ups=1.11, wpb=494334, bsz=16265.9, num_updates=10900, lr=0.000605783, gnorm=0.232, clip=0, loss_scale=8, train_wall=88, gb_free=21.1, wall=10098 epoch 007: 782 / 1689 loss=3.905, nll_loss=2.389, ppl=5.24, wps=547475, ups=1.11, wpb=494334, bsz=16265.9, num_updates=10900, lr=0.000605783, gnorm=0.232, clip=0, loss_scale=8, train_wall=88, gb_free=21.1, wall=10098 epoch 007: 782 / 1689 loss=3.905, nll_loss=2.389, ppl=5.24, wps=547475, ups=1.11, wpb=494334, bsz=16265.9, num_updates=10900, lr=0.000605783, gnorm=0.232, clip=0, loss_scale=8, train_wall=88, gb_free=21.1, wall=10098 epoch 007: 782 / 1689 loss=3.905, nll_loss=2.389, ppl=5.24, wps=547475, ups=1.11, wpb=494334, bsz=16265.9, num_updates=10900, lr=0.000605783, gnorm=0.232, clip=0, loss_scale=8, train_wall=88, gb_free=21.1, wall=10098 epoch 007: 782 / 1689 loss=3.905, nll_loss=2.389, ppl=5.24, wps=547475, ups=1.11, wpb=494334, bsz=16265.9, num_updates=10900, lr=0.000605783, gnorm=0.232, clip=0, loss_scale=8, train_wall=88, gb_free=21.1, wall=10098 epoch 007: 782 / 1689 loss=3.905, nll_loss=2.389, ppl=5.24, wps=547475, ups=1.11, wpb=494334, bsz=16265.9, num_updates=10900, lr=0.000605783, gnorm=0.232, clip=0, loss_scale=8, train_wall=88, gb_free=21.1, wall=10098 epoch 007: 782 / 1689 loss=3.905, nll_loss=2.389, ppl=5.24, wps=547475, ups=1.11, wpb=494334, bsz=16265.9, num_updates=10900, lr=0.000605783, gnorm=0.232, clip=0, loss_scale=8, train_wall=88, gb_free=21.1, wall=10098 epoch 007: 882 / 1689 loss=3.903, nll_loss=2.387, ppl=5.23, wps=547634, ups=1.11, wpb=494533, bsz=16442.2, num_updates=11000, lr=0.000603023, gnorm=0.24, clip=0, loss_scale=8, train_wall=89, gb_free=21.8, wall=10189 epoch 007: 882 / 1689 loss=3.903, nll_loss=2.387, ppl=5.23, wps=547634, ups=1.11, wpb=494533, bsz=16442.2, num_updates=11000, lr=0.000603023, gnorm=0.24, clip=0, loss_scale=8, train_wall=89, gb_free=21.8, wall=10189 epoch 007: 882 / 1689 loss=3.903, nll_loss=2.387, ppl=5.23, wps=547634, ups=1.11, wpb=494533, bsz=16442.2, num_updates=11000, lr=0.000603023, gnorm=0.24, clip=0, loss_scale=8, train_wall=89, gb_free=21.8, wall=10189 epoch 007: 882 / 1689 loss=3.903, nll_loss=2.387, ppl=5.23, wps=547634, ups=1.11, wpb=494533, bsz=16442.2, num_updates=11000, lr=0.000603023, gnorm=0.24, clip=0, loss_scale=8, train_wall=89, gb_free=21.8, wall=10189 epoch 007: 882 / 1689 loss=3.903, nll_loss=2.387, ppl=5.23, wps=547634, ups=1.11, wpb=494533, bsz=16442.2, num_updates=11000, lr=0.000603023, gnorm=0.24, clip=0, loss_scale=8, train_wall=89, gb_free=21.8, wall=10189 epoch 007: 882 / 1689 loss=3.903, nll_loss=2.387, ppl=5.23, wps=547634, ups=1.11, wpb=494533, bsz=16442.2, num_updates=11000, lr=0.000603023, gnorm=0.24, clip=0, loss_scale=8, train_wall=89, gb_free=21.8, wall=10189 epoch 007: 882 / 1689 loss=3.903, nll_loss=2.387, ppl=5.23, wps=547634, ups=1.11, wpb=494533, bsz=16442.2, num_updates=11000, lr=0.000603023, gnorm=0.24, clip=0, loss_scale=8, train_wall=89, gb_free=21.8, wall=10189 begin validation on "valid" subset epoch 007 | valid on 'valid' subset | loss 3.862 | nll_loss 2.285 | ppl 4.87 | wps 0 | wpb 44526 | bsz 2008 | num_updates 11000 | best_loss 3.862 epoch 007 | valid on 'valid' subset | loss 3.862 | nll_loss 2.285 | ppl 4.87 | wps 0 | wpb 44526 | bsz 2008 | num_updates 11000 | best_loss 3.862 epoch 007 | valid on 'valid' subset | loss 3.862 | nll_loss 2.285 | ppl 4.87 | wps 0 | wpb 44526 | bsz 2008 | num_updates 11000 | best_loss 3.862 epoch 007 | valid on 'valid' subset | loss 3.862 | nll_loss 2.285 | ppl 4.87 | wps 0 | wpb 44526 | bsz 2008 | num_updates 11000 | best_loss 3.862 epoch 007 | valid on 'valid' subset | loss 3.862 | nll_loss 2.285 | ppl 4.87 | wps 0 | wpb 44526 | bsz 2008 | num_updates 11000 | best_loss 3.862 epoch 007 | valid on 'valid' subset | loss 3.862 | nll_loss 2.285 | ppl 4.87 | wps 0 | wpb 44526 | bsz 2008 | num_updates 11000 | best_loss 3.862 epoch 007 | valid on 'valid' subset | loss 3.862 | nll_loss 2.285 | ppl 4.87 | wps 0 | wpb 44526 | bsz 2008 | num_updates 11000 | best_loss 3.862 epoch 007: 984 / 1689 loss=3.902, nll_loss=2.385, ppl=5.22, wps=352267, ups=0.71, wpb=496532, bsz=16175.7, num_updates=11100, lr=0.0006003, gnorm=0.237, clip=0, loss_scale=2, train_wall=115, gb_free=21.8, wall=10330 epoch 007: 984 / 1689 loss=3.902, nll_loss=2.385, ppl=5.22, wps=352267, ups=0.71, wpb=496532, bsz=16175.7, num_updates=11100, lr=0.0006003, gnorm=0.237, clip=0, loss_scale=2, train_wall=115, gb_free=21.8, wall=10330 epoch 007: 984 / 1689 loss=3.902, nll_loss=2.385, ppl=5.22, wps=352267, ups=0.71, wpb=496532, bsz=16175.7, num_updates=11100, lr=0.0006003, gnorm=0.237, clip=0, loss_scale=2, train_wall=115, gb_free=21.8, wall=10330 epoch 007: 984 / 1689 loss=3.902, nll_loss=2.385, ppl=5.22, wps=352267, ups=0.71, wpb=496532, bsz=16175.7, num_updates=11100, lr=0.0006003, gnorm=0.237, clip=0, loss_scale=2, train_wall=115, gb_free=21.8, wall=10330 epoch 007: 984 / 1689 loss=3.902, nll_loss=2.385, ppl=5.22, wps=352267, ups=0.71, wpb=496532, bsz=16175.7, num_updates=11100, lr=0.0006003, gnorm=0.237, clip=0, loss_scale=2, train_wall=115, gb_free=21.8, wall=10330 epoch 007: 984 / 1689 loss=3.902, nll_loss=2.385, ppl=5.22, wps=352267, ups=0.71, wpb=496532, bsz=16175.7, num_updates=11100, lr=0.0006003, gnorm=0.237, clip=0, loss_scale=2, train_wall=115, gb_free=21.8, wall=10330 epoch 007: 984 / 1689 loss=3.902, nll_loss=2.385, ppl=5.22, wps=352267, ups=0.71, wpb=496532, bsz=16175.7, num_updates=11100, lr=0.0006003, gnorm=0.237, clip=0, loss_scale=2, train_wall=115, gb_free=21.8, wall=10330 epoch 007: 1084 / 1689 loss=3.901, nll_loss=2.385, ppl=5.22, wps=549012, ups=1.11, wpb=496065, bsz=16892.3, num_updates=11200, lr=0.000597614, gnorm=0.234, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=10420 epoch 007: 1084 / 1689 loss=3.901, nll_loss=2.385, ppl=5.22, wps=549012, ups=1.11, wpb=496065, bsz=16892.3, num_updates=11200, lr=0.000597614, gnorm=0.234, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=10420 epoch 007: 1084 / 1689 loss=3.901, nll_loss=2.385, ppl=5.22, wps=549012, ups=1.11, wpb=496065, bsz=16892.3, num_updates=11200, lr=0.000597614, gnorm=0.234, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=10420 epoch 007: 1084 / 1689 loss=3.901, nll_loss=2.385, ppl=5.22, wps=549012, ups=1.11, wpb=496065, bsz=16892.3, num_updates=11200, lr=0.000597614, gnorm=0.234, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=10420 epoch 007: 1084 / 1689 loss=3.901, nll_loss=2.385, ppl=5.22, wps=549012, ups=1.11, wpb=496065, bsz=16892.3, num_updates=11200, lr=0.000597614, gnorm=0.234, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=10420 epoch 007: 1084 / 1689 loss=3.901, nll_loss=2.385, ppl=5.22, wps=549012, ups=1.11, wpb=496065, bsz=16892.3, num_updates=11200, lr=0.000597614, gnorm=0.234, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=10420 epoch 007: 1084 / 1689 loss=3.901, nll_loss=2.385, ppl=5.22, wps=549012, ups=1.11, wpb=496065, bsz=16892.3, num_updates=11200, lr=0.000597614, gnorm=0.234, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=10420 epoch 007: 1184 / 1689 loss=3.899, nll_loss=2.383, ppl=5.21, wps=549067, ups=1.11, wpb=496073, bsz=16701.8, num_updates=11300, lr=0.000594964, gnorm=0.229, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=10510 epoch 007: 1184 / 1689 loss=3.899, nll_loss=2.383, ppl=5.21, wps=549067, ups=1.11, wpb=496073, bsz=16701.8, num_updates=11300, lr=0.000594964, gnorm=0.229, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=10510 epoch 007: 1184 / 1689 loss=3.899, nll_loss=2.383, ppl=5.21, wps=549067, ups=1.11, wpb=496073, bsz=16701.8, num_updates=11300, lr=0.000594964, gnorm=0.229, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=10510 epoch 007: 1184 / 1689 loss=3.899, nll_loss=2.383, ppl=5.21, wps=549067, ups=1.11, wpb=496073, bsz=16701.8, num_updates=11300, lr=0.000594964, gnorm=0.229, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=10510 epoch 007: 1184 / 1689 loss=3.899, nll_loss=2.383, ppl=5.21, wps=549067, ups=1.11, wpb=496073, bsz=16701.8, num_updates=11300, lr=0.000594964, gnorm=0.229, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=10510 epoch 007: 1184 / 1689 loss=3.899, nll_loss=2.383, ppl=5.21, wps=549067, ups=1.11, wpb=496073, bsz=16701.8, num_updates=11300, lr=0.000594964, gnorm=0.229, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=10510 epoch 007: 1184 / 1689 loss=3.899, nll_loss=2.383, ppl=5.21, wps=549067, ups=1.11, wpb=496073, bsz=16701.8, num_updates=11300, lr=0.000594964, gnorm=0.229, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=10510 epoch 007: 1284 / 1689 loss=3.901, nll_loss=2.385, ppl=5.22, wps=550941, ups=1.11, wpb=495333, bsz=16460.6, num_updates=11400, lr=0.000592349, gnorm=0.227, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=10600 epoch 007: 1284 / 1689 loss=3.901, nll_loss=2.385, ppl=5.22, wps=550941, ups=1.11, wpb=495333, bsz=16460.6, num_updates=11400, lr=0.000592349, gnorm=0.227, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=10600 epoch 007: 1284 / 1689 loss=3.901, nll_loss=2.385, ppl=5.22, wps=550941, ups=1.11, wpb=495333, bsz=16460.6, num_updates=11400, lr=0.000592349, gnorm=0.227, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=10600 epoch 007: 1284 / 1689 loss=3.901, nll_loss=2.385, ppl=5.22, wps=550941, ups=1.11, wpb=495333, bsz=16460.6, num_updates=11400, lr=0.000592349, gnorm=0.227, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=10600 epoch 007: 1284 / 1689 loss=3.901, nll_loss=2.385, ppl=5.22, wps=550941, ups=1.11, wpb=495333, bsz=16460.6, num_updates=11400, lr=0.000592349, gnorm=0.227, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=10600 epoch 007: 1284 / 1689 loss=3.901, nll_loss=2.385, ppl=5.22, wps=550941, ups=1.11, wpb=495333, bsz=16460.6, num_updates=11400, lr=0.000592349, gnorm=0.227, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=10600 epoch 007: 1284 / 1689 loss=3.901, nll_loss=2.385, ppl=5.22, wps=550941, ups=1.11, wpb=495333, bsz=16460.6, num_updates=11400, lr=0.000592349, gnorm=0.227, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=10600 epoch 007: 1384 / 1689 loss=3.904, nll_loss=2.388, ppl=5.24, wps=546630, ups=1.1, wpb=494915, bsz=16422.7, num_updates=11500, lr=0.000589768, gnorm=0.24, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=10691 epoch 007: 1384 / 1689 loss=3.904, nll_loss=2.388, ppl=5.24, wps=546630, ups=1.1, wpb=494915, bsz=16422.7, num_updates=11500, lr=0.000589768, gnorm=0.24, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=10691 epoch 007: 1384 / 1689 loss=3.904, nll_loss=2.388, ppl=5.24, wps=546630, ups=1.1, wpb=494915, bsz=16422.7, num_updates=11500, lr=0.000589768, gnorm=0.24, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=10691 epoch 007: 1384 / 1689 loss=3.904, nll_loss=2.388, ppl=5.24, wps=546630, ups=1.1, wpb=494915, bsz=16422.7, num_updates=11500, lr=0.000589768, gnorm=0.24, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=10691 epoch 007: 1384 / 1689 loss=3.904, nll_loss=2.388, ppl=5.24, wps=546630, ups=1.1, wpb=494915, bsz=16422.7, num_updates=11500, lr=0.000589768, gnorm=0.24, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=10691 epoch 007: 1384 / 1689 loss=3.904, nll_loss=2.388, ppl=5.24, wps=546630, ups=1.1, wpb=494915, bsz=16422.7, num_updates=11500, lr=0.000589768, gnorm=0.24, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=10691 epoch 007: 1384 / 1689 loss=3.904, nll_loss=2.388, ppl=5.24, wps=546630, ups=1.1, wpb=494915, bsz=16422.7, num_updates=11500, lr=0.000589768, gnorm=0.24, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=10691 epoch 007: 1484 / 1689 loss=3.89, nll_loss=2.372, ppl=5.18, wps=548620, ups=1.11, wpb=496070, bsz=16661.9, num_updates=11600, lr=0.00058722, gnorm=0.224, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=10781 epoch 007: 1484 / 1689 loss=3.89, nll_loss=2.372, ppl=5.18, wps=548620, ups=1.11, wpb=496070, bsz=16661.9, num_updates=11600, lr=0.00058722, gnorm=0.224, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=10781 epoch 007: 1484 / 1689 loss=3.89, nll_loss=2.372, ppl=5.18, wps=548620, ups=1.11, wpb=496070, bsz=16661.9, num_updates=11600, lr=0.00058722, gnorm=0.224, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=10781 epoch 007: 1484 / 1689 loss=3.89, nll_loss=2.372, ppl=5.18, wps=548620, ups=1.11, wpb=496070, bsz=16661.9, num_updates=11600, lr=0.00058722, gnorm=0.224, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=10781 epoch 007: 1484 / 1689 loss=3.89, nll_loss=2.372, ppl=5.18, wps=548620, ups=1.11, wpb=496070, bsz=16661.9, num_updates=11600, lr=0.00058722, gnorm=0.224, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=10781 epoch 007: 1484 / 1689 loss=3.89, nll_loss=2.372, ppl=5.18, wps=548620, ups=1.11, wpb=496070, bsz=16661.9, num_updates=11600, lr=0.00058722, gnorm=0.224, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=10781 epoch 007: 1484 / 1689 loss=3.89, nll_loss=2.372, ppl=5.18, wps=548620, ups=1.11, wpb=496070, bsz=16661.9, num_updates=11600, lr=0.00058722, gnorm=0.224, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=10781 epoch 007: 1584 / 1689 loss=3.887, nll_loss=2.37, ppl=5.17, wps=548373, ups=1.11, wpb=495895, bsz=16985.8, num_updates=11700, lr=0.000584705, gnorm=0.22, clip=0, loss_scale=4, train_wall=89, gb_free=22.5, wall=10872 epoch 007: 1584 / 1689 loss=3.887, nll_loss=2.37, ppl=5.17, wps=548373, ups=1.11, wpb=495895, bsz=16985.8, num_updates=11700, lr=0.000584705, gnorm=0.22, clip=0, loss_scale=4, train_wall=89, gb_free=22.5, wall=10872 epoch 007: 1584 / 1689 loss=3.887, nll_loss=2.37, ppl=5.17, wps=548373, ups=1.11, wpb=495895, bsz=16985.8, num_updates=11700, lr=0.000584705, gnorm=0.22, clip=0, loss_scale=4, train_wall=89, gb_free=22.5, wall=10872 epoch 007: 1584 / 1689 loss=3.887, nll_loss=2.37, ppl=5.17, wps=548373, ups=1.11, wpb=495895, bsz=16985.8, num_updates=11700, lr=0.000584705, gnorm=0.22, clip=0, loss_scale=4, train_wall=89, gb_free=22.5, wall=10872 epoch 007: 1584 / 1689 loss=3.887, nll_loss=2.37, ppl=5.17, wps=548373, ups=1.11, wpb=495895, bsz=16985.8, num_updates=11700, lr=0.000584705, gnorm=0.22, clip=0, loss_scale=4, train_wall=89, gb_free=22.5, wall=10872 epoch 007: 1584 / 1689 loss=3.887, nll_loss=2.37, ppl=5.17, wps=548373, ups=1.11, wpb=495895, bsz=16985.8, num_updates=11700, lr=0.000584705, gnorm=0.22, clip=0, loss_scale=4, train_wall=89, gb_free=22.5, wall=10872 epoch 007: 1584 / 1689 loss=3.887, nll_loss=2.37, ppl=5.17, wps=548373, ups=1.11, wpb=495895, bsz=16985.8, num_updates=11700, lr=0.000584705, gnorm=0.22, clip=0, loss_scale=4, train_wall=89, gb_free=22.5, wall=10872 epoch 007: 1684 / 1689 loss=3.888, nll_loss=2.37, ppl=5.17, wps=544815, ups=1.1, wpb=494822, bsz=16502.3, num_updates=11800, lr=0.000582223, gnorm=0.236, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=10962 epoch 007: 1684 / 1689 loss=3.888, nll_loss=2.37, ppl=5.17, wps=544815, ups=1.1, wpb=494822, bsz=16502.3, num_updates=11800, lr=0.000582223, gnorm=0.236, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=10962 epoch 007: 1684 / 1689 loss=3.888, nll_loss=2.37, ppl=5.17, wps=544815, ups=1.1, wpb=494822, bsz=16502.3, num_updates=11800, lr=0.000582223, gnorm=0.236, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=10962 epoch 007: 1684 / 1689 loss=3.888, nll_loss=2.37, ppl=5.17, wps=544815, ups=1.1, wpb=494822, bsz=16502.3, num_updates=11800, lr=0.000582223, gnorm=0.236, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=10962 epoch 007: 1684 / 1689 loss=3.888, nll_loss=2.37, ppl=5.17, wps=544815, ups=1.1, wpb=494822, bsz=16502.3, num_updates=11800, lr=0.000582223, gnorm=0.236, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=10962 epoch 007: 1684 / 1689 loss=3.888, nll_loss=2.37, ppl=5.17, wps=544815, ups=1.1, wpb=494822, bsz=16502.3, num_updates=11800, lr=0.000582223, gnorm=0.236, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=10962 epoch 007: 1684 / 1689 loss=3.888, nll_loss=2.37, ppl=5.17, wps=544815, ups=1.1, wpb=494822, bsz=16502.3, num_updates=11800, lr=0.000582223, gnorm=0.236, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=10962 end of epoch 7 (average epoch stats below) epoch 007 | loss 3.901 | nll_loss 2.385 | ppl 5.22 | wps 529958 | ups 1.07 | wpb 495113 | bsz 16501.2 | num_updates 11805 | lr 0.000582099 | gnorm 0.236 | clip 0 | loss_scale 4 | train_wall 1525 | gb_free 23 | wall 10966 epoch 007 | loss 3.901 | nll_loss 2.385 | ppl 5.22 | wps 529958 | ups 1.07 | wpb 495113 | bsz 16501.2 | num_updates 11805 | lr 0.000582099 | gnorm 0.236 | clip 0 | loss_scale 4 | train_wall 1525 | gb_free 23 | wall 10966 epoch 007 | loss 3.901 | nll_loss 2.385 | ppl 5.22 | wps 529958 | ups 1.07 | wpb 495113 | bsz 16501.2 | num_updates 11805 | lr 0.000582099 | gnorm 0.236 | clip 0 | loss_scale 4 | train_wall 1525 | gb_free 23 | wall 10966 epoch 007 | loss 3.901 | nll_loss 2.385 | ppl 5.22 | wps 529958 | ups 1.07 | wpb 495113 | bsz 16501.2 | num_updates 11805 | lr 0.000582099 | gnorm 0.236 | clip 0 | loss_scale 4 | train_wall 1525 | gb_free 23 | wall 10966 epoch 007 | loss 3.901 | nll_loss 2.385 | ppl 5.22 | wps 529958 | ups 1.07 | wpb 495113 | bsz 16501.2 | num_updates 11805 | lr 0.000582099 | gnorm 0.236 | clip 0 | loss_scale 4 | train_wall 1525 | gb_free 23 | wall 10966 epoch 007 | loss 3.901 | nll_loss 2.385 | ppl 5.22 | wps 529958 | ups 1.07 | wpb 495113 | bsz 16501.2 | num_updates 11805 | lr 0.000582099 | gnorm 0.236 | clip 0 | loss_scale 4 | train_wall 1525 | gb_free 23 | wall 10966 epoch 007 | loss 3.901 | nll_loss 2.385 | ppl 5.22 | wps 529958 | ups 1.07 | wpb 495113 | bsz 16501.2 | num_updates 11805 | lr 0.000582099 | gnorm 0.236 | clip 0 | loss_scale 4 | train_wall 1525 | gb_free 23 | wall 10966 Start iterating over samples epoch 008: 95 / 1689 loss=3.866, nll_loss=2.345, ppl=5.08, wps=541493, ups=1.1, wpb=491425, bsz=16580.3, num_updates=11900, lr=0.000579771, gnorm=0.241, clip=0, loss_scale=4, train_wall=88, gb_free=20.3, wall=11053 epoch 008: 95 / 1689 loss=3.866, nll_loss=2.345, ppl=5.08, wps=541493, ups=1.1, wpb=491425, bsz=16580.3, num_updates=11900, lr=0.000579771, gnorm=0.241, clip=0, loss_scale=4, train_wall=88, gb_free=20.3, wall=11053 epoch 008: 95 / 1689 loss=3.866, nll_loss=2.345, ppl=5.08, wps=541493, ups=1.1, wpb=491425, bsz=16580.3, num_updates=11900, lr=0.000579771, gnorm=0.241, clip=0, loss_scale=4, train_wall=88, gb_free=20.3, wall=11053 epoch 008: 95 / 1689 loss=3.866, nll_loss=2.345, ppl=5.08, wps=541493, ups=1.1, wpb=491425, bsz=16580.3, num_updates=11900, lr=0.000579771, gnorm=0.241, clip=0, loss_scale=4, train_wall=88, gb_free=20.3, wall=11053 epoch 008: 95 / 1689 loss=3.866, nll_loss=2.345, ppl=5.08, wps=541493, ups=1.1, wpb=491425, bsz=16580.3, num_updates=11900, lr=0.000579771, gnorm=0.241, clip=0, loss_scale=4, train_wall=88, gb_free=20.3, wall=11053 epoch 008: 95 / 1689 loss=3.866, nll_loss=2.345, ppl=5.08, wps=541493, ups=1.1, wpb=491425, bsz=16580.3, num_updates=11900, lr=0.000579771, gnorm=0.241, clip=0, loss_scale=4, train_wall=88, gb_free=20.3, wall=11053 epoch 008: 95 / 1689 loss=3.866, nll_loss=2.345, ppl=5.08, wps=541493, ups=1.1, wpb=491425, bsz=16580.3, num_updates=11900, lr=0.000579771, gnorm=0.241, clip=0, loss_scale=4, train_wall=88, gb_free=20.3, wall=11053 epoch 008: 95 / 1689 loss=3.866, nll_loss=2.345, ppl=5.08, wps=541493, ups=1.1, wpb=491425, bsz=16580.3, num_updates=11900, lr=0.000579771, gnorm=0.241, clip=0, loss_scale=4, train_wall=88, gb_free=20.3, wall=11053 epoch 008: 195 / 1689 loss=3.869, nll_loss=2.349, ppl=5.09, wps=547208, ups=1.11, wpb=494513, bsz=16693.9, num_updates=12000, lr=0.00057735, gnorm=0.24, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=11144 epoch 008: 195 / 1689 loss=3.869, nll_loss=2.349, ppl=5.09, wps=547208, ups=1.11, wpb=494513, bsz=16693.9, num_updates=12000, lr=0.00057735, gnorm=0.24, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=11144 epoch 008: 195 / 1689 loss=3.869, nll_loss=2.349, ppl=5.09, wps=547208, ups=1.11, wpb=494513, bsz=16693.9, num_updates=12000, lr=0.00057735, gnorm=0.24, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=11144 epoch 008: 195 / 1689 loss=3.869, nll_loss=2.349, ppl=5.09, wps=547208, ups=1.11, wpb=494513, bsz=16693.9, num_updates=12000, lr=0.00057735, gnorm=0.24, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=11144 epoch 008: 195 / 1689 loss=3.869, nll_loss=2.349, ppl=5.09, wps=547208, ups=1.11, wpb=494513, bsz=16693.9, num_updates=12000, lr=0.00057735, gnorm=0.24, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=11144 epoch 008: 195 / 1689 loss=3.869, nll_loss=2.349, ppl=5.09, wps=547208, ups=1.11, wpb=494513, bsz=16693.9, num_updates=12000, lr=0.00057735, gnorm=0.24, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=11144 epoch 008: 195 / 1689 loss=3.869, nll_loss=2.349, ppl=5.09, wps=547208, ups=1.11, wpb=494513, bsz=16693.9, num_updates=12000, lr=0.00057735, gnorm=0.24, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=11144 epoch 008: 195 / 1689 loss=3.869, nll_loss=2.349, ppl=5.09, wps=547208, ups=1.11, wpb=494513, bsz=16693.9, num_updates=12000, lr=0.00057735, gnorm=0.24, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=11144 begin validation on "valid" subset epoch 008 | valid on 'valid' subset | loss 3.849 | nll_loss 2.268 | ppl 4.82 | wps 0 | wpb 44526 | bsz 2008 | num_updates 12000 | best_loss 3.849 epoch 008 | valid on 'valid' subset | loss 3.849 | nll_loss 2.268 | ppl 4.82 | wps 0 | wpb 44526 | bsz 2008 | num_updates 12000 | best_loss 3.849 epoch 008 | valid on 'valid' subset | loss 3.849 | nll_loss 2.268 | ppl 4.82 | wps 0 | wpb 44526 | bsz 2008 | num_updates 12000 | best_loss 3.849 epoch 008 | valid on 'valid' subset | loss 3.849 | nll_loss 2.268 | ppl 4.82 | wps 0 | wpb 44526 | bsz 2008 | num_updates 12000 | best_loss 3.849 epoch 008 | valid on 'valid' subset | loss 3.849 | nll_loss 2.268 | ppl 4.82 | wps 0 | wpb 44526 | bsz 2008 | num_updates 12000 | best_loss 3.849 epoch 008 | valid on 'valid' subset | loss 3.849 | nll_loss 2.268 | ppl 4.82 | wps 0 | wpb 44526 | bsz 2008 | num_updates 12000 | best_loss 3.849 epoch 008 | valid on 'valid' subset | loss 3.849 | nll_loss 2.268 | ppl 4.82 | wps 0 | wpb 44526 | bsz 2008 | num_updates 12000 | best_loss 3.849 epoch 008 | valid on 'valid' subset | loss 3.849 | nll_loss 2.268 | ppl 4.82 | wps 0 | wpb 44526 | bsz 2008 | num_updates 12000 | best_loss 3.849 epoch 008: 295 / 1689 loss=3.871, nll_loss=2.352, ppl=5.1, wps=451893, ups=0.91, wpb=495319, bsz=16517.9, num_updates=12100, lr=0.00057496, gnorm=0.23, clip=0, loss_scale=4, train_wall=89, gb_free=20.4, wall=11253 epoch 008: 295 / 1689 loss=3.871, nll_loss=2.352, ppl=5.1, wps=451893, ups=0.91, wpb=495319, bsz=16517.9, num_updates=12100, lr=0.00057496, gnorm=0.23, clip=0, loss_scale=4, train_wall=89, gb_free=20.4, wall=11253 epoch 008: 295 / 1689 loss=3.871, nll_loss=2.352, ppl=5.1, wps=451893, ups=0.91, wpb=495319, bsz=16517.9, num_updates=12100, lr=0.00057496, gnorm=0.23, clip=0, loss_scale=4, train_wall=89, gb_free=20.4, wall=11253 epoch 008: 295 / 1689 loss=3.871, nll_loss=2.352, ppl=5.1, wps=451893, ups=0.91, wpb=495319, bsz=16517.9, num_updates=12100, lr=0.00057496, gnorm=0.23, clip=0, loss_scale=4, train_wall=89, gb_free=20.4, wall=11253 epoch 008: 295 / 1689 loss=3.871, nll_loss=2.352, ppl=5.1, wps=451893, ups=0.91, wpb=495319, bsz=16517.9, num_updates=12100, lr=0.00057496, gnorm=0.23, clip=0, loss_scale=4, train_wall=89, gb_free=20.4, wall=11253 epoch 008: 295 / 1689 loss=3.871, nll_loss=2.352, ppl=5.1, wps=451893, ups=0.91, wpb=495319, bsz=16517.9, num_updates=12100, lr=0.00057496, gnorm=0.23, clip=0, loss_scale=4, train_wall=89, gb_free=20.4, wall=11253 epoch 008: 295 / 1689 loss=3.871, nll_loss=2.352, ppl=5.1, wps=451893, ups=0.91, wpb=495319, bsz=16517.9, num_updates=12100, lr=0.00057496, gnorm=0.23, clip=0, loss_scale=4, train_wall=89, gb_free=20.4, wall=11253 epoch 008: 295 / 1689 loss=3.871, nll_loss=2.352, ppl=5.1, wps=451893, ups=0.91, wpb=495319, bsz=16517.9, num_updates=12100, lr=0.00057496, gnorm=0.23, clip=0, loss_scale=4, train_wall=89, gb_free=20.4, wall=11253 epoch 008: 396 / 1689 loss=3.867, nll_loss=2.347, ppl=5.09, wps=546065, ups=1.1, wpb=495309, bsz=16677.4, num_updates=12200, lr=0.000572598, gnorm=0.224, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=11344 epoch 008: 396 / 1689 loss=3.867, nll_loss=2.347, ppl=5.09, wps=546065, ups=1.1, wpb=495309, bsz=16677.4, num_updates=12200, lr=0.000572598, gnorm=0.224, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=11344 epoch 008: 396 / 1689 loss=3.867, nll_loss=2.347, ppl=5.09, wps=546065, ups=1.1, wpb=495309, bsz=16677.4, num_updates=12200, lr=0.000572598, gnorm=0.224, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=11344 epoch 008: 396 / 1689 loss=3.867, nll_loss=2.347, ppl=5.09, wps=546065, ups=1.1, wpb=495309, bsz=16677.4, num_updates=12200, lr=0.000572598, gnorm=0.224, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=11344 epoch 008: 396 / 1689 loss=3.867, nll_loss=2.347, ppl=5.09, wps=546065, ups=1.1, wpb=495309, bsz=16677.4, num_updates=12200, lr=0.000572598, gnorm=0.224, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=11344 epoch 008: 396 / 1689 loss=3.867, nll_loss=2.347, ppl=5.09, wps=546065, ups=1.1, wpb=495309, bsz=16677.4, num_updates=12200, lr=0.000572598, gnorm=0.224, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=11344 epoch 008: 396 / 1689 loss=3.867, nll_loss=2.347, ppl=5.09, wps=546065, ups=1.1, wpb=495309, bsz=16677.4, num_updates=12200, lr=0.000572598, gnorm=0.224, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=11344 epoch 008: 396 / 1689 loss=3.867, nll_loss=2.347, ppl=5.09, wps=546065, ups=1.1, wpb=495309, bsz=16677.4, num_updates=12200, lr=0.000572598, gnorm=0.224, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=11344 epoch 008: 496 / 1689 loss=3.879, nll_loss=2.36, ppl=5.13, wps=555547, ups=1.13, wpb=493323, bsz=15931, num_updates=12300, lr=0.000570266, gnorm=0.221, clip=0, loss_scale=4, train_wall=87, gb_free=21.8, wall=11433 epoch 008: 496 / 1689 loss=3.879, nll_loss=2.36, ppl=5.13, wps=555547, ups=1.13, wpb=493323, bsz=15931, num_updates=12300, lr=0.000570266, gnorm=0.221, clip=0, loss_scale=4, train_wall=87, gb_free=21.8, wall=11433 epoch 008: 496 / 1689 loss=3.879, nll_loss=2.36, ppl=5.13, wps=555547, ups=1.13, wpb=493323, bsz=15931, num_updates=12300, lr=0.000570266, gnorm=0.221, clip=0, loss_scale=4, train_wall=87, gb_free=21.8, wall=11433 epoch 008: 496 / 1689 loss=3.879, nll_loss=2.36, ppl=5.13, wps=555547, ups=1.13, wpb=493323, bsz=15931, num_updates=12300, lr=0.000570266, gnorm=0.221, clip=0, loss_scale=4, train_wall=87, gb_free=21.8, wall=11433 epoch 008: 496 / 1689 loss=3.879, nll_loss=2.36, ppl=5.13, wps=555547, ups=1.13, wpb=493323, bsz=15931, num_updates=12300, lr=0.000570266, gnorm=0.221, clip=0, loss_scale=4, train_wall=87, gb_free=21.8, wall=11433 epoch 008: 496 / 1689 loss=3.879, nll_loss=2.36, ppl=5.13, wps=555547, ups=1.13, wpb=493323, bsz=15931, num_updates=12300, lr=0.000570266, gnorm=0.221, clip=0, loss_scale=4, train_wall=87, gb_free=21.8, wall=11433 epoch 008: 496 / 1689 loss=3.879, nll_loss=2.36, ppl=5.13, wps=555547, ups=1.13, wpb=493323, bsz=15931, num_updates=12300, lr=0.000570266, gnorm=0.221, clip=0, loss_scale=4, train_wall=87, gb_free=21.8, wall=11433 epoch 008: 496 / 1689 loss=3.879, nll_loss=2.36, ppl=5.13, wps=555547, ups=1.13, wpb=493323, bsz=15931, num_updates=12300, lr=0.000570266, gnorm=0.221, clip=0, loss_scale=4, train_wall=87, gb_free=21.8, wall=11433 epoch 008: 596 / 1689 loss=3.88, nll_loss=2.362, ppl=5.14, wps=548538, ups=1.11, wpb=495205, bsz=16471.3, num_updates=12400, lr=0.000567962, gnorm=0.222, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=11523 epoch 008: 596 / 1689 loss=3.88, nll_loss=2.362, ppl=5.14, wps=548538, ups=1.11, wpb=495205, bsz=16471.3, num_updates=12400, lr=0.000567962, gnorm=0.222, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=11523 epoch 008: 596 / 1689 loss=3.88, nll_loss=2.362, ppl=5.14, wps=548538, ups=1.11, wpb=495205, bsz=16471.3, num_updates=12400, lr=0.000567962, gnorm=0.222, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=11523 epoch 008: 596 / 1689 loss=3.88, nll_loss=2.362, ppl=5.14, wps=548538, ups=1.11, wpb=495205, bsz=16471.3, num_updates=12400, lr=0.000567962, gnorm=0.222, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=11523 epoch 008: 596 / 1689 loss=3.88, nll_loss=2.362, ppl=5.14, wps=548538, ups=1.11, wpb=495205, bsz=16471.3, num_updates=12400, lr=0.000567962, gnorm=0.222, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=11523 epoch 008: 596 / 1689 loss=3.88, nll_loss=2.362, ppl=5.14, wps=548538, ups=1.11, wpb=495205, bsz=16471.3, num_updates=12400, lr=0.000567962, gnorm=0.222, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=11523 epoch 008: 596 / 1689 loss=3.88, nll_loss=2.362, ppl=5.14, wps=548538, ups=1.11, wpb=495205, bsz=16471.3, num_updates=12400, lr=0.000567962, gnorm=0.222, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=11523 epoch 008: 596 / 1689 loss=3.88, nll_loss=2.362, ppl=5.14, wps=548538, ups=1.11, wpb=495205, bsz=16471.3, num_updates=12400, lr=0.000567962, gnorm=0.222, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=11523 epoch 008: 696 / 1689 loss=3.87, nll_loss=2.351, ppl=5.1, wps=548664, ups=1.11, wpb=495721, bsz=16588.1, num_updates=12500, lr=0.000565685, gnorm=0.241, clip=0, loss_scale=4, train_wall=89, gb_free=22.4, wall=11613 epoch 008: 696 / 1689 loss=3.87, nll_loss=2.351, ppl=5.1, wps=548664, ups=1.11, wpb=495721, bsz=16588.1, num_updates=12500, lr=0.000565685, gnorm=0.241, clip=0, loss_scale=4, train_wall=89, gb_free=22.4, wall=11613 epoch 008: 696 / 1689 loss=3.87, nll_loss=2.351, ppl=5.1, wps=548664, ups=1.11, wpb=495721, bsz=16588.1, num_updates=12500, lr=0.000565685, gnorm=0.241, clip=0, loss_scale=4, train_wall=89, gb_free=22.4, wall=11613 epoch 008: 696 / 1689 loss=3.87, nll_loss=2.351, ppl=5.1, wps=548664, ups=1.11, wpb=495721, bsz=16588.1, num_updates=12500, lr=0.000565685, gnorm=0.241, clip=0, loss_scale=4, train_wall=89, gb_free=22.4, wall=11613 epoch 008: 696 / 1689 loss=3.87, nll_loss=2.351, ppl=5.1, wps=548664, ups=1.11, wpb=495721, bsz=16588.1, num_updates=12500, lr=0.000565685, gnorm=0.241, clip=0, loss_scale=4, train_wall=89, gb_free=22.4, wall=11613 epoch 008: 696 / 1689 loss=3.87, nll_loss=2.351, ppl=5.1, wps=548664, ups=1.11, wpb=495721, bsz=16588.1, num_updates=12500, lr=0.000565685, gnorm=0.241, clip=0, loss_scale=4, train_wall=89, gb_free=22.4, wall=11613 epoch 008: 696 / 1689 loss=3.87, nll_loss=2.351, ppl=5.1, wps=548664, ups=1.11, wpb=495721, bsz=16588.1, num_updates=12500, lr=0.000565685, gnorm=0.241, clip=0, loss_scale=4, train_wall=89, gb_free=22.4, wall=11613 epoch 008: 696 / 1689 loss=3.87, nll_loss=2.351, ppl=5.1, wps=548664, ups=1.11, wpb=495721, bsz=16588.1, num_updates=12500, lr=0.000565685, gnorm=0.241, clip=0, loss_scale=4, train_wall=89, gb_free=22.4, wall=11613 epoch 008: 796 / 1689 loss=3.876, nll_loss=2.358, ppl=5.13, wps=545592, ups=1.1, wpb=495949, bsz=16405, num_updates=12600, lr=0.000563436, gnorm=0.221, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=11704 epoch 008: 796 / 1689 loss=3.876, nll_loss=2.358, ppl=5.13, wps=545592, ups=1.1, wpb=495949, bsz=16405, num_updates=12600, lr=0.000563436, gnorm=0.221, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=11704 epoch 008: 796 / 1689 loss=3.876, nll_loss=2.358, ppl=5.13, wps=545592, ups=1.1, wpb=495949, bsz=16405, num_updates=12600, lr=0.000563436, gnorm=0.221, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=11704 epoch 008: 796 / 1689 loss=3.876, nll_loss=2.358, ppl=5.13, wps=545592, ups=1.1, wpb=495949, bsz=16405, num_updates=12600, lr=0.000563436, gnorm=0.221, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=11704 epoch 008: 796 / 1689 loss=3.876, nll_loss=2.358, ppl=5.13, wps=545592, ups=1.1, wpb=495949, bsz=16405, num_updates=12600, lr=0.000563436, gnorm=0.221, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=11704 epoch 008: 796 / 1689 loss=3.876, nll_loss=2.358, ppl=5.13, wps=545592, ups=1.1, wpb=495949, bsz=16405, num_updates=12600, lr=0.000563436, gnorm=0.221, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=11704 epoch 008: 796 / 1689 loss=3.876, nll_loss=2.358, ppl=5.13, wps=545592, ups=1.1, wpb=495949, bsz=16405, num_updates=12600, lr=0.000563436, gnorm=0.221, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=11704 epoch 008: 796 / 1689 loss=3.876, nll_loss=2.358, ppl=5.13, wps=545592, ups=1.1, wpb=495949, bsz=16405, num_updates=12600, lr=0.000563436, gnorm=0.221, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=11704 epoch 008: 896 / 1689 loss=3.872, nll_loss=2.353, ppl=5.11, wps=552732, ups=1.11, wpb=495958, bsz=16633.4, num_updates=12700, lr=0.000561214, gnorm=0.229, clip=0, loss_scale=8, train_wall=89, gb_free=22.2, wall=11794 epoch 008: 896 / 1689 loss=3.872, nll_loss=2.353, ppl=5.11, wps=552732, ups=1.11, wpb=495958, bsz=16633.4, num_updates=12700, lr=0.000561214, gnorm=0.229, clip=0, loss_scale=8, train_wall=89, gb_free=22.2, wall=11794 epoch 008: 896 / 1689 loss=3.872, nll_loss=2.353, ppl=5.11, wps=552732, ups=1.11, wpb=495958, bsz=16633.4, num_updates=12700, lr=0.000561214, gnorm=0.229, clip=0, loss_scale=8, train_wall=89, gb_free=22.2, wall=11794 epoch 008: 896 / 1689 loss=3.872, nll_loss=2.353, ppl=5.11, wps=552732, ups=1.11, wpb=495958, bsz=16633.4, num_updates=12700, lr=0.000561214, gnorm=0.229, clip=0, loss_scale=8, train_wall=89, gb_free=22.2, wall=11794 epoch 008: 896 / 1689 loss=3.872, nll_loss=2.353, ppl=5.11, wps=552732, ups=1.11, wpb=495958, bsz=16633.4, num_updates=12700, lr=0.000561214, gnorm=0.229, clip=0, loss_scale=8, train_wall=89, gb_free=22.2, wall=11794 epoch 008: 896 / 1689 loss=3.872, nll_loss=2.353, ppl=5.11, wps=552732, ups=1.11, wpb=495958, bsz=16633.4, num_updates=12700, lr=0.000561214, gnorm=0.229, clip=0, loss_scale=8, train_wall=89, gb_free=22.2, wall=11794 epoch 008: 896 / 1689 loss=3.872, nll_loss=2.353, ppl=5.11, wps=552732, ups=1.11, wpb=495958, bsz=16633.4, num_updates=12700, lr=0.000561214, gnorm=0.229, clip=0, loss_scale=8, train_wall=89, gb_free=22.2, wall=11794 epoch 008: 896 / 1689 loss=3.872, nll_loss=2.353, ppl=5.11, wps=552732, ups=1.11, wpb=495958, bsz=16633.4, num_updates=12700, lr=0.000561214, gnorm=0.229, clip=0, loss_scale=8, train_wall=89, gb_free=22.2, wall=11794 epoch 008: 997 / 1689 loss=3.876, nll_loss=2.358, ppl=5.13, wps=548351, ups=1.11, wpb=494241, bsz=16500.6, num_updates=12800, lr=0.000559017, gnorm=0.221, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=11884 epoch 008: 997 / 1689 loss=3.876, nll_loss=2.358, ppl=5.13, wps=548351, ups=1.11, wpb=494241, bsz=16500.6, num_updates=12800, lr=0.000559017, gnorm=0.221, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=11884 epoch 008: 997 / 1689 loss=3.876, nll_loss=2.358, ppl=5.13, wps=548351, ups=1.11, wpb=494241, bsz=16500.6, num_updates=12800, lr=0.000559017, gnorm=0.221, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=11884 epoch 008: 997 / 1689 loss=3.876, nll_loss=2.358, ppl=5.13, wps=548351, ups=1.11, wpb=494241, bsz=16500.6, num_updates=12800, lr=0.000559017, gnorm=0.221, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=11884 epoch 008: 997 / 1689 loss=3.876, nll_loss=2.358, ppl=5.13, wps=548351, ups=1.11, wpb=494241, bsz=16500.6, num_updates=12800, lr=0.000559017, gnorm=0.221, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=11884 epoch 008: 997 / 1689 loss=3.876, nll_loss=2.358, ppl=5.13, wps=548351, ups=1.11, wpb=494241, bsz=16500.6, num_updates=12800, lr=0.000559017, gnorm=0.221, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=11884 epoch 008: 997 / 1689 loss=3.876, nll_loss=2.358, ppl=5.13, wps=548351, ups=1.11, wpb=494241, bsz=16500.6, num_updates=12800, lr=0.000559017, gnorm=0.221, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=11884 epoch 008: 997 / 1689 loss=3.876, nll_loss=2.358, ppl=5.13, wps=548351, ups=1.11, wpb=494241, bsz=16500.6, num_updates=12800, lr=0.000559017, gnorm=0.221, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=11884 epoch 008: 1097 / 1689 loss=3.865, nll_loss=2.346, ppl=5.08, wps=557168, ups=1.13, wpb=494973, bsz=16402.6, num_updates=12900, lr=0.000556846, gnorm=0.215, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=11973 epoch 008: 1097 / 1689 loss=3.865, nll_loss=2.346, ppl=5.08, wps=557168, ups=1.13, wpb=494973, bsz=16402.6, num_updates=12900, lr=0.000556846, gnorm=0.215, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=11973 epoch 008: 1097 / 1689 loss=3.865, nll_loss=2.346, ppl=5.08, wps=557168, ups=1.13, wpb=494973, bsz=16402.6, num_updates=12900, lr=0.000556846, gnorm=0.215, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=11973 epoch 008: 1097 / 1689 loss=3.865, nll_loss=2.346, ppl=5.08, wps=557168, ups=1.13, wpb=494973, bsz=16402.6, num_updates=12900, lr=0.000556846, gnorm=0.215, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=11973 epoch 008: 1097 / 1689 loss=3.865, nll_loss=2.346, ppl=5.08, wps=557168, ups=1.13, wpb=494973, bsz=16402.6, num_updates=12900, lr=0.000556846, gnorm=0.215, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=11973 epoch 008: 1097 / 1689 loss=3.865, nll_loss=2.346, ppl=5.08, wps=557168, ups=1.13, wpb=494973, bsz=16402.6, num_updates=12900, lr=0.000556846, gnorm=0.215, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=11973 epoch 008: 1097 / 1689 loss=3.865, nll_loss=2.346, ppl=5.08, wps=557168, ups=1.13, wpb=494973, bsz=16402.6, num_updates=12900, lr=0.000556846, gnorm=0.215, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=11973 epoch 008: 1097 / 1689 loss=3.865, nll_loss=2.346, ppl=5.08, wps=557168, ups=1.13, wpb=494973, bsz=16402.6, num_updates=12900, lr=0.000556846, gnorm=0.215, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=11973 epoch 008: 1197 / 1689 loss=3.864, nll_loss=2.345, ppl=5.08, wps=556676, ups=1.12, wpb=497463, bsz=16235.4, num_updates=13000, lr=0.0005547, gnorm=0.224, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=12062 epoch 008: 1197 / 1689 loss=3.864, nll_loss=2.345, ppl=5.08, wps=556676, ups=1.12, wpb=497463, bsz=16235.4, num_updates=13000, lr=0.0005547, gnorm=0.224, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=12062 epoch 008: 1197 / 1689 loss=3.864, nll_loss=2.345, ppl=5.08, wps=556676, ups=1.12, wpb=497463, bsz=16235.4, num_updates=13000, lr=0.0005547, gnorm=0.224, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=12062 epoch 008: 1197 / 1689 loss=3.864, nll_loss=2.345, ppl=5.08, wps=556676, ups=1.12, wpb=497463, bsz=16235.4, num_updates=13000, lr=0.0005547, gnorm=0.224, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=12062 epoch 008: 1197 / 1689 loss=3.864, nll_loss=2.345, ppl=5.08, wps=556676, ups=1.12, wpb=497463, bsz=16235.4, num_updates=13000, lr=0.0005547, gnorm=0.224, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=12062 epoch 008: 1197 / 1689 loss=3.864, nll_loss=2.345, ppl=5.08, wps=556676, ups=1.12, wpb=497463, bsz=16235.4, num_updates=13000, lr=0.0005547, gnorm=0.224, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=12062 epoch 008: 1197 / 1689 loss=3.864, nll_loss=2.345, ppl=5.08, wps=556676, ups=1.12, wpb=497463, bsz=16235.4, num_updates=13000, lr=0.0005547, gnorm=0.224, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=12062 epoch 008: 1197 / 1689 loss=3.864, nll_loss=2.345, ppl=5.08, wps=556676, ups=1.12, wpb=497463, bsz=16235.4, num_updates=13000, lr=0.0005547, gnorm=0.224, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=12062 begin validation on "valid" subset epoch 008 | valid on 'valid' subset | loss 3.826 | nll_loss 2.252 | ppl 4.76 | wps 0 | wpb 44526 | bsz 2008 | num_updates 13000 | best_loss 3.826 epoch 008 | valid on 'valid' subset | loss 3.826 | nll_loss 2.252 | ppl 4.76 | wps 0 | wpb 44526 | bsz 2008 | num_updates 13000 | best_loss 3.826 epoch 008 | valid on 'valid' subset | loss 3.826 | nll_loss 2.252 | ppl 4.76 | wps 0 | wpb 44526 | bsz 2008 | num_updates 13000 | best_loss 3.826 epoch 008 | valid on 'valid' subset | loss 3.826 | nll_loss 2.252 | ppl 4.76 | wps 0 | wpb 44526 | bsz 2008 | num_updates 13000 | best_loss 3.826 epoch 008 | valid on 'valid' subset | loss 3.826 | nll_loss 2.252 | ppl 4.76 | wps 0 | wpb 44526 | bsz 2008 | num_updates 13000 | best_loss 3.826 epoch 008 | valid on 'valid' subset | loss 3.826 | nll_loss 2.252 | ppl 4.76 | wps 0 | wpb 44526 | bsz 2008 | num_updates 13000 | best_loss 3.826 epoch 008 | valid on 'valid' subset | loss 3.826 | nll_loss 2.252 | ppl 4.76 | wps 0 | wpb 44526 | bsz 2008 | num_updates 13000 | best_loss 3.826 epoch 008 | valid on 'valid' subset | loss 3.826 | nll_loss 2.252 | ppl 4.76 | wps 0 | wpb 44526 | bsz 2008 | num_updates 13000 | best_loss 3.826 epoch 008: 1297 / 1689 loss=3.863, nll_loss=2.344, ppl=5.08, wps=385330, ups=0.78, wpb=495386, bsz=16704.1, num_updates=13100, lr=0.000552579, gnorm=0.21, clip=0, loss_scale=4, train_wall=87, gb_free=21, wall=12191 epoch 008: 1297 / 1689 loss=3.863, nll_loss=2.344, ppl=5.08, wps=385330, ups=0.78, wpb=495386, bsz=16704.1, num_updates=13100, lr=0.000552579, gnorm=0.21, clip=0, loss_scale=4, train_wall=87, gb_free=21, wall=12191 epoch 008: 1297 / 1689 loss=3.863, nll_loss=2.344, ppl=5.08, wps=385330, ups=0.78, wpb=495386, bsz=16704.1, num_updates=13100, lr=0.000552579, gnorm=0.21, clip=0, loss_scale=4, train_wall=87, gb_free=21, wall=12191 epoch 008: 1297 / 1689 loss=3.863, nll_loss=2.344, ppl=5.08, wps=385330, ups=0.78, wpb=495386, bsz=16704.1, num_updates=13100, lr=0.000552579, gnorm=0.21, clip=0, loss_scale=4, train_wall=87, gb_free=21, wall=12191 epoch 008: 1297 / 1689 loss=3.863, nll_loss=2.344, ppl=5.08, wps=385330, ups=0.78, wpb=495386, bsz=16704.1, num_updates=13100, lr=0.000552579, gnorm=0.21, clip=0, loss_scale=4, train_wall=87, gb_free=21, wall=12191 epoch 008: 1297 / 1689 loss=3.863, nll_loss=2.344, ppl=5.08, wps=385330, ups=0.78, wpb=495386, bsz=16704.1, num_updates=13100, lr=0.000552579, gnorm=0.21, clip=0, loss_scale=4, train_wall=87, gb_free=21, wall=12191 epoch 008: 1297 / 1689 loss=3.863, nll_loss=2.344, ppl=5.08, wps=385330, ups=0.78, wpb=495386, bsz=16704.1, num_updates=13100, lr=0.000552579, gnorm=0.21, clip=0, loss_scale=4, train_wall=87, gb_free=21, wall=12191 epoch 008: 1297 / 1689 loss=3.863, nll_loss=2.344, ppl=5.08, wps=385330, ups=0.78, wpb=495386, bsz=16704.1, num_updates=13100, lr=0.000552579, gnorm=0.21, clip=0, loss_scale=4, train_wall=87, gb_free=21, wall=12191 epoch 008: 1397 / 1689 loss=3.867, nll_loss=2.348, ppl=5.09, wps=564195, ups=1.14, wpb=496355, bsz=16590.8, num_updates=13200, lr=0.000550482, gnorm=0.233, clip=0, loss_scale=4, train_wall=87, gb_free=21.8, wall=12279 epoch 008: 1397 / 1689 loss=3.867, nll_loss=2.348, ppl=5.09, wps=564195, ups=1.14, wpb=496355, bsz=16590.8, num_updates=13200, lr=0.000550482, gnorm=0.233, clip=0, loss_scale=4, train_wall=87, gb_free=21.8, wall=12279 epoch 008: 1397 / 1689 loss=3.867, nll_loss=2.348, ppl=5.09, wps=564195, ups=1.14, wpb=496355, bsz=16590.8, num_updates=13200, lr=0.000550482, gnorm=0.233, clip=0, loss_scale=4, train_wall=87, gb_free=21.8, wall=12279 epoch 008: 1397 / 1689 loss=3.867, nll_loss=2.348, ppl=5.09, wps=564195, ups=1.14, wpb=496355, bsz=16590.8, num_updates=13200, lr=0.000550482, gnorm=0.233, clip=0, loss_scale=4, train_wall=87, gb_free=21.8, wall=12279 epoch 008: 1397 / 1689 loss=3.867, nll_loss=2.348, ppl=5.09, wps=564195, ups=1.14, wpb=496355, bsz=16590.8, num_updates=13200, lr=0.000550482, gnorm=0.233, clip=0, loss_scale=4, train_wall=87, gb_free=21.8, wall=12279 epoch 008: 1397 / 1689 loss=3.867, nll_loss=2.348, ppl=5.09, wps=564195, ups=1.14, wpb=496355, bsz=16590.8, num_updates=13200, lr=0.000550482, gnorm=0.233, clip=0, loss_scale=4, train_wall=87, gb_free=21.8, wall=12279 epoch 008: 1397 / 1689 loss=3.867, nll_loss=2.348, ppl=5.09, wps=564195, ups=1.14, wpb=496355, bsz=16590.8, num_updates=13200, lr=0.000550482, gnorm=0.233, clip=0, loss_scale=4, train_wall=87, gb_free=21.8, wall=12279 epoch 008: 1397 / 1689 loss=3.867, nll_loss=2.348, ppl=5.09, wps=564195, ups=1.14, wpb=496355, bsz=16590.8, num_updates=13200, lr=0.000550482, gnorm=0.233, clip=0, loss_scale=4, train_wall=87, gb_free=21.8, wall=12279 epoch 008: 1497 / 1689 loss=3.872, nll_loss=2.354, ppl=5.11, wps=552277, ups=1.11, wpb=495317, bsz=16443.7, num_updates=13300, lr=0.000548408, gnorm=0.212, clip=0, loss_scale=8, train_wall=89, gb_free=22, wall=12368 epoch 008: 1497 / 1689 loss=3.872, nll_loss=2.354, ppl=5.11, wps=552277, ups=1.11, wpb=495317, bsz=16443.7, num_updates=13300, lr=0.000548408, gnorm=0.212, clip=0, loss_scale=8, train_wall=89, gb_free=22, wall=12368 epoch 008: 1497 / 1689 loss=3.872, nll_loss=2.354, ppl=5.11, wps=552277, ups=1.11, wpb=495317, bsz=16443.7, num_updates=13300, lr=0.000548408, gnorm=0.212, clip=0, loss_scale=8, train_wall=89, gb_free=22, wall=12368 epoch 008: 1497 / 1689 loss=3.872, nll_loss=2.354, ppl=5.11, wps=552277, ups=1.11, wpb=495317, bsz=16443.7, num_updates=13300, lr=0.000548408, gnorm=0.212, clip=0, loss_scale=8, train_wall=89, gb_free=22, wall=12368 epoch 008: 1497 / 1689 loss=3.872, nll_loss=2.354, ppl=5.11, wps=552277, ups=1.11, wpb=495317, bsz=16443.7, num_updates=13300, lr=0.000548408, gnorm=0.212, clip=0, loss_scale=8, train_wall=89, gb_free=22, wall=12368 epoch 008: 1497 / 1689 loss=3.872, nll_loss=2.354, ppl=5.11, wps=552277, ups=1.11, wpb=495317, bsz=16443.7, num_updates=13300, lr=0.000548408, gnorm=0.212, clip=0, loss_scale=8, train_wall=89, gb_free=22, wall=12368 epoch 008: 1497 / 1689 loss=3.872, nll_loss=2.354, ppl=5.11, wps=552277, ups=1.11, wpb=495317, bsz=16443.7, num_updates=13300, lr=0.000548408, gnorm=0.212, clip=0, loss_scale=8, train_wall=89, gb_free=22, wall=12368 epoch 008: 1497 / 1689 loss=3.872, nll_loss=2.354, ppl=5.11, wps=552277, ups=1.11, wpb=495317, bsz=16443.7, num_updates=13300, lr=0.000548408, gnorm=0.212, clip=0, loss_scale=8, train_wall=89, gb_free=22, wall=12368 epoch 008: 1598 / 1689 loss=3.87, nll_loss=2.352, ppl=5.1, wps=549014, ups=1.11, wpb=494464, bsz=16550.7, num_updates=13400, lr=0.000546358, gnorm=0.231, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=12459 epoch 008: 1598 / 1689 loss=3.87, nll_loss=2.352, ppl=5.1, wps=549014, ups=1.11, wpb=494464, bsz=16550.7, num_updates=13400, lr=0.000546358, gnorm=0.231, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=12459 epoch 008: 1598 / 1689 loss=3.87, nll_loss=2.352, ppl=5.1, wps=549014, ups=1.11, wpb=494464, bsz=16550.7, num_updates=13400, lr=0.000546358, gnorm=0.231, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=12459 epoch 008: 1598 / 1689 loss=3.87, nll_loss=2.352, ppl=5.1, wps=549014, ups=1.11, wpb=494464, bsz=16550.7, num_updates=13400, lr=0.000546358, gnorm=0.231, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=12459 epoch 008: 1598 / 1689 loss=3.87, nll_loss=2.352, ppl=5.1, wps=549014, ups=1.11, wpb=494464, bsz=16550.7, num_updates=13400, lr=0.000546358, gnorm=0.231, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=12459 epoch 008: 1598 / 1689 loss=3.87, nll_loss=2.352, ppl=5.1, wps=549014, ups=1.11, wpb=494464, bsz=16550.7, num_updates=13400, lr=0.000546358, gnorm=0.231, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=12459 epoch 008: 1598 / 1689 loss=3.87, nll_loss=2.352, ppl=5.1, wps=549014, ups=1.11, wpb=494464, bsz=16550.7, num_updates=13400, lr=0.000546358, gnorm=0.231, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=12459 epoch 008: 1598 / 1689 loss=3.87, nll_loss=2.352, ppl=5.1, wps=549014, ups=1.11, wpb=494464, bsz=16550.7, num_updates=13400, lr=0.000546358, gnorm=0.231, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=12459 end of epoch 8 (average epoch stats below) epoch 008 | loss 3.87 | nll_loss 2.351 | ppl 5.1 | wps 530509 | ups 1.07 | wpb 495118 | bsz 16504.5 | num_updates 13491 | lr 0.000544513 | gnorm 0.226 | clip 0 | loss_scale 4 | train_wall 1493 | gb_free 22.6 | wall 12540 epoch 008 | loss 3.87 | nll_loss 2.351 | ppl 5.1 | wps 530509 | ups 1.07 | wpb 495118 | bsz 16504.5 | num_updates 13491 | lr 0.000544513 | gnorm 0.226 | clip 0 | loss_scale 4 | train_wall 1493 | gb_free 22.6 | wall 12540 epoch 008 | loss 3.87 | nll_loss 2.351 | ppl 5.1 | wps 530509 | ups 1.07 | wpb 495118 | bsz 16504.5 | num_updates 13491 | lr 0.000544513 | gnorm 0.226 | clip 0 | loss_scale 4 | train_wall 1493 | gb_free 22.6 | wall 12540 epoch 008 | loss 3.87 | nll_loss 2.351 | ppl 5.1 | wps 530509 | ups 1.07 | wpb 495118 | bsz 16504.5 | num_updates 13491 | lr 0.000544513 | gnorm 0.226 | clip 0 | loss_scale 4 | train_wall 1493 | gb_free 22.6 | wall 12540 epoch 008 | loss 3.87 | nll_loss 2.351 | ppl 5.1 | wps 530509 | ups 1.07 | wpb 495118 | bsz 16504.5 | num_updates 13491 | lr 0.000544513 | gnorm 0.226 | clip 0 | loss_scale 4 | train_wall 1493 | gb_free 22.6 | wall 12540 epoch 008 | loss 3.87 | nll_loss 2.351 | ppl 5.1 | wps 530509 | ups 1.07 | wpb 495118 | bsz 16504.5 | num_updates 13491 | lr 0.000544513 | gnorm 0.226 | clip 0 | loss_scale 4 | train_wall 1493 | gb_free 22.6 | wall 12540 epoch 008 | loss 3.87 | nll_loss 2.351 | ppl 5.1 | wps 530509 | ups 1.07 | wpb 495118 | bsz 16504.5 | num_updates 13491 | lr 0.000544513 | gnorm 0.226 | clip 0 | loss_scale 4 | train_wall 1493 | gb_free 22.6 | wall 12540 epoch 008 | loss 3.87 | nll_loss 2.351 | ppl 5.1 | wps 530509 | ups 1.07 | wpb 495118 | bsz 16504.5 | num_updates 13491 | lr 0.000544513 | gnorm 0.226 | clip 0 | loss_scale 4 | train_wall 1493 | gb_free 22.6 | wall 12540 Start iterating over samples epoch 009: 9 / 1689 loss=3.868, nll_loss=2.349, ppl=5.1, wps=542552, ups=1.1, wpb=491912, bsz=16528.9, num_updates=13500, lr=0.000544331, gnorm=0.236, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=12549 epoch 009: 9 / 1689 loss=3.868, nll_loss=2.349, ppl=5.1, wps=542552, ups=1.1, wpb=491912, bsz=16528.9, num_updates=13500, lr=0.000544331, gnorm=0.236, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=12549 epoch 009: 9 / 1689 loss=3.868, nll_loss=2.349, ppl=5.1, wps=542552, ups=1.1, wpb=491912, bsz=16528.9, num_updates=13500, lr=0.000544331, gnorm=0.236, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=12549 epoch 009: 9 / 1689 loss=3.868, nll_loss=2.349, ppl=5.1, wps=542552, ups=1.1, wpb=491912, bsz=16528.9, num_updates=13500, lr=0.000544331, gnorm=0.236, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=12549 epoch 009: 9 / 1689 loss=3.868, nll_loss=2.349, ppl=5.1, wps=542552, ups=1.1, wpb=491912, bsz=16528.9, num_updates=13500, lr=0.000544331, gnorm=0.236, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=12549 epoch 009: 9 / 1689 loss=3.868, nll_loss=2.349, ppl=5.1, wps=542552, ups=1.1, wpb=491912, bsz=16528.9, num_updates=13500, lr=0.000544331, gnorm=0.236, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=12549 epoch 009: 9 / 1689 loss=3.868, nll_loss=2.349, ppl=5.1, wps=542552, ups=1.1, wpb=491912, bsz=16528.9, num_updates=13500, lr=0.000544331, gnorm=0.236, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=12549 epoch 009: 9 / 1689 loss=3.868, nll_loss=2.349, ppl=5.1, wps=542552, ups=1.1, wpb=491912, bsz=16528.9, num_updates=13500, lr=0.000544331, gnorm=0.236, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=12549 epoch 009: 9 / 1689 loss=3.868, nll_loss=2.349, ppl=5.1, wps=542552, ups=1.1, wpb=491912, bsz=16528.9, num_updates=13500, lr=0.000544331, gnorm=0.236, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=12549 epoch 009: 109 / 1689 loss=3.842, nll_loss=2.319, ppl=4.99, wps=551982, ups=1.11, wpb=495608, bsz=16332.6, num_updates=13600, lr=0.000542326, gnorm=0.212, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=12639 epoch 009: 109 / 1689 loss=3.842, nll_loss=2.319, ppl=4.99, wps=551982, ups=1.11, wpb=495608, bsz=16332.6, num_updates=13600, lr=0.000542326, gnorm=0.212, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=12639 epoch 009: 109 / 1689 loss=3.842, nll_loss=2.319, ppl=4.99, wps=551982, ups=1.11, wpb=495608, bsz=16332.6, num_updates=13600, lr=0.000542326, gnorm=0.212, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=12639 epoch 009: 109 / 1689 loss=3.842, nll_loss=2.319, ppl=4.99, wps=551982, ups=1.11, wpb=495608, bsz=16332.6, num_updates=13600, lr=0.000542326, gnorm=0.212, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=12639 epoch 009: 109 / 1689 loss=3.842, nll_loss=2.319, ppl=4.99, wps=551982, ups=1.11, wpb=495608, bsz=16332.6, num_updates=13600, lr=0.000542326, gnorm=0.212, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=12639 epoch 009: 109 / 1689 loss=3.842, nll_loss=2.319, ppl=4.99, wps=551982, ups=1.11, wpb=495608, bsz=16332.6, num_updates=13600, lr=0.000542326, gnorm=0.212, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=12639 epoch 009: 109 / 1689 loss=3.842, nll_loss=2.319, ppl=4.99, wps=551982, ups=1.11, wpb=495608, bsz=16332.6, num_updates=13600, lr=0.000542326, gnorm=0.212, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=12639 epoch 009: 109 / 1689 loss=3.842, nll_loss=2.319, ppl=4.99, wps=551982, ups=1.11, wpb=495608, bsz=16332.6, num_updates=13600, lr=0.000542326, gnorm=0.212, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=12639 epoch 009: 109 / 1689 loss=3.842, nll_loss=2.319, ppl=4.99, wps=551982, ups=1.11, wpb=495608, bsz=16332.6, num_updates=13600, lr=0.000542326, gnorm=0.212, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=12639 epoch 009: 209 / 1689 loss=3.849, nll_loss=2.328, ppl=5.02, wps=548111, ups=1.11, wpb=493659, bsz=16754, num_updates=13700, lr=0.000540343, gnorm=0.217, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=12729 epoch 009: 209 / 1689 loss=3.849, nll_loss=2.328, ppl=5.02, wps=548111, ups=1.11, wpb=493659, bsz=16754, num_updates=13700, lr=0.000540343, gnorm=0.217, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=12729 epoch 009: 209 / 1689 loss=3.849, nll_loss=2.328, ppl=5.02, wps=548111, ups=1.11, wpb=493659, bsz=16754, num_updates=13700, lr=0.000540343, gnorm=0.217, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=12729 epoch 009: 209 / 1689 loss=3.849, nll_loss=2.328, ppl=5.02, wps=548111, ups=1.11, wpb=493659, bsz=16754, num_updates=13700, lr=0.000540343, gnorm=0.217, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=12729 epoch 009: 209 / 1689 loss=3.849, nll_loss=2.328, ppl=5.02, wps=548111, ups=1.11, wpb=493659, bsz=16754, num_updates=13700, lr=0.000540343, gnorm=0.217, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=12729 epoch 009: 209 / 1689 loss=3.849, nll_loss=2.328, ppl=5.02, wps=548111, ups=1.11, wpb=493659, bsz=16754, num_updates=13700, lr=0.000540343, gnorm=0.217, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=12729 epoch 009: 209 / 1689 loss=3.849, nll_loss=2.328, ppl=5.02, wps=548111, ups=1.11, wpb=493659, bsz=16754, num_updates=13700, lr=0.000540343, gnorm=0.217, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=12729 epoch 009: 209 / 1689 loss=3.849, nll_loss=2.328, ppl=5.02, wps=548111, ups=1.11, wpb=493659, bsz=16754, num_updates=13700, lr=0.000540343, gnorm=0.217, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=12729 epoch 009: 209 / 1689 loss=3.849, nll_loss=2.328, ppl=5.02, wps=548111, ups=1.11, wpb=493659, bsz=16754, num_updates=13700, lr=0.000540343, gnorm=0.217, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=12729 epoch 009: 309 / 1689 loss=3.839, nll_loss=2.316, ppl=4.98, wps=551501, ups=1.11, wpb=497012, bsz=16546.1, num_updates=13800, lr=0.000538382, gnorm=0.209, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=12819 epoch 009: 309 / 1689 loss=3.839, nll_loss=2.316, ppl=4.98, wps=551501, ups=1.11, wpb=497012, bsz=16546.1, num_updates=13800, lr=0.000538382, gnorm=0.209, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=12819 epoch 009: 309 / 1689 loss=3.839, nll_loss=2.316, ppl=4.98, wps=551501, ups=1.11, wpb=497012, bsz=16546.1, num_updates=13800, lr=0.000538382, gnorm=0.209, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=12819 epoch 009: 309 / 1689 loss=3.839, nll_loss=2.316, ppl=4.98, wps=551501, ups=1.11, wpb=497012, bsz=16546.1, num_updates=13800, lr=0.000538382, gnorm=0.209, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=12819 epoch 009: 309 / 1689 loss=3.839, nll_loss=2.316, ppl=4.98, wps=551501, ups=1.11, wpb=497012, bsz=16546.1, num_updates=13800, lr=0.000538382, gnorm=0.209, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=12819 epoch 009: 309 / 1689 loss=3.839, nll_loss=2.316, ppl=4.98, wps=551501, ups=1.11, wpb=497012, bsz=16546.1, num_updates=13800, lr=0.000538382, gnorm=0.209, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=12819 epoch 009: 309 / 1689 loss=3.839, nll_loss=2.316, ppl=4.98, wps=551501, ups=1.11, wpb=497012, bsz=16546.1, num_updates=13800, lr=0.000538382, gnorm=0.209, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=12819 epoch 009: 309 / 1689 loss=3.839, nll_loss=2.316, ppl=4.98, wps=551501, ups=1.11, wpb=497012, bsz=16546.1, num_updates=13800, lr=0.000538382, gnorm=0.209, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=12819 epoch 009: 309 / 1689 loss=3.839, nll_loss=2.316, ppl=4.98, wps=551501, ups=1.11, wpb=497012, bsz=16546.1, num_updates=13800, lr=0.000538382, gnorm=0.209, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=12819 epoch 009: 409 / 1689 loss=3.849, nll_loss=2.328, ppl=5.02, wps=552876, ups=1.12, wpb=495285, bsz=16275.8, num_updates=13900, lr=0.000536442, gnorm=0.219, clip=0, loss_scale=4, train_wall=88, gb_free=22.5, wall=12909 epoch 009: 409 / 1689 loss=3.849, nll_loss=2.328, ppl=5.02, wps=552876, ups=1.12, wpb=495285, bsz=16275.8, num_updates=13900, lr=0.000536442, gnorm=0.219, clip=0, loss_scale=4, train_wall=88, gb_free=22.5, wall=12909 epoch 009: 409 / 1689 loss=3.849, nll_loss=2.328, ppl=5.02, wps=552876, ups=1.12, wpb=495285, bsz=16275.8, num_updates=13900, lr=0.000536442, gnorm=0.219, clip=0, loss_scale=4, train_wall=88, gb_free=22.5, wall=12909 epoch 009: 409 / 1689 loss=3.849, nll_loss=2.328, ppl=5.02, wps=552876, ups=1.12, wpb=495285, bsz=16275.8, num_updates=13900, lr=0.000536442, gnorm=0.219, clip=0, loss_scale=4, train_wall=88, gb_free=22.5, wall=12909 epoch 009: 409 / 1689 loss=3.849, nll_loss=2.328, ppl=5.02, wps=552876, ups=1.12, wpb=495285, bsz=16275.8, num_updates=13900, lr=0.000536442, gnorm=0.219, clip=0, loss_scale=4, train_wall=88, gb_free=22.5, wall=12909 epoch 009: 409 / 1689 loss=3.849, nll_loss=2.328, ppl=5.02, wps=552876, ups=1.12, wpb=495285, bsz=16275.8, num_updates=13900, lr=0.000536442, gnorm=0.219, clip=0, loss_scale=4, train_wall=88, gb_free=22.5, wall=12909 epoch 009: 409 / 1689 loss=3.849, nll_loss=2.328, ppl=5.02, wps=552876, ups=1.12, wpb=495285, bsz=16275.8, num_updates=13900, lr=0.000536442, gnorm=0.219, clip=0, loss_scale=4, train_wall=88, gb_free=22.5, wall=12909 epoch 009: 409 / 1689 loss=3.849, nll_loss=2.328, ppl=5.02, wps=552876, ups=1.12, wpb=495285, bsz=16275.8, num_updates=13900, lr=0.000536442, gnorm=0.219, clip=0, loss_scale=4, train_wall=88, gb_free=22.5, wall=12909 epoch 009: 409 / 1689 loss=3.849, nll_loss=2.328, ppl=5.02, wps=552876, ups=1.12, wpb=495285, bsz=16275.8, num_updates=13900, lr=0.000536442, gnorm=0.219, clip=0, loss_scale=4, train_wall=88, gb_free=22.5, wall=12909 epoch 009: 510 / 1689 loss=3.844, nll_loss=2.323, ppl=5, wps=547403, ups=1.11, wpb=494839, bsz=16497.8, num_updates=14000, lr=0.000534522, gnorm=0.216, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=12999 epoch 009: 510 / 1689 loss=3.844, nll_loss=2.323, ppl=5, wps=547403, ups=1.11, wpb=494839, bsz=16497.8, num_updates=14000, lr=0.000534522, gnorm=0.216, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=12999 epoch 009: 510 / 1689 loss=3.844, nll_loss=2.323, ppl=5, wps=547403, ups=1.11, wpb=494839, bsz=16497.8, num_updates=14000, lr=0.000534522, gnorm=0.216, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=12999 epoch 009: 510 / 1689 loss=3.844, nll_loss=2.323, ppl=5, wps=547403, ups=1.11, wpb=494839, bsz=16497.8, num_updates=14000, lr=0.000534522, gnorm=0.216, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=12999 epoch 009: 510 / 1689 loss=3.844, nll_loss=2.323, ppl=5, wps=547403, ups=1.11, wpb=494839, bsz=16497.8, num_updates=14000, lr=0.000534522, gnorm=0.216, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=12999 epoch 009: 510 / 1689 loss=3.844, nll_loss=2.323, ppl=5, wps=547403, ups=1.11, wpb=494839, bsz=16497.8, num_updates=14000, lr=0.000534522, gnorm=0.216, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=12999 epoch 009: 510 / 1689 loss=3.844, nll_loss=2.323, ppl=5, wps=547403, ups=1.11, wpb=494839, bsz=16497.8, num_updates=14000, lr=0.000534522, gnorm=0.216, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=12999 epoch 009: 510 / 1689 loss=3.844, nll_loss=2.323, ppl=5, wps=547403, ups=1.11, wpb=494839, bsz=16497.8, num_updates=14000, lr=0.000534522, gnorm=0.216, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=12999 epoch 009: 510 / 1689 loss=3.844, nll_loss=2.323, ppl=5, wps=547403, ups=1.11, wpb=494839, bsz=16497.8, num_updates=14000, lr=0.000534522, gnorm=0.216, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=12999 begin validation on "valid" subset epoch 009 | valid on 'valid' subset | loss 3.832 | nll_loss 2.254 | ppl 4.77 | wps 0 | wpb 44526 | bsz 2008 | num_updates 14000 | best_loss 3.826 epoch 009 | valid on 'valid' subset | loss 3.832 | nll_loss 2.254 | ppl 4.77 | wps 0 | wpb 44526 | bsz 2008 | num_updates 14000 | best_loss 3.826 epoch 009 | valid on 'valid' subset | loss 3.832 | nll_loss 2.254 | ppl 4.77 | wps 0 | wpb 44526 | bsz 2008 | num_updates 14000 | best_loss 3.826 epoch 009 | valid on 'valid' subset | loss 3.832 | nll_loss 2.254 | ppl 4.77 | wps 0 | wpb 44526 | bsz 2008 | num_updates 14000 | best_loss 3.826 epoch 009 | valid on 'valid' subset | loss 3.832 | nll_loss 2.254 | ppl 4.77 | wps 0 | wpb 44526 | bsz 2008 | num_updates 14000 | best_loss 3.826 epoch 009 | valid on 'valid' subset | loss 3.832 | nll_loss 2.254 | ppl 4.77 | wps 0 | wpb 44526 | bsz 2008 | num_updates 14000 | best_loss 3.826 epoch 009 | valid on 'valid' subset | loss 3.832 | nll_loss 2.254 | ppl 4.77 | wps 0 | wpb 44526 | bsz 2008 | num_updates 14000 | best_loss 3.826 epoch 009 | valid on 'valid' subset | loss 3.832 | nll_loss 2.254 | ppl 4.77 | wps 0 | wpb 44526 | bsz 2008 | num_updates 14000 | best_loss 3.826 epoch 009 | valid on 'valid' subset | loss 3.832 | nll_loss 2.254 | ppl 4.77 | wps 0 | wpb 44526 | bsz 2008 | num_updates 14000 | best_loss 3.826 epoch 009: 610 / 1689 loss=3.846, nll_loss=2.324, ppl=5.01, wps=489364, ups=0.99, wpb=495231, bsz=16346.6, num_updates=14100, lr=0.000532624, gnorm=0.22, clip=0, loss_scale=4, train_wall=88, gb_free=20.9, wall=13100 epoch 009: 610 / 1689 loss=3.846, nll_loss=2.324, ppl=5.01, wps=489364, ups=0.99, wpb=495231, bsz=16346.6, num_updates=14100, lr=0.000532624, gnorm=0.22, clip=0, loss_scale=4, train_wall=88, gb_free=20.9, wall=13100 epoch 009: 610 / 1689 loss=3.846, nll_loss=2.324, ppl=5.01, wps=489364, ups=0.99, wpb=495231, bsz=16346.6, num_updates=14100, lr=0.000532624, gnorm=0.22, clip=0, loss_scale=4, train_wall=88, gb_free=20.9, wall=13100 epoch 009: 610 / 1689 loss=3.846, nll_loss=2.324, ppl=5.01, wps=489364, ups=0.99, wpb=495231, bsz=16346.6, num_updates=14100, lr=0.000532624, gnorm=0.22, clip=0, loss_scale=4, train_wall=88, gb_free=20.9, wall=13100 epoch 009: 610 / 1689 loss=3.846, nll_loss=2.324, ppl=5.01, wps=489364, ups=0.99, wpb=495231, bsz=16346.6, num_updates=14100, lr=0.000532624, gnorm=0.22, clip=0, loss_scale=4, train_wall=88, gb_free=20.9, wall=13100 epoch 009: 610 / 1689 loss=3.846, nll_loss=2.324, ppl=5.01, wps=489364, ups=0.99, wpb=495231, bsz=16346.6, num_updates=14100, lr=0.000532624, gnorm=0.22, clip=0, loss_scale=4, train_wall=88, gb_free=20.9, wall=13100 epoch 009: 610 / 1689 loss=3.846, nll_loss=2.324, ppl=5.01, wps=489364, ups=0.99, wpb=495231, bsz=16346.6, num_updates=14100, lr=0.000532624, gnorm=0.22, clip=0, loss_scale=4, train_wall=88, gb_free=20.9, wall=13100 epoch 009: 610 / 1689 loss=3.846, nll_loss=2.324, ppl=5.01, wps=489364, ups=0.99, wpb=495231, bsz=16346.6, num_updates=14100, lr=0.000532624, gnorm=0.22, clip=0, loss_scale=4, train_wall=88, gb_free=20.9, wall=13100 epoch 009: 610 / 1689 loss=3.846, nll_loss=2.324, ppl=5.01, wps=489364, ups=0.99, wpb=495231, bsz=16346.6, num_updates=14100, lr=0.000532624, gnorm=0.22, clip=0, loss_scale=4, train_wall=88, gb_free=20.9, wall=13100 epoch 009: 710 / 1689 loss=3.842, nll_loss=2.32, ppl=4.99, wps=552990, ups=1.12, wpb=495538, bsz=16520.7, num_updates=14200, lr=0.000530745, gnorm=0.216, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=13190 epoch 009: 710 / 1689 loss=3.842, nll_loss=2.32, ppl=4.99, wps=552990, ups=1.12, wpb=495538, bsz=16520.7, num_updates=14200, lr=0.000530745, gnorm=0.216, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=13190 epoch 009: 710 / 1689 loss=3.842, nll_loss=2.32, ppl=4.99, wps=552990, ups=1.12, wpb=495538, bsz=16520.7, num_updates=14200, lr=0.000530745, gnorm=0.216, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=13190 epoch 009: 710 / 1689 loss=3.842, nll_loss=2.32, ppl=4.99, wps=552990, ups=1.12, wpb=495538, bsz=16520.7, num_updates=14200, lr=0.000530745, gnorm=0.216, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=13190 epoch 009: 710 / 1689 loss=3.842, nll_loss=2.32, ppl=4.99, wps=552990, ups=1.12, wpb=495538, bsz=16520.7, num_updates=14200, lr=0.000530745, gnorm=0.216, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=13190 epoch 009: 710 / 1689 loss=3.842, nll_loss=2.32, ppl=4.99, wps=552990, ups=1.12, wpb=495538, bsz=16520.7, num_updates=14200, lr=0.000530745, gnorm=0.216, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=13190 epoch 009: 710 / 1689 loss=3.842, nll_loss=2.32, ppl=4.99, wps=552990, ups=1.12, wpb=495538, bsz=16520.7, num_updates=14200, lr=0.000530745, gnorm=0.216, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=13190 epoch 009: 710 / 1689 loss=3.842, nll_loss=2.32, ppl=4.99, wps=552990, ups=1.12, wpb=495538, bsz=16520.7, num_updates=14200, lr=0.000530745, gnorm=0.216, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=13190 epoch 009: 710 / 1689 loss=3.842, nll_loss=2.32, ppl=4.99, wps=552990, ups=1.12, wpb=495538, bsz=16520.7, num_updates=14200, lr=0.000530745, gnorm=0.216, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=13190 epoch 009: 810 / 1689 loss=3.849, nll_loss=2.328, ppl=5.02, wps=552853, ups=1.12, wpb=494650, bsz=16655.6, num_updates=14300, lr=0.000528886, gnorm=0.214, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=13279 epoch 009: 810 / 1689 loss=3.849, nll_loss=2.328, ppl=5.02, wps=552853, ups=1.12, wpb=494650, bsz=16655.6, num_updates=14300, lr=0.000528886, gnorm=0.214, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=13279 epoch 009: 810 / 1689 loss=3.849, nll_loss=2.328, ppl=5.02, wps=552853, ups=1.12, wpb=494650, bsz=16655.6, num_updates=14300, lr=0.000528886, gnorm=0.214, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=13279 epoch 009: 810 / 1689 loss=3.849, nll_loss=2.328, ppl=5.02, wps=552853, ups=1.12, wpb=494650, bsz=16655.6, num_updates=14300, lr=0.000528886, gnorm=0.214, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=13279 epoch 009: 810 / 1689 loss=3.849, nll_loss=2.328, ppl=5.02, wps=552853, ups=1.12, wpb=494650, bsz=16655.6, num_updates=14300, lr=0.000528886, gnorm=0.214, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=13279 epoch 009: 810 / 1689 loss=3.849, nll_loss=2.328, ppl=5.02, wps=552853, ups=1.12, wpb=494650, bsz=16655.6, num_updates=14300, lr=0.000528886, gnorm=0.214, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=13279 epoch 009: 810 / 1689 loss=3.849, nll_loss=2.328, ppl=5.02, wps=552853, ups=1.12, wpb=494650, bsz=16655.6, num_updates=14300, lr=0.000528886, gnorm=0.214, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=13279 epoch 009: 810 / 1689 loss=3.849, nll_loss=2.328, ppl=5.02, wps=552853, ups=1.12, wpb=494650, bsz=16655.6, num_updates=14300, lr=0.000528886, gnorm=0.214, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=13279 epoch 009: 810 / 1689 loss=3.849, nll_loss=2.328, ppl=5.02, wps=552853, ups=1.12, wpb=494650, bsz=16655.6, num_updates=14300, lr=0.000528886, gnorm=0.214, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=13279 epoch 009: 910 / 1689 loss=3.851, nll_loss=2.33, ppl=5.03, wps=552054, ups=1.11, wpb=495446, bsz=16689.2, num_updates=14400, lr=0.000527046, gnorm=0.218, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=13369 epoch 009: 910 / 1689 loss=3.851, nll_loss=2.33, ppl=5.03, wps=552054, ups=1.11, wpb=495446, bsz=16689.2, num_updates=14400, lr=0.000527046, gnorm=0.218, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=13369 epoch 009: 910 / 1689 loss=3.851, nll_loss=2.33, ppl=5.03, wps=552054, ups=1.11, wpb=495446, bsz=16689.2, num_updates=14400, lr=0.000527046, gnorm=0.218, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=13369 epoch 009: 910 / 1689 loss=3.851, nll_loss=2.33, ppl=5.03, wps=552054, ups=1.11, wpb=495446, bsz=16689.2, num_updates=14400, lr=0.000527046, gnorm=0.218, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=13369 epoch 009: 910 / 1689 loss=3.851, nll_loss=2.33, ppl=5.03, wps=552054, ups=1.11, wpb=495446, bsz=16689.2, num_updates=14400, lr=0.000527046, gnorm=0.218, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=13369 epoch 009: 910 / 1689 loss=3.851, nll_loss=2.33, ppl=5.03, wps=552054, ups=1.11, wpb=495446, bsz=16689.2, num_updates=14400, lr=0.000527046, gnorm=0.218, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=13369 epoch 009: 910 / 1689 loss=3.851, nll_loss=2.33, ppl=5.03, wps=552054, ups=1.11, wpb=495446, bsz=16689.2, num_updates=14400, lr=0.000527046, gnorm=0.218, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=13369 epoch 009: 910 / 1689 loss=3.851, nll_loss=2.33, ppl=5.03, wps=552054, ups=1.11, wpb=495446, bsz=16689.2, num_updates=14400, lr=0.000527046, gnorm=0.218, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=13369 epoch 009: 910 / 1689 loss=3.851, nll_loss=2.33, ppl=5.03, wps=552054, ups=1.11, wpb=495446, bsz=16689.2, num_updates=14400, lr=0.000527046, gnorm=0.218, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=13369 epoch 009: 1011 / 1689 loss=3.847, nll_loss=2.326, ppl=5.01, wps=540950, ups=1.09, wpb=494835, bsz=16341.9, num_updates=14500, lr=0.000525226, gnorm=0.207, clip=0, loss_scale=4, train_wall=90, gb_free=21.3, wall=13461 epoch 009: 1011 / 1689 loss=3.847, nll_loss=2.326, ppl=5.01, wps=540950, ups=1.09, wpb=494835, bsz=16341.9, num_updates=14500, lr=0.000525226, gnorm=0.207, clip=0, loss_scale=4, train_wall=90, gb_free=21.3, wall=13461 epoch 009: 1011 / 1689 loss=3.847, nll_loss=2.326, ppl=5.01, wps=540950, ups=1.09, wpb=494835, bsz=16341.9, num_updates=14500, lr=0.000525226, gnorm=0.207, clip=0, loss_scale=4, train_wall=90, gb_free=21.3, wall=13461 epoch 009: 1011 / 1689 loss=3.847, nll_loss=2.326, ppl=5.01, wps=540950, ups=1.09, wpb=494835, bsz=16341.9, num_updates=14500, lr=0.000525226, gnorm=0.207, clip=0, loss_scale=4, train_wall=90, gb_free=21.3, wall=13461 epoch 009: 1011 / 1689 loss=3.847, nll_loss=2.326, ppl=5.01, wps=540950, ups=1.09, wpb=494835, bsz=16341.9, num_updates=14500, lr=0.000525226, gnorm=0.207, clip=0, loss_scale=4, train_wall=90, gb_free=21.3, wall=13461 epoch 009: 1011 / 1689 loss=3.847, nll_loss=2.326, ppl=5.01, wps=540950, ups=1.09, wpb=494835, bsz=16341.9, num_updates=14500, lr=0.000525226, gnorm=0.207, clip=0, loss_scale=4, train_wall=90, gb_free=21.3, wall=13461 epoch 009: 1011 / 1689 loss=3.847, nll_loss=2.326, ppl=5.01, wps=540950, ups=1.09, wpb=494835, bsz=16341.9, num_updates=14500, lr=0.000525226, gnorm=0.207, clip=0, loss_scale=4, train_wall=90, gb_free=21.3, wall=13461 epoch 009: 1011 / 1689 loss=3.847, nll_loss=2.326, ppl=5.01, wps=540950, ups=1.09, wpb=494835, bsz=16341.9, num_updates=14500, lr=0.000525226, gnorm=0.207, clip=0, loss_scale=4, train_wall=90, gb_free=21.3, wall=13461 epoch 009: 1011 / 1689 loss=3.847, nll_loss=2.326, ppl=5.01, wps=540950, ups=1.09, wpb=494835, bsz=16341.9, num_updates=14500, lr=0.000525226, gnorm=0.207, clip=0, loss_scale=4, train_wall=90, gb_free=21.3, wall=13461 epoch 009: 1111 / 1689 loss=3.851, nll_loss=2.331, ppl=5.03, wps=545205, ups=1.1, wpb=494900, bsz=16756.2, num_updates=14600, lr=0.000523424, gnorm=0.212, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=13551 epoch 009: 1111 / 1689 loss=3.851, nll_loss=2.331, ppl=5.03, wps=545205, ups=1.1, wpb=494900, bsz=16756.2, num_updates=14600, lr=0.000523424, gnorm=0.212, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=13551 epoch 009: 1111 / 1689 loss=3.851, nll_loss=2.331, ppl=5.03, wps=545205, ups=1.1, wpb=494900, bsz=16756.2, num_updates=14600, lr=0.000523424, gnorm=0.212, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=13551 epoch 009: 1111 / 1689 loss=3.851, nll_loss=2.331, ppl=5.03, wps=545205, ups=1.1, wpb=494900, bsz=16756.2, num_updates=14600, lr=0.000523424, gnorm=0.212, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=13551 epoch 009: 1111 / 1689 loss=3.851, nll_loss=2.331, ppl=5.03, wps=545205, ups=1.1, wpb=494900, bsz=16756.2, num_updates=14600, lr=0.000523424, gnorm=0.212, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=13551 epoch 009: 1111 / 1689 loss=3.851, nll_loss=2.331, ppl=5.03, wps=545205, ups=1.1, wpb=494900, bsz=16756.2, num_updates=14600, lr=0.000523424, gnorm=0.212, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=13551 epoch 009: 1111 / 1689 loss=3.851, nll_loss=2.331, ppl=5.03, wps=545205, ups=1.1, wpb=494900, bsz=16756.2, num_updates=14600, lr=0.000523424, gnorm=0.212, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=13551 epoch 009: 1111 / 1689 loss=3.851, nll_loss=2.331, ppl=5.03, wps=545205, ups=1.1, wpb=494900, bsz=16756.2, num_updates=14600, lr=0.000523424, gnorm=0.212, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=13551 epoch 009: 1111 / 1689 loss=3.851, nll_loss=2.331, ppl=5.03, wps=545205, ups=1.1, wpb=494900, bsz=16756.2, num_updates=14600, lr=0.000523424, gnorm=0.212, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=13551 epoch 009: 1212 / 1689 loss=3.844, nll_loss=2.322, ppl=5, wps=547235, ups=1.1, wpb=495387, bsz=16196.2, num_updates=14700, lr=0.000521641, gnorm=0.221, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=13642 epoch 009: 1212 / 1689 loss=3.844, nll_loss=2.322, ppl=5, wps=547235, ups=1.1, wpb=495387, bsz=16196.2, num_updates=14700, lr=0.000521641, gnorm=0.221, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=13642 epoch 009: 1212 / 1689 loss=3.844, nll_loss=2.322, ppl=5, wps=547235, ups=1.1, wpb=495387, bsz=16196.2, num_updates=14700, lr=0.000521641, gnorm=0.221, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=13642 epoch 009: 1212 / 1689 loss=3.844, nll_loss=2.322, ppl=5, wps=547235, ups=1.1, wpb=495387, bsz=16196.2, num_updates=14700, lr=0.000521641, gnorm=0.221, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=13642 epoch 009: 1212 / 1689 loss=3.844, nll_loss=2.322, ppl=5, wps=547235, ups=1.1, wpb=495387, bsz=16196.2, num_updates=14700, lr=0.000521641, gnorm=0.221, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=13642 epoch 009: 1212 / 1689 loss=3.844, nll_loss=2.322, ppl=5, wps=547235, ups=1.1, wpb=495387, bsz=16196.2, num_updates=14700, lr=0.000521641, gnorm=0.221, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=13642 epoch 009: 1212 / 1689 loss=3.844, nll_loss=2.322, ppl=5, wps=547235, ups=1.1, wpb=495387, bsz=16196.2, num_updates=14700, lr=0.000521641, gnorm=0.221, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=13642 epoch 009: 1212 / 1689 loss=3.844, nll_loss=2.322, ppl=5, wps=547235, ups=1.1, wpb=495387, bsz=16196.2, num_updates=14700, lr=0.000521641, gnorm=0.221, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=13642 epoch 009: 1212 / 1689 loss=3.844, nll_loss=2.322, ppl=5, wps=547235, ups=1.1, wpb=495387, bsz=16196.2, num_updates=14700, lr=0.000521641, gnorm=0.221, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=13642 epoch 009: 1312 / 1689 loss=3.845, nll_loss=2.323, ppl=5, wps=549732, ups=1.11, wpb=495872, bsz=16358.4, num_updates=14800, lr=0.000519875, gnorm=0.216, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=13732 epoch 009: 1312 / 1689 loss=3.845, nll_loss=2.323, ppl=5, wps=549732, ups=1.11, wpb=495872, bsz=16358.4, num_updates=14800, lr=0.000519875, gnorm=0.216, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=13732 epoch 009: 1312 / 1689 loss=3.845, nll_loss=2.323, ppl=5, wps=549732, ups=1.11, wpb=495872, bsz=16358.4, num_updates=14800, lr=0.000519875, gnorm=0.216, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=13732 epoch 009: 1312 / 1689 loss=3.845, nll_loss=2.323, ppl=5, wps=549732, ups=1.11, wpb=495872, bsz=16358.4, num_updates=14800, lr=0.000519875, gnorm=0.216, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=13732 epoch 009: 1312 / 1689 loss=3.845, nll_loss=2.323, ppl=5, wps=549732, ups=1.11, wpb=495872, bsz=16358.4, num_updates=14800, lr=0.000519875, gnorm=0.216, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=13732 epoch 009: 1312 / 1689 loss=3.845, nll_loss=2.323, ppl=5, wps=549732, ups=1.11, wpb=495872, bsz=16358.4, num_updates=14800, lr=0.000519875, gnorm=0.216, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=13732 epoch 009: 1312 / 1689 loss=3.845, nll_loss=2.323, ppl=5, wps=549732, ups=1.11, wpb=495872, bsz=16358.4, num_updates=14800, lr=0.000519875, gnorm=0.216, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=13732 epoch 009: 1312 / 1689 loss=3.845, nll_loss=2.323, ppl=5, wps=549732, ups=1.11, wpb=495872, bsz=16358.4, num_updates=14800, lr=0.000519875, gnorm=0.216, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=13732 epoch 009: 1312 / 1689 loss=3.845, nll_loss=2.323, ppl=5, wps=549732, ups=1.11, wpb=495872, bsz=16358.4, num_updates=14800, lr=0.000519875, gnorm=0.216, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=13732 epoch 009: 1412 / 1689 loss=3.841, nll_loss=2.319, ppl=4.99, wps=549627, ups=1.11, wpb=495126, bsz=16667.6, num_updates=14900, lr=0.000518128, gnorm=0.219, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=13822 epoch 009: 1412 / 1689 loss=3.841, nll_loss=2.319, ppl=4.99, wps=549627, ups=1.11, wpb=495126, bsz=16667.6, num_updates=14900, lr=0.000518128, gnorm=0.219, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=13822 epoch 009: 1412 / 1689 loss=3.841, nll_loss=2.319, ppl=4.99, wps=549627, ups=1.11, wpb=495126, bsz=16667.6, num_updates=14900, lr=0.000518128, gnorm=0.219, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=13822 epoch 009: 1412 / 1689 loss=3.841, nll_loss=2.319, ppl=4.99, wps=549627, ups=1.11, wpb=495126, bsz=16667.6, num_updates=14900, lr=0.000518128, gnorm=0.219, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=13822 epoch 009: 1412 / 1689 loss=3.841, nll_loss=2.319, ppl=4.99, wps=549627, ups=1.11, wpb=495126, bsz=16667.6, num_updates=14900, lr=0.000518128, gnorm=0.219, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=13822 epoch 009: 1412 / 1689 loss=3.841, nll_loss=2.319, ppl=4.99, wps=549627, ups=1.11, wpb=495126, bsz=16667.6, num_updates=14900, lr=0.000518128, gnorm=0.219, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=13822 epoch 009: 1412 / 1689 loss=3.841, nll_loss=2.319, ppl=4.99, wps=549627, ups=1.11, wpb=495126, bsz=16667.6, num_updates=14900, lr=0.000518128, gnorm=0.219, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=13822 epoch 009: 1412 / 1689 loss=3.841, nll_loss=2.319, ppl=4.99, wps=549627, ups=1.11, wpb=495126, bsz=16667.6, num_updates=14900, lr=0.000518128, gnorm=0.219, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=13822 epoch 009: 1412 / 1689 loss=3.841, nll_loss=2.319, ppl=4.99, wps=549627, ups=1.11, wpb=495126, bsz=16667.6, num_updates=14900, lr=0.000518128, gnorm=0.219, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=13822 epoch 009: 1512 / 1689 loss=3.849, nll_loss=2.329, ppl=5.03, wps=548717, ups=1.11, wpb=495721, bsz=16446.6, num_updates=15000, lr=0.000516398, gnorm=0.22, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=13913 epoch 009: 1512 / 1689 loss=3.849, nll_loss=2.329, ppl=5.03, wps=548717, ups=1.11, wpb=495721, bsz=16446.6, num_updates=15000, lr=0.000516398, gnorm=0.22, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=13913 epoch 009: 1512 / 1689 loss=3.849, nll_loss=2.329, ppl=5.03, wps=548717, ups=1.11, wpb=495721, bsz=16446.6, num_updates=15000, lr=0.000516398, gnorm=0.22, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=13913 epoch 009: 1512 / 1689 loss=3.849, nll_loss=2.329, ppl=5.03, wps=548717, ups=1.11, wpb=495721, bsz=16446.6, num_updates=15000, lr=0.000516398, gnorm=0.22, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=13913 epoch 009: 1512 / 1689 loss=3.849, nll_loss=2.329, ppl=5.03, wps=548717, ups=1.11, wpb=495721, bsz=16446.6, num_updates=15000, lr=0.000516398, gnorm=0.22, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=13913 epoch 009: 1512 / 1689 loss=3.849, nll_loss=2.329, ppl=5.03, wps=548717, ups=1.11, wpb=495721, bsz=16446.6, num_updates=15000, lr=0.000516398, gnorm=0.22, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=13913 epoch 009: 1512 / 1689 loss=3.849, nll_loss=2.329, ppl=5.03, wps=548717, ups=1.11, wpb=495721, bsz=16446.6, num_updates=15000, lr=0.000516398, gnorm=0.22, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=13913 epoch 009: 1512 / 1689 loss=3.849, nll_loss=2.329, ppl=5.03, wps=548717, ups=1.11, wpb=495721, bsz=16446.6, num_updates=15000, lr=0.000516398, gnorm=0.22, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=13913 epoch 009: 1512 / 1689 loss=3.849, nll_loss=2.329, ppl=5.03, wps=548717, ups=1.11, wpb=495721, bsz=16446.6, num_updates=15000, lr=0.000516398, gnorm=0.22, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=13913 begin validation on "valid" subset epoch 009 | valid on 'valid' subset | loss 3.806 | nll_loss 2.232 | ppl 4.7 | wps 0 | wpb 44526 | bsz 2008 | num_updates 15000 | best_loss 3.806 epoch 009 | valid on 'valid' subset | loss 3.806 | nll_loss 2.232 | ppl 4.7 | wps 0 | wpb 44526 | bsz 2008 | num_updates 15000 | best_loss 3.806 epoch 009 | valid on 'valid' subset | loss 3.806 | nll_loss 2.232 | ppl 4.7 | wps 0 | wpb 44526 | bsz 2008 | num_updates 15000 | best_loss 3.806 epoch 009 | valid on 'valid' subset | loss 3.806 | nll_loss 2.232 | ppl 4.7 | wps 0 | wpb 44526 | bsz 2008 | num_updates 15000 | best_loss 3.806 epoch 009 | valid on 'valid' subset | loss 3.806 | nll_loss 2.232 | ppl 4.7 | wps 0 | wpb 44526 | bsz 2008 | num_updates 15000 | best_loss 3.806 epoch 009 | valid on 'valid' subset | loss 3.806 | nll_loss 2.232 | ppl 4.7 | wps 0 | wpb 44526 | bsz 2008 | num_updates 15000 | best_loss 3.806 epoch 009 | valid on 'valid' subset | loss 3.806 | nll_loss 2.232 | ppl 4.7 | wps 0 | wpb 44526 | bsz 2008 | num_updates 15000 | best_loss 3.806 epoch 009 | valid on 'valid' subset | loss 3.806 | nll_loss 2.232 | ppl 4.7 | wps 0 | wpb 44526 | bsz 2008 | num_updates 15000 | best_loss 3.806 epoch 009 | valid on 'valid' subset | loss 3.806 | nll_loss 2.232 | ppl 4.7 | wps 0 | wpb 44526 | bsz 2008 | num_updates 15000 | best_loss 3.806 epoch 009: 1612 / 1689 loss=3.842, nll_loss=2.321, ppl=5, wps=457161, ups=0.92, wpb=495497, bsz=16693.8, num_updates=15100, lr=0.000514685, gnorm=0.213, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=14021 epoch 009: 1612 / 1689 loss=3.842, nll_loss=2.321, ppl=5, wps=457161, ups=0.92, wpb=495497, bsz=16693.8, num_updates=15100, lr=0.000514685, gnorm=0.213, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=14021 epoch 009: 1612 / 1689 loss=3.842, nll_loss=2.321, ppl=5, wps=457161, ups=0.92, wpb=495497, bsz=16693.8, num_updates=15100, lr=0.000514685, gnorm=0.213, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=14021 epoch 009: 1612 / 1689 loss=3.842, nll_loss=2.321, ppl=5, wps=457161, ups=0.92, wpb=495497, bsz=16693.8, num_updates=15100, lr=0.000514685, gnorm=0.213, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=14021 epoch 009: 1612 / 1689 loss=3.842, nll_loss=2.321, ppl=5, wps=457161, ups=0.92, wpb=495497, bsz=16693.8, num_updates=15100, lr=0.000514685, gnorm=0.213, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=14021 epoch 009: 1612 / 1689 loss=3.842, nll_loss=2.321, ppl=5, wps=457161, ups=0.92, wpb=495497, bsz=16693.8, num_updates=15100, lr=0.000514685, gnorm=0.213, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=14021 epoch 009: 1612 / 1689 loss=3.842, nll_loss=2.321, ppl=5, wps=457161, ups=0.92, wpb=495497, bsz=16693.8, num_updates=15100, lr=0.000514685, gnorm=0.213, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=14021 epoch 009: 1612 / 1689 loss=3.842, nll_loss=2.321, ppl=5, wps=457161, ups=0.92, wpb=495497, bsz=16693.8, num_updates=15100, lr=0.000514685, gnorm=0.213, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=14021 epoch 009: 1612 / 1689 loss=3.842, nll_loss=2.321, ppl=5, wps=457161, ups=0.92, wpb=495497, bsz=16693.8, num_updates=15100, lr=0.000514685, gnorm=0.213, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=14021 end of epoch 9 (average epoch stats below) epoch 009 | loss 3.846 | nll_loss 2.324 | ppl 5.01 | wps 538665 | ups 1.09 | wpb 495101 | bsz 16504.4 | num_updates 15177 | lr 0.000513378 | gnorm 0.216 | clip 0 | loss_scale 2 | train_wall 1492 | gb_free 25.3 | wall 14089 epoch 009 | loss 3.846 | nll_loss 2.324 | ppl 5.01 | wps 538665 | ups 1.09 | wpb 495101 | bsz 16504.4 | num_updates 15177 | lr 0.000513378 | gnorm 0.216 | clip 0 | loss_scale 2 | train_wall 1492 | gb_free 25.3 | wall 14089 epoch 009 | loss 3.846 | nll_loss 2.324 | ppl 5.01 | wps 538665 | ups 1.09 | wpb 495101 | bsz 16504.4 | num_updates 15177 | lr 0.000513378 | gnorm 0.216 | clip 0 | loss_scale 2 | train_wall 1492 | gb_free 25.3 | wall 14089 epoch 009 | loss 3.846 | nll_loss 2.324 | ppl 5.01 | wps 538665 | ups 1.09 | wpb 495101 | bsz 16504.4 | num_updates 15177 | lr 0.000513378 | gnorm 0.216 | clip 0 | loss_scale 2 | train_wall 1492 | gb_free 25.3 | wall 14089 epoch 009 | loss 3.846 | nll_loss 2.324 | ppl 5.01 | wps 538665 | ups 1.09 | wpb 495101 | bsz 16504.4 | num_updates 15177 | lr 0.000513378 | gnorm 0.216 | clip 0 | loss_scale 2 | train_wall 1492 | gb_free 25.3 | wall 14089 epoch 009 | loss 3.846 | nll_loss 2.324 | ppl 5.01 | wps 538665 | ups 1.09 | wpb 495101 | bsz 16504.4 | num_updates 15177 | lr 0.000513378 | gnorm 0.216 | clip 0 | loss_scale 2 | train_wall 1492 | gb_free 25.3 | wall 14089 epoch 009 | loss 3.846 | nll_loss 2.324 | ppl 5.01 | wps 538665 | ups 1.09 | wpb 495101 | bsz 16504.4 | num_updates 15177 | lr 0.000513378 | gnorm 0.216 | clip 0 | loss_scale 2 | train_wall 1492 | gb_free 25.3 | wall 14089 epoch 009 | loss 3.846 | nll_loss 2.324 | ppl 5.01 | wps 538665 | ups 1.09 | wpb 495101 | bsz 16504.4 | num_updates 15177 | lr 0.000513378 | gnorm 0.216 | clip 0 | loss_scale 2 | train_wall 1492 | gb_free 25.3 | wall 14089 epoch 009 | loss 3.846 | nll_loss 2.324 | ppl 5.01 | wps 538665 | ups 1.09 | wpb 495101 | bsz 16504.4 | num_updates 15177 | lr 0.000513378 | gnorm 0.216 | clip 0 | loss_scale 2 | train_wall 1492 | gb_free 25.3 | wall 14089 Start iterating over samples epoch 010: 23 / 1689 loss=3.843, nll_loss=2.322, ppl=5, wps=535275, ups=1.09, wpb=491735, bsz=16624, num_updates=15200, lr=0.000512989, gnorm=0.207, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=14113 epoch 010: 23 / 1689 loss=3.843, nll_loss=2.322, ppl=5, wps=535275, ups=1.09, wpb=491735, bsz=16624, num_updates=15200, lr=0.000512989, gnorm=0.207, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=14113 epoch 010: 23 / 1689 loss=3.843, nll_loss=2.322, ppl=5, wps=535275, ups=1.09, wpb=491735, bsz=16624, num_updates=15200, lr=0.000512989, gnorm=0.207, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=14113 epoch 010: 23 / 1689 loss=3.843, nll_loss=2.322, ppl=5, wps=535275, ups=1.09, wpb=491735, bsz=16624, num_updates=15200, lr=0.000512989, gnorm=0.207, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=14113 epoch 010: 23 / 1689 loss=3.843, nll_loss=2.322, ppl=5, wps=535275, ups=1.09, wpb=491735, bsz=16624, num_updates=15200, lr=0.000512989, gnorm=0.207, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=14113 epoch 010: 23 / 1689 loss=3.843, nll_loss=2.322, ppl=5, wps=535275, ups=1.09, wpb=491735, bsz=16624, num_updates=15200, lr=0.000512989, gnorm=0.207, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=14113 epoch 010: 23 / 1689 loss=3.843, nll_loss=2.322, ppl=5, wps=535275, ups=1.09, wpb=491735, bsz=16624, num_updates=15200, lr=0.000512989, gnorm=0.207, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=14113 epoch 010: 23 / 1689 loss=3.843, nll_loss=2.322, ppl=5, wps=535275, ups=1.09, wpb=491735, bsz=16624, num_updates=15200, lr=0.000512989, gnorm=0.207, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=14113 epoch 010: 23 / 1689 loss=3.843, nll_loss=2.322, ppl=5, wps=535275, ups=1.09, wpb=491735, bsz=16624, num_updates=15200, lr=0.000512989, gnorm=0.207, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=14113 epoch 010: 23 / 1689 loss=3.843, nll_loss=2.322, ppl=5, wps=535275, ups=1.09, wpb=491735, bsz=16624, num_updates=15200, lr=0.000512989, gnorm=0.207, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=14113 epoch 010: 124 / 1689 loss=3.813, nll_loss=2.287, ppl=4.88, wps=541555, ups=1.09, wpb=495515, bsz=16349.3, num_updates=15300, lr=0.00051131, gnorm=0.214, clip=0, loss_scale=2, train_wall=90, gb_free=22.6, wall=14204 epoch 010: 124 / 1689 loss=3.813, nll_loss=2.287, ppl=4.88, wps=541555, ups=1.09, wpb=495515, bsz=16349.3, num_updates=15300, lr=0.00051131, gnorm=0.214, clip=0, loss_scale=2, train_wall=90, gb_free=22.6, wall=14204 epoch 010: 124 / 1689 loss=3.813, nll_loss=2.287, ppl=4.88, wps=541555, ups=1.09, wpb=495515, bsz=16349.3, num_updates=15300, lr=0.00051131, gnorm=0.214, clip=0, loss_scale=2, train_wall=90, gb_free=22.6, wall=14204 epoch 010: 124 / 1689 loss=3.813, nll_loss=2.287, ppl=4.88, wps=541555, ups=1.09, wpb=495515, bsz=16349.3, num_updates=15300, lr=0.00051131, gnorm=0.214, clip=0, loss_scale=2, train_wall=90, gb_free=22.6, wall=14204 epoch 010: 124 / 1689 loss=3.813, nll_loss=2.287, ppl=4.88, wps=541555, ups=1.09, wpb=495515, bsz=16349.3, num_updates=15300, lr=0.00051131, gnorm=0.214, clip=0, loss_scale=2, train_wall=90, gb_free=22.6, wall=14204 epoch 010: 124 / 1689 loss=3.813, nll_loss=2.287, ppl=4.88, wps=541555, ups=1.09, wpb=495515, bsz=16349.3, num_updates=15300, lr=0.00051131, gnorm=0.214, clip=0, loss_scale=2, train_wall=90, gb_free=22.6, wall=14204 epoch 010: 124 / 1689 loss=3.813, nll_loss=2.287, ppl=4.88, wps=541555, ups=1.09, wpb=495515, bsz=16349.3, num_updates=15300, lr=0.00051131, gnorm=0.214, clip=0, loss_scale=2, train_wall=90, gb_free=22.6, wall=14204 epoch 010: 124 / 1689 loss=3.813, nll_loss=2.287, ppl=4.88, wps=541555, ups=1.09, wpb=495515, bsz=16349.3, num_updates=15300, lr=0.00051131, gnorm=0.214, clip=0, loss_scale=2, train_wall=90, gb_free=22.6, wall=14204 epoch 010: 124 / 1689 loss=3.813, nll_loss=2.287, ppl=4.88, wps=541555, ups=1.09, wpb=495515, bsz=16349.3, num_updates=15300, lr=0.00051131, gnorm=0.214, clip=0, loss_scale=2, train_wall=90, gb_free=22.6, wall=14204 epoch 010: 124 / 1689 loss=3.813, nll_loss=2.287, ppl=4.88, wps=541555, ups=1.09, wpb=495515, bsz=16349.3, num_updates=15300, lr=0.00051131, gnorm=0.214, clip=0, loss_scale=2, train_wall=90, gb_free=22.6, wall=14204 epoch 010: 224 / 1689 loss=3.827, nll_loss=2.303, ppl=4.94, wps=551328, ups=1.11, wpb=495280, bsz=16744.1, num_updates=15400, lr=0.000509647, gnorm=0.206, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=14294 epoch 010: 224 / 1689 loss=3.827, nll_loss=2.303, ppl=4.94, wps=551328, ups=1.11, wpb=495280, bsz=16744.1, num_updates=15400, lr=0.000509647, gnorm=0.206, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=14294 epoch 010: 224 / 1689 loss=3.827, nll_loss=2.303, ppl=4.94, wps=551328, ups=1.11, wpb=495280, bsz=16744.1, num_updates=15400, lr=0.000509647, gnorm=0.206, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=14294 epoch 010: 224 / 1689 loss=3.827, nll_loss=2.303, ppl=4.94, wps=551328, ups=1.11, wpb=495280, bsz=16744.1, num_updates=15400, lr=0.000509647, gnorm=0.206, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=14294 epoch 010: 224 / 1689 loss=3.827, nll_loss=2.303, ppl=4.94, wps=551328, ups=1.11, wpb=495280, bsz=16744.1, num_updates=15400, lr=0.000509647, gnorm=0.206, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=14294 epoch 010: 224 / 1689 loss=3.827, nll_loss=2.303, ppl=4.94, wps=551328, ups=1.11, wpb=495280, bsz=16744.1, num_updates=15400, lr=0.000509647, gnorm=0.206, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=14294 epoch 010: 224 / 1689 loss=3.827, nll_loss=2.303, ppl=4.94, wps=551328, ups=1.11, wpb=495280, bsz=16744.1, num_updates=15400, lr=0.000509647, gnorm=0.206, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=14294 epoch 010: 224 / 1689 loss=3.827, nll_loss=2.303, ppl=4.94, wps=551328, ups=1.11, wpb=495280, bsz=16744.1, num_updates=15400, lr=0.000509647, gnorm=0.206, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=14294 epoch 010: 224 / 1689 loss=3.827, nll_loss=2.303, ppl=4.94, wps=551328, ups=1.11, wpb=495280, bsz=16744.1, num_updates=15400, lr=0.000509647, gnorm=0.206, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=14294 epoch 010: 224 / 1689 loss=3.827, nll_loss=2.303, ppl=4.94, wps=551328, ups=1.11, wpb=495280, bsz=16744.1, num_updates=15400, lr=0.000509647, gnorm=0.206, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=14294 epoch 010: 324 / 1689 loss=3.831, nll_loss=2.308, ppl=4.95, wps=548171, ups=1.11, wpb=495349, bsz=16496.1, num_updates=15500, lr=0.000508001, gnorm=0.208, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=14385 epoch 010: 324 / 1689 loss=3.831, nll_loss=2.308, ppl=4.95, wps=548171, ups=1.11, wpb=495349, bsz=16496.1, num_updates=15500, lr=0.000508001, gnorm=0.208, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=14385 epoch 010: 324 / 1689 loss=3.831, nll_loss=2.308, ppl=4.95, wps=548171, ups=1.11, wpb=495349, bsz=16496.1, num_updates=15500, lr=0.000508001, gnorm=0.208, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=14385 epoch 010: 324 / 1689 loss=3.831, nll_loss=2.308, ppl=4.95, wps=548171, ups=1.11, wpb=495349, bsz=16496.1, num_updates=15500, lr=0.000508001, gnorm=0.208, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=14385 epoch 010: 324 / 1689 loss=3.831, nll_loss=2.308, ppl=4.95, wps=548171, ups=1.11, wpb=495349, bsz=16496.1, num_updates=15500, lr=0.000508001, gnorm=0.208, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=14385 epoch 010: 324 / 1689 loss=3.831, nll_loss=2.308, ppl=4.95, wps=548171, ups=1.11, wpb=495349, bsz=16496.1, num_updates=15500, lr=0.000508001, gnorm=0.208, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=14385 epoch 010: 324 / 1689 loss=3.831, nll_loss=2.308, ppl=4.95, wps=548171, ups=1.11, wpb=495349, bsz=16496.1, num_updates=15500, lr=0.000508001, gnorm=0.208, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=14385 epoch 010: 324 / 1689 loss=3.831, nll_loss=2.308, ppl=4.95, wps=548171, ups=1.11, wpb=495349, bsz=16496.1, num_updates=15500, lr=0.000508001, gnorm=0.208, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=14385 epoch 010: 324 / 1689 loss=3.831, nll_loss=2.308, ppl=4.95, wps=548171, ups=1.11, wpb=495349, bsz=16496.1, num_updates=15500, lr=0.000508001, gnorm=0.208, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=14385 epoch 010: 324 / 1689 loss=3.831, nll_loss=2.308, ppl=4.95, wps=548171, ups=1.11, wpb=495349, bsz=16496.1, num_updates=15500, lr=0.000508001, gnorm=0.208, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=14385 epoch 010: 424 / 1689 loss=3.821, nll_loss=2.297, ppl=4.92, wps=550204, ups=1.11, wpb=495917, bsz=16609.7, num_updates=15600, lr=0.00050637, gnorm=0.216, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=14475 epoch 010: 424 / 1689 loss=3.821, nll_loss=2.297, ppl=4.92, wps=550204, ups=1.11, wpb=495917, bsz=16609.7, num_updates=15600, lr=0.00050637, gnorm=0.216, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=14475 epoch 010: 424 / 1689 loss=3.821, nll_loss=2.297, ppl=4.92, wps=550204, ups=1.11, wpb=495917, bsz=16609.7, num_updates=15600, lr=0.00050637, gnorm=0.216, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=14475 epoch 010: 424 / 1689 loss=3.821, nll_loss=2.297, ppl=4.92, wps=550204, ups=1.11, wpb=495917, bsz=16609.7, num_updates=15600, lr=0.00050637, gnorm=0.216, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=14475 epoch 010: 424 / 1689 loss=3.821, nll_loss=2.297, ppl=4.92, wps=550204, ups=1.11, wpb=495917, bsz=16609.7, num_updates=15600, lr=0.00050637, gnorm=0.216, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=14475 epoch 010: 424 / 1689 loss=3.821, nll_loss=2.297, ppl=4.92, wps=550204, ups=1.11, wpb=495917, bsz=16609.7, num_updates=15600, lr=0.00050637, gnorm=0.216, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=14475 epoch 010: 424 / 1689 loss=3.821, nll_loss=2.297, ppl=4.92, wps=550204, ups=1.11, wpb=495917, bsz=16609.7, num_updates=15600, lr=0.00050637, gnorm=0.216, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=14475 epoch 010: 424 / 1689 loss=3.821, nll_loss=2.297, ppl=4.92, wps=550204, ups=1.11, wpb=495917, bsz=16609.7, num_updates=15600, lr=0.00050637, gnorm=0.216, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=14475 epoch 010: 424 / 1689 loss=3.821, nll_loss=2.297, ppl=4.92, wps=550204, ups=1.11, wpb=495917, bsz=16609.7, num_updates=15600, lr=0.00050637, gnorm=0.216, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=14475 epoch 010: 424 / 1689 loss=3.821, nll_loss=2.297, ppl=4.92, wps=550204, ups=1.11, wpb=495917, bsz=16609.7, num_updates=15600, lr=0.00050637, gnorm=0.216, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=14475 epoch 010: 524 / 1689 loss=3.828, nll_loss=2.304, ppl=4.94, wps=552968, ups=1.11, wpb=496666, bsz=16304.5, num_updates=15700, lr=0.000504754, gnorm=0.211, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=14565 epoch 010: 524 / 1689 loss=3.828, nll_loss=2.304, ppl=4.94, wps=552968, ups=1.11, wpb=496666, bsz=16304.5, num_updates=15700, lr=0.000504754, gnorm=0.211, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=14565 epoch 010: 524 / 1689 loss=3.828, nll_loss=2.304, ppl=4.94, wps=552968, ups=1.11, wpb=496666, bsz=16304.5, num_updates=15700, lr=0.000504754, gnorm=0.211, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=14565 epoch 010: 524 / 1689 loss=3.828, nll_loss=2.304, ppl=4.94, wps=552968, ups=1.11, wpb=496666, bsz=16304.5, num_updates=15700, lr=0.000504754, gnorm=0.211, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=14565 epoch 010: 524 / 1689 loss=3.828, nll_loss=2.304, ppl=4.94, wps=552968, ups=1.11, wpb=496666, bsz=16304.5, num_updates=15700, lr=0.000504754, gnorm=0.211, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=14565 epoch 010: 524 / 1689 loss=3.828, nll_loss=2.304, ppl=4.94, wps=552968, ups=1.11, wpb=496666, bsz=16304.5, num_updates=15700, lr=0.000504754, gnorm=0.211, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=14565 epoch 010: 524 / 1689 loss=3.828, nll_loss=2.304, ppl=4.94, wps=552968, ups=1.11, wpb=496666, bsz=16304.5, num_updates=15700, lr=0.000504754, gnorm=0.211, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=14565 epoch 010: 524 / 1689 loss=3.828, nll_loss=2.304, ppl=4.94, wps=552968, ups=1.11, wpb=496666, bsz=16304.5, num_updates=15700, lr=0.000504754, gnorm=0.211, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=14565 epoch 010: 524 / 1689 loss=3.828, nll_loss=2.304, ppl=4.94, wps=552968, ups=1.11, wpb=496666, bsz=16304.5, num_updates=15700, lr=0.000504754, gnorm=0.211, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=14565 epoch 010: 524 / 1689 loss=3.828, nll_loss=2.304, ppl=4.94, wps=552968, ups=1.11, wpb=496666, bsz=16304.5, num_updates=15700, lr=0.000504754, gnorm=0.211, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=14565 epoch 010: 624 / 1689 loss=3.816, nll_loss=2.292, ppl=4.9, wps=552619, ups=1.12, wpb=495407, bsz=16805.3, num_updates=15800, lr=0.000503155, gnorm=0.219, clip=0, loss_scale=4, train_wall=88, gb_free=22.5, wall=14654 epoch 010: 624 / 1689 loss=3.816, nll_loss=2.292, ppl=4.9, wps=552619, ups=1.12, wpb=495407, bsz=16805.3, num_updates=15800, lr=0.000503155, gnorm=0.219, clip=0, loss_scale=4, train_wall=88, gb_free=22.5, wall=14654 epoch 010: 624 / 1689 loss=3.816, nll_loss=2.292, ppl=4.9, wps=552619, ups=1.12, wpb=495407, bsz=16805.3, num_updates=15800, lr=0.000503155, gnorm=0.219, clip=0, loss_scale=4, train_wall=88, gb_free=22.5, wall=14654 epoch 010: 624 / 1689 loss=3.816, nll_loss=2.292, ppl=4.9, wps=552619, ups=1.12, wpb=495407, bsz=16805.3, num_updates=15800, lr=0.000503155, gnorm=0.219, clip=0, loss_scale=4, train_wall=88, gb_free=22.5, wall=14654 epoch 010: 624 / 1689 loss=3.816, nll_loss=2.292, ppl=4.9, wps=552619, ups=1.12, wpb=495407, bsz=16805.3, num_updates=15800, lr=0.000503155, gnorm=0.219, clip=0, loss_scale=4, train_wall=88, gb_free=22.5, wall=14654 epoch 010: 624 / 1689 loss=3.816, nll_loss=2.292, ppl=4.9, wps=552619, ups=1.12, wpb=495407, bsz=16805.3, num_updates=15800, lr=0.000503155, gnorm=0.219, clip=0, loss_scale=4, train_wall=88, gb_free=22.5, wall=14654 epoch 010: 624 / 1689 loss=3.816, nll_loss=2.292, ppl=4.9, wps=552619, ups=1.12, wpb=495407, bsz=16805.3, num_updates=15800, lr=0.000503155, gnorm=0.219, clip=0, loss_scale=4, train_wall=88, gb_free=22.5, wall=14654 epoch 010: 624 / 1689 loss=3.816, nll_loss=2.292, ppl=4.9, wps=552619, ups=1.12, wpb=495407, bsz=16805.3, num_updates=15800, lr=0.000503155, gnorm=0.219, clip=0, loss_scale=4, train_wall=88, gb_free=22.5, wall=14654 epoch 010: 624 / 1689 loss=3.816, nll_loss=2.292, ppl=4.9, wps=552619, ups=1.12, wpb=495407, bsz=16805.3, num_updates=15800, lr=0.000503155, gnorm=0.219, clip=0, loss_scale=4, train_wall=88, gb_free=22.5, wall=14654 epoch 010: 624 / 1689 loss=3.816, nll_loss=2.292, ppl=4.9, wps=552619, ups=1.12, wpb=495407, bsz=16805.3, num_updates=15800, lr=0.000503155, gnorm=0.219, clip=0, loss_scale=4, train_wall=88, gb_free=22.5, wall=14654 epoch 010: 724 / 1689 loss=3.827, nll_loss=2.304, ppl=4.94, wps=551585, ups=1.11, wpb=495419, bsz=16520.5, num_updates=15900, lr=0.00050157, gnorm=0.214, clip=0, loss_scale=4, train_wall=88, gb_free=20.5, wall=14744 epoch 010: 724 / 1689 loss=3.827, nll_loss=2.304, ppl=4.94, wps=551585, ups=1.11, wpb=495419, bsz=16520.5, num_updates=15900, lr=0.00050157, gnorm=0.214, clip=0, loss_scale=4, train_wall=88, gb_free=20.5, wall=14744 epoch 010: 724 / 1689 loss=3.827, nll_loss=2.304, ppl=4.94, wps=551585, ups=1.11, wpb=495419, bsz=16520.5, num_updates=15900, lr=0.00050157, gnorm=0.214, clip=0, loss_scale=4, train_wall=88, gb_free=20.5, wall=14744 epoch 010: 724 / 1689 loss=3.827, nll_loss=2.304, ppl=4.94, wps=551585, ups=1.11, wpb=495419, bsz=16520.5, num_updates=15900, lr=0.00050157, gnorm=0.214, clip=0, loss_scale=4, train_wall=88, gb_free=20.5, wall=14744 epoch 010: 724 / 1689 loss=3.827, nll_loss=2.304, ppl=4.94, wps=551585, ups=1.11, wpb=495419, bsz=16520.5, num_updates=15900, lr=0.00050157, gnorm=0.214, clip=0, loss_scale=4, train_wall=88, gb_free=20.5, wall=14744 epoch 010: 724 / 1689 loss=3.827, nll_loss=2.304, ppl=4.94, wps=551585, ups=1.11, wpb=495419, bsz=16520.5, num_updates=15900, lr=0.00050157, gnorm=0.214, clip=0, loss_scale=4, train_wall=88, gb_free=20.5, wall=14744 epoch 010: 724 / 1689 loss=3.827, nll_loss=2.304, ppl=4.94, wps=551585, ups=1.11, wpb=495419, bsz=16520.5, num_updates=15900, lr=0.00050157, gnorm=0.214, clip=0, loss_scale=4, train_wall=88, gb_free=20.5, wall=14744 epoch 010: 724 / 1689 loss=3.827, nll_loss=2.304, ppl=4.94, wps=551585, ups=1.11, wpb=495419, bsz=16520.5, num_updates=15900, lr=0.00050157, gnorm=0.214, clip=0, loss_scale=4, train_wall=88, gb_free=20.5, wall=14744 epoch 010: 724 / 1689 loss=3.827, nll_loss=2.304, ppl=4.94, wps=551585, ups=1.11, wpb=495419, bsz=16520.5, num_updates=15900, lr=0.00050157, gnorm=0.214, clip=0, loss_scale=4, train_wall=88, gb_free=20.5, wall=14744 epoch 010: 724 / 1689 loss=3.827, nll_loss=2.304, ppl=4.94, wps=551585, ups=1.11, wpb=495419, bsz=16520.5, num_updates=15900, lr=0.00050157, gnorm=0.214, clip=0, loss_scale=4, train_wall=88, gb_free=20.5, wall=14744 epoch 010: 824 / 1689 loss=3.827, nll_loss=2.304, ppl=4.94, wps=550816, ups=1.11, wpb=495783, bsz=16543.2, num_updates=16000, lr=0.0005, gnorm=0.217, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=14834 epoch 010: 824 / 1689 loss=3.827, nll_loss=2.304, ppl=4.94, wps=550816, ups=1.11, wpb=495783, bsz=16543.2, num_updates=16000, lr=0.0005, gnorm=0.217, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=14834 epoch 010: 824 / 1689 loss=3.827, nll_loss=2.304, ppl=4.94, wps=550816, ups=1.11, wpb=495783, bsz=16543.2, num_updates=16000, lr=0.0005, gnorm=0.217, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=14834 epoch 010: 824 / 1689 loss=3.827, nll_loss=2.304, ppl=4.94, wps=550816, ups=1.11, wpb=495783, bsz=16543.2, num_updates=16000, lr=0.0005, gnorm=0.217, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=14834 epoch 010: 824 / 1689 loss=3.827, nll_loss=2.304, ppl=4.94, wps=550816, ups=1.11, wpb=495783, bsz=16543.2, num_updates=16000, lr=0.0005, gnorm=0.217, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=14834 epoch 010: 824 / 1689 loss=3.827, nll_loss=2.304, ppl=4.94, wps=550816, ups=1.11, wpb=495783, bsz=16543.2, num_updates=16000, lr=0.0005, gnorm=0.217, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=14834 epoch 010: 824 / 1689 loss=3.827, nll_loss=2.304, ppl=4.94, wps=550816, ups=1.11, wpb=495783, bsz=16543.2, num_updates=16000, lr=0.0005, gnorm=0.217, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=14834 epoch 010: 824 / 1689 loss=3.827, nll_loss=2.304, ppl=4.94, wps=550816, ups=1.11, wpb=495783, bsz=16543.2, num_updates=16000, lr=0.0005, gnorm=0.217, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=14834 epoch 010: 824 / 1689 loss=3.827, nll_loss=2.304, ppl=4.94, wps=550816, ups=1.11, wpb=495783, bsz=16543.2, num_updates=16000, lr=0.0005, gnorm=0.217, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=14834 epoch 010: 824 / 1689 loss=3.827, nll_loss=2.304, ppl=4.94, wps=550816, ups=1.11, wpb=495783, bsz=16543.2, num_updates=16000, lr=0.0005, gnorm=0.217, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=14834 begin validation on "valid" subset epoch 010 | valid on 'valid' subset | loss 3.802 | nll_loss 2.224 | ppl 4.67 | wps 0 | wpb 44526 | bsz 2008 | num_updates 16000 | best_loss 3.802 epoch 010 | valid on 'valid' subset | loss 3.802 | nll_loss 2.224 | ppl 4.67 | wps 0 | wpb 44526 | bsz 2008 | num_updates 16000 | best_loss 3.802 epoch 010 | valid on 'valid' subset | loss 3.802 | nll_loss 2.224 | ppl 4.67 | wps 0 | wpb 44526 | bsz 2008 | num_updates 16000 | best_loss 3.802 epoch 010 | valid on 'valid' subset | loss 3.802 | nll_loss 2.224 | ppl 4.67 | wps 0 | wpb 44526 | bsz 2008 | num_updates 16000 | best_loss 3.802 epoch 010 | valid on 'valid' subset | loss 3.802 | nll_loss 2.224 | ppl 4.67 | wps 0 | wpb 44526 | bsz 2008 | num_updates 16000 | best_loss 3.802 epoch 010 | valid on 'valid' subset | loss 3.802 | nll_loss 2.224 | ppl 4.67 | wps 0 | wpb 44526 | bsz 2008 | num_updates 16000 | best_loss 3.802 epoch 010 | valid on 'valid' subset | loss 3.802 | nll_loss 2.224 | ppl 4.67 | wps 0 | wpb 44526 | bsz 2008 | num_updates 16000 | best_loss 3.802 epoch 010 | valid on 'valid' subset | loss 3.802 | nll_loss 2.224 | ppl 4.67 | wps 0 | wpb 44526 | bsz 2008 | num_updates 16000 | best_loss 3.802 epoch 010 | valid on 'valid' subset | loss 3.802 | nll_loss 2.224 | ppl 4.67 | wps 0 | wpb 44526 | bsz 2008 | num_updates 16000 | best_loss 3.802 epoch 010 | valid on 'valid' subset | loss 3.802 | nll_loss 2.224 | ppl 4.67 | wps 0 | wpb 44526 | bsz 2008 | num_updates 16000 | best_loss 3.802 epoch 010: 924 / 1689 loss=3.824, nll_loss=2.301, ppl=4.93, wps=331358, ups=0.67, wpb=496619, bsz=16738.4, num_updates=16100, lr=0.000498445, gnorm=0.203, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=14984 epoch 010: 924 / 1689 loss=3.824, nll_loss=2.301, ppl=4.93, wps=331358, ups=0.67, wpb=496619, bsz=16738.4, num_updates=16100, lr=0.000498445, gnorm=0.203, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=14984 epoch 010: 924 / 1689 loss=3.824, nll_loss=2.301, ppl=4.93, wps=331358, ups=0.67, wpb=496619, bsz=16738.4, num_updates=16100, lr=0.000498445, gnorm=0.203, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=14984 epoch 010: 924 / 1689 loss=3.824, nll_loss=2.301, ppl=4.93, wps=331358, ups=0.67, wpb=496619, bsz=16738.4, num_updates=16100, lr=0.000498445, gnorm=0.203, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=14984 epoch 010: 924 / 1689 loss=3.824, nll_loss=2.301, ppl=4.93, wps=331358, ups=0.67, wpb=496619, bsz=16738.4, num_updates=16100, lr=0.000498445, gnorm=0.203, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=14984 epoch 010: 924 / 1689 loss=3.824, nll_loss=2.301, ppl=4.93, wps=331358, ups=0.67, wpb=496619, bsz=16738.4, num_updates=16100, lr=0.000498445, gnorm=0.203, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=14984 epoch 010: 924 / 1689 loss=3.824, nll_loss=2.301, ppl=4.93, wps=331358, ups=0.67, wpb=496619, bsz=16738.4, num_updates=16100, lr=0.000498445, gnorm=0.203, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=14984 epoch 010: 924 / 1689 loss=3.824, nll_loss=2.301, ppl=4.93, wps=331358, ups=0.67, wpb=496619, bsz=16738.4, num_updates=16100, lr=0.000498445, gnorm=0.203, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=14984 epoch 010: 924 / 1689 loss=3.824, nll_loss=2.301, ppl=4.93, wps=331358, ups=0.67, wpb=496619, bsz=16738.4, num_updates=16100, lr=0.000498445, gnorm=0.203, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=14984 epoch 010: 924 / 1689 loss=3.824, nll_loss=2.301, ppl=4.93, wps=331358, ups=0.67, wpb=496619, bsz=16738.4, num_updates=16100, lr=0.000498445, gnorm=0.203, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=14984 epoch 010: 1026 / 1689 loss=3.831, nll_loss=2.309, ppl=4.96, wps=547659, ups=1.1, wpb=495654, bsz=16610.8, num_updates=16200, lr=0.000496904, gnorm=0.209, clip=0, loss_scale=1, train_wall=90, gb_free=21, wall=15074 epoch 010: 1026 / 1689 loss=3.831, nll_loss=2.309, ppl=4.96, wps=547659, ups=1.1, wpb=495654, bsz=16610.8, num_updates=16200, lr=0.000496904, gnorm=0.209, clip=0, loss_scale=1, train_wall=90, gb_free=21, wall=15074 epoch 010: 1026 / 1689 loss=3.831, nll_loss=2.309, ppl=4.96, wps=547659, ups=1.1, wpb=495654, bsz=16610.8, num_updates=16200, lr=0.000496904, gnorm=0.209, clip=0, loss_scale=1, train_wall=90, gb_free=21, wall=15074 epoch 010: 1026 / 1689 loss=3.831, nll_loss=2.309, ppl=4.96, wps=547659, ups=1.1, wpb=495654, bsz=16610.8, num_updates=16200, lr=0.000496904, gnorm=0.209, clip=0, loss_scale=1, train_wall=90, gb_free=21, wall=15074 epoch 010: 1026 / 1689 loss=3.831, nll_loss=2.309, ppl=4.96, wps=547659, ups=1.1, wpb=495654, bsz=16610.8, num_updates=16200, lr=0.000496904, gnorm=0.209, clip=0, loss_scale=1, train_wall=90, gb_free=21, wall=15074 epoch 010: 1026 / 1689 loss=3.831, nll_loss=2.309, ppl=4.96, wps=547659, ups=1.1, wpb=495654, bsz=16610.8, num_updates=16200, lr=0.000496904, gnorm=0.209, clip=0, loss_scale=1, train_wall=90, gb_free=21, wall=15074 epoch 010: 1026 / 1689 loss=3.831, nll_loss=2.309, ppl=4.96, wps=547659, ups=1.1, wpb=495654, bsz=16610.8, num_updates=16200, lr=0.000496904, gnorm=0.209, clip=0, loss_scale=1, train_wall=90, gb_free=21, wall=15074 epoch 010: 1026 / 1689 loss=3.831, nll_loss=2.309, ppl=4.96, wps=547659, ups=1.1, wpb=495654, bsz=16610.8, num_updates=16200, lr=0.000496904, gnorm=0.209, clip=0, loss_scale=1, train_wall=90, gb_free=21, wall=15074 epoch 010: 1026 / 1689 loss=3.831, nll_loss=2.309, ppl=4.96, wps=547659, ups=1.1, wpb=495654, bsz=16610.8, num_updates=16200, lr=0.000496904, gnorm=0.209, clip=0, loss_scale=1, train_wall=90, gb_free=21, wall=15074 epoch 010: 1026 / 1689 loss=3.831, nll_loss=2.309, ppl=4.96, wps=547659, ups=1.1, wpb=495654, bsz=16610.8, num_updates=16200, lr=0.000496904, gnorm=0.209, clip=0, loss_scale=1, train_wall=90, gb_free=21, wall=15074 epoch 010: 1126 / 1689 loss=3.83, nll_loss=2.308, ppl=4.95, wps=556585, ups=1.13, wpb=494708, bsz=16381.8, num_updates=16300, lr=0.000495377, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=15163 epoch 010: 1126 / 1689 loss=3.83, nll_loss=2.308, ppl=4.95, wps=556585, ups=1.13, wpb=494708, bsz=16381.8, num_updates=16300, lr=0.000495377, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=15163 epoch 010: 1126 / 1689 loss=3.83, nll_loss=2.308, ppl=4.95, wps=556585, ups=1.13, wpb=494708, bsz=16381.8, num_updates=16300, lr=0.000495377, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=15163 epoch 010: 1126 / 1689 loss=3.83, nll_loss=2.308, ppl=4.95, wps=556585, ups=1.13, wpb=494708, bsz=16381.8, num_updates=16300, lr=0.000495377, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=15163 epoch 010: 1126 / 1689 loss=3.83, nll_loss=2.308, ppl=4.95, wps=556585, ups=1.13, wpb=494708, bsz=16381.8, num_updates=16300, lr=0.000495377, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=15163 epoch 010: 1126 / 1689 loss=3.83, nll_loss=2.308, ppl=4.95, wps=556585, ups=1.13, wpb=494708, bsz=16381.8, num_updates=16300, lr=0.000495377, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=15163 epoch 010: 1126 / 1689 loss=3.83, nll_loss=2.308, ppl=4.95, wps=556585, ups=1.13, wpb=494708, bsz=16381.8, num_updates=16300, lr=0.000495377, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=15163 epoch 010: 1126 / 1689 loss=3.83, nll_loss=2.308, ppl=4.95, wps=556585, ups=1.13, wpb=494708, bsz=16381.8, num_updates=16300, lr=0.000495377, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=15163 epoch 010: 1126 / 1689 loss=3.83, nll_loss=2.308, ppl=4.95, wps=556585, ups=1.13, wpb=494708, bsz=16381.8, num_updates=16300, lr=0.000495377, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=15163 epoch 010: 1126 / 1689 loss=3.83, nll_loss=2.308, ppl=4.95, wps=556585, ups=1.13, wpb=494708, bsz=16381.8, num_updates=16300, lr=0.000495377, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=15163 epoch 010: 1226 / 1689 loss=3.828, nll_loss=2.305, ppl=4.94, wps=552797, ups=1.12, wpb=495352, bsz=16436.3, num_updates=16400, lr=0.000493865, gnorm=0.21, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=15253 epoch 010: 1226 / 1689 loss=3.828, nll_loss=2.305, ppl=4.94, wps=552797, ups=1.12, wpb=495352, bsz=16436.3, num_updates=16400, lr=0.000493865, gnorm=0.21, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=15253 epoch 010: 1226 / 1689 loss=3.828, nll_loss=2.305, ppl=4.94, wps=552797, ups=1.12, wpb=495352, bsz=16436.3, num_updates=16400, lr=0.000493865, gnorm=0.21, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=15253 epoch 010: 1226 / 1689 loss=3.828, nll_loss=2.305, ppl=4.94, wps=552797, ups=1.12, wpb=495352, bsz=16436.3, num_updates=16400, lr=0.000493865, gnorm=0.21, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=15253 epoch 010: 1226 / 1689 loss=3.828, nll_loss=2.305, ppl=4.94, wps=552797, ups=1.12, wpb=495352, bsz=16436.3, num_updates=16400, lr=0.000493865, gnorm=0.21, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=15253 epoch 010: 1226 / 1689 loss=3.828, nll_loss=2.305, ppl=4.94, wps=552797, ups=1.12, wpb=495352, bsz=16436.3, num_updates=16400, lr=0.000493865, gnorm=0.21, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=15253 epoch 010: 1226 / 1689 loss=3.828, nll_loss=2.305, ppl=4.94, wps=552797, ups=1.12, wpb=495352, bsz=16436.3, num_updates=16400, lr=0.000493865, gnorm=0.21, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=15253 epoch 010: 1226 / 1689 loss=3.828, nll_loss=2.305, ppl=4.94, wps=552797, ups=1.12, wpb=495352, bsz=16436.3, num_updates=16400, lr=0.000493865, gnorm=0.21, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=15253 epoch 010: 1226 / 1689 loss=3.828, nll_loss=2.305, ppl=4.94, wps=552797, ups=1.12, wpb=495352, bsz=16436.3, num_updates=16400, lr=0.000493865, gnorm=0.21, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=15253 epoch 010: 1226 / 1689 loss=3.828, nll_loss=2.305, ppl=4.94, wps=552797, ups=1.12, wpb=495352, bsz=16436.3, num_updates=16400, lr=0.000493865, gnorm=0.21, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=15253 epoch 010: 1326 / 1689 loss=3.83, nll_loss=2.308, ppl=4.95, wps=553107, ups=1.12, wpb=494766, bsz=16597.4, num_updates=16500, lr=0.000492366, gnorm=0.206, clip=0, loss_scale=1, train_wall=88, gb_free=18.9, wall=15342 epoch 010: 1326 / 1689 loss=3.83, nll_loss=2.308, ppl=4.95, wps=553107, ups=1.12, wpb=494766, bsz=16597.4, num_updates=16500, lr=0.000492366, gnorm=0.206, clip=0, loss_scale=1, train_wall=88, gb_free=18.9, wall=15342 epoch 010: 1326 / 1689 loss=3.83, nll_loss=2.308, ppl=4.95, wps=553107, ups=1.12, wpb=494766, bsz=16597.4, num_updates=16500, lr=0.000492366, gnorm=0.206, clip=0, loss_scale=1, train_wall=88, gb_free=18.9, wall=15342 epoch 010: 1326 / 1689 loss=3.83, nll_loss=2.308, ppl=4.95, wps=553107, ups=1.12, wpb=494766, bsz=16597.4, num_updates=16500, lr=0.000492366, gnorm=0.206, clip=0, loss_scale=1, train_wall=88, gb_free=18.9, wall=15342 epoch 010: 1326 / 1689 loss=3.83, nll_loss=2.308, ppl=4.95, wps=553107, ups=1.12, wpb=494766, bsz=16597.4, num_updates=16500, lr=0.000492366, gnorm=0.206, clip=0, loss_scale=1, train_wall=88, gb_free=18.9, wall=15342 epoch 010: 1326 / 1689 loss=3.83, nll_loss=2.308, ppl=4.95, wps=553107, ups=1.12, wpb=494766, bsz=16597.4, num_updates=16500, lr=0.000492366, gnorm=0.206, clip=0, loss_scale=1, train_wall=88, gb_free=18.9, wall=15342 epoch 010: 1326 / 1689 loss=3.83, nll_loss=2.308, ppl=4.95, wps=553107, ups=1.12, wpb=494766, bsz=16597.4, num_updates=16500, lr=0.000492366, gnorm=0.206, clip=0, loss_scale=1, train_wall=88, gb_free=18.9, wall=15342 epoch 010: 1326 / 1689 loss=3.83, nll_loss=2.308, ppl=4.95, wps=553107, ups=1.12, wpb=494766, bsz=16597.4, num_updates=16500, lr=0.000492366, gnorm=0.206, clip=0, loss_scale=1, train_wall=88, gb_free=18.9, wall=15342 epoch 010: 1326 / 1689 loss=3.83, nll_loss=2.308, ppl=4.95, wps=553107, ups=1.12, wpb=494766, bsz=16597.4, num_updates=16500, lr=0.000492366, gnorm=0.206, clip=0, loss_scale=1, train_wall=88, gb_free=18.9, wall=15342 epoch 010: 1326 / 1689 loss=3.83, nll_loss=2.308, ppl=4.95, wps=553107, ups=1.12, wpb=494766, bsz=16597.4, num_updates=16500, lr=0.000492366, gnorm=0.206, clip=0, loss_scale=1, train_wall=88, gb_free=18.9, wall=15342 epoch 010: 1426 / 1689 loss=3.824, nll_loss=2.302, ppl=4.93, wps=552402, ups=1.11, wpb=495668, bsz=16556.7, num_updates=16600, lr=0.000490881, gnorm=0.214, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=15432 epoch 010: 1426 / 1689 loss=3.824, nll_loss=2.302, ppl=4.93, wps=552402, ups=1.11, wpb=495668, bsz=16556.7, num_updates=16600, lr=0.000490881, gnorm=0.214, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=15432 epoch 010: 1426 / 1689 loss=3.824, nll_loss=2.302, ppl=4.93, wps=552402, ups=1.11, wpb=495668, bsz=16556.7, num_updates=16600, lr=0.000490881, gnorm=0.214, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=15432 epoch 010: 1426 / 1689 loss=3.824, nll_loss=2.302, ppl=4.93, wps=552402, ups=1.11, wpb=495668, bsz=16556.7, num_updates=16600, lr=0.000490881, gnorm=0.214, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=15432 epoch 010: 1426 / 1689 loss=3.824, nll_loss=2.302, ppl=4.93, wps=552402, ups=1.11, wpb=495668, bsz=16556.7, num_updates=16600, lr=0.000490881, gnorm=0.214, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=15432 epoch 010: 1426 / 1689 loss=3.824, nll_loss=2.302, ppl=4.93, wps=552402, ups=1.11, wpb=495668, bsz=16556.7, num_updates=16600, lr=0.000490881, gnorm=0.214, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=15432 epoch 010: 1426 / 1689 loss=3.824, nll_loss=2.302, ppl=4.93, wps=552402, ups=1.11, wpb=495668, bsz=16556.7, num_updates=16600, lr=0.000490881, gnorm=0.214, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=15432 epoch 010: 1426 / 1689 loss=3.824, nll_loss=2.302, ppl=4.93, wps=552402, ups=1.11, wpb=495668, bsz=16556.7, num_updates=16600, lr=0.000490881, gnorm=0.214, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=15432 epoch 010: 1426 / 1689 loss=3.824, nll_loss=2.302, ppl=4.93, wps=552402, ups=1.11, wpb=495668, bsz=16556.7, num_updates=16600, lr=0.000490881, gnorm=0.214, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=15432 epoch 010: 1426 / 1689 loss=3.824, nll_loss=2.302, ppl=4.93, wps=552402, ups=1.11, wpb=495668, bsz=16556.7, num_updates=16600, lr=0.000490881, gnorm=0.214, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=15432 epoch 010: 1526 / 1689 loss=3.83, nll_loss=2.308, ppl=4.95, wps=552499, ups=1.12, wpb=493539, bsz=16402.2, num_updates=16700, lr=0.000489409, gnorm=0.203, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=15521 epoch 010: 1526 / 1689 loss=3.83, nll_loss=2.308, ppl=4.95, wps=552499, ups=1.12, wpb=493539, bsz=16402.2, num_updates=16700, lr=0.000489409, gnorm=0.203, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=15521 epoch 010: 1526 / 1689 loss=3.83, nll_loss=2.308, ppl=4.95, wps=552499, ups=1.12, wpb=493539, bsz=16402.2, num_updates=16700, lr=0.000489409, gnorm=0.203, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=15521 epoch 010: 1526 / 1689 loss=3.83, nll_loss=2.308, ppl=4.95, wps=552499, ups=1.12, wpb=493539, bsz=16402.2, num_updates=16700, lr=0.000489409, gnorm=0.203, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=15521 epoch 010: 1526 / 1689 loss=3.83, nll_loss=2.308, ppl=4.95, wps=552499, ups=1.12, wpb=493539, bsz=16402.2, num_updates=16700, lr=0.000489409, gnorm=0.203, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=15521 epoch 010: 1526 / 1689 loss=3.83, nll_loss=2.308, ppl=4.95, wps=552499, ups=1.12, wpb=493539, bsz=16402.2, num_updates=16700, lr=0.000489409, gnorm=0.203, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=15521 epoch 010: 1526 / 1689 loss=3.83, nll_loss=2.308, ppl=4.95, wps=552499, ups=1.12, wpb=493539, bsz=16402.2, num_updates=16700, lr=0.000489409, gnorm=0.203, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=15521 epoch 010: 1526 / 1689 loss=3.83, nll_loss=2.308, ppl=4.95, wps=552499, ups=1.12, wpb=493539, bsz=16402.2, num_updates=16700, lr=0.000489409, gnorm=0.203, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=15521 epoch 010: 1526 / 1689 loss=3.83, nll_loss=2.308, ppl=4.95, wps=552499, ups=1.12, wpb=493539, bsz=16402.2, num_updates=16700, lr=0.000489409, gnorm=0.203, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=15521 epoch 010: 1526 / 1689 loss=3.83, nll_loss=2.308, ppl=4.95, wps=552499, ups=1.12, wpb=493539, bsz=16402.2, num_updates=16700, lr=0.000489409, gnorm=0.203, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=15521 epoch 010: 1627 / 1689 loss=3.82, nll_loss=2.297, ppl=4.91, wps=546942, ups=1.1, wpb=495585, bsz=16314.5, num_updates=16800, lr=0.00048795, gnorm=0.202, clip=0, loss_scale=1, train_wall=89, gb_free=17.8, wall=15612 epoch 010: 1627 / 1689 loss=3.82, nll_loss=2.297, ppl=4.91, wps=546942, ups=1.1, wpb=495585, bsz=16314.5, num_updates=16800, lr=0.00048795, gnorm=0.202, clip=0, loss_scale=1, train_wall=89, gb_free=17.8, wall=15612 epoch 010: 1627 / 1689 loss=3.82, nll_loss=2.297, ppl=4.91, wps=546942, ups=1.1, wpb=495585, bsz=16314.5, num_updates=16800, lr=0.00048795, gnorm=0.202, clip=0, loss_scale=1, train_wall=89, gb_free=17.8, wall=15612 epoch 010: 1627 / 1689 loss=3.82, nll_loss=2.297, ppl=4.91, wps=546942, ups=1.1, wpb=495585, bsz=16314.5, num_updates=16800, lr=0.00048795, gnorm=0.202, clip=0, loss_scale=1, train_wall=89, gb_free=17.8, wall=15612 epoch 010: 1627 / 1689 loss=3.82, nll_loss=2.297, ppl=4.91, wps=546942, ups=1.1, wpb=495585, bsz=16314.5, num_updates=16800, lr=0.00048795, gnorm=0.202, clip=0, loss_scale=1, train_wall=89, gb_free=17.8, wall=15612 epoch 010: 1627 / 1689 loss=3.82, nll_loss=2.297, ppl=4.91, wps=546942, ups=1.1, wpb=495585, bsz=16314.5, num_updates=16800, lr=0.00048795, gnorm=0.202, clip=0, loss_scale=1, train_wall=89, gb_free=17.8, wall=15612 epoch 010: 1627 / 1689 loss=3.82, nll_loss=2.297, ppl=4.91, wps=546942, ups=1.1, wpb=495585, bsz=16314.5, num_updates=16800, lr=0.00048795, gnorm=0.202, clip=0, loss_scale=1, train_wall=89, gb_free=17.8, wall=15612 epoch 010: 1627 / 1689 loss=3.82, nll_loss=2.297, ppl=4.91, wps=546942, ups=1.1, wpb=495585, bsz=16314.5, num_updates=16800, lr=0.00048795, gnorm=0.202, clip=0, loss_scale=1, train_wall=89, gb_free=17.8, wall=15612 epoch 010: 1627 / 1689 loss=3.82, nll_loss=2.297, ppl=4.91, wps=546942, ups=1.1, wpb=495585, bsz=16314.5, num_updates=16800, lr=0.00048795, gnorm=0.202, clip=0, loss_scale=1, train_wall=89, gb_free=17.8, wall=15612 epoch 010: 1627 / 1689 loss=3.82, nll_loss=2.297, ppl=4.91, wps=546942, ups=1.1, wpb=495585, bsz=16314.5, num_updates=16800, lr=0.00048795, gnorm=0.202, clip=0, loss_scale=1, train_wall=89, gb_free=17.8, wall=15612 end of epoch 10 (average epoch stats below) epoch 010 | loss 3.825 | nll_loss 2.302 | ppl 4.93 | wps 528961 | ups 1.07 | wpb 495129 | bsz 16507.1 | num_updates 16862 | lr 0.000487052 | gnorm 0.209 | clip 0 | loss_scale 1 | train_wall 1495 | gb_free 23.4 | wall 15667 epoch 010 | loss 3.825 | nll_loss 2.302 | ppl 4.93 | wps 528961 | ups 1.07 | wpb 495129 | bsz 16507.1 | num_updates 16862 | lr 0.000487052 | gnorm 0.209 | clip 0 | loss_scale 1 | train_wall 1495 | gb_free 23.4 | wall 15667 epoch 010 | loss 3.825 | nll_loss 2.302 | ppl 4.93 | wps 528961 | ups 1.07 | wpb 495129 | bsz 16507.1 | num_updates 16862 | lr 0.000487052 | gnorm 0.209 | clip 0 | loss_scale 1 | train_wall 1495 | gb_free 23.4 | wall 15667 epoch 010 | loss 3.825 | nll_loss 2.302 | ppl 4.93 | wps 528961 | ups 1.07 | wpb 495129 | bsz 16507.1 | num_updates 16862 | lr 0.000487052 | gnorm 0.209 | clip 0 | loss_scale 1 | train_wall 1495 | gb_free 23.4 | wall 15667 epoch 010 | loss 3.825 | nll_loss 2.302 | ppl 4.93 | wps 528961 | ups 1.07 | wpb 495129 | bsz 16507.1 | num_updates 16862 | lr 0.000487052 | gnorm 0.209 | clip 0 | loss_scale 1 | train_wall 1495 | gb_free 23.4 | wall 15667 epoch 010 | loss 3.825 | nll_loss 2.302 | ppl 4.93 | wps 528961 | ups 1.07 | wpb 495129 | bsz 16507.1 | num_updates 16862 | lr 0.000487052 | gnorm 0.209 | clip 0 | loss_scale 1 | train_wall 1495 | gb_free 23.4 | wall 15667 epoch 010 | loss 3.825 | nll_loss 2.302 | ppl 4.93 | wps 528961 | ups 1.07 | wpb 495129 | bsz 16507.1 | num_updates 16862 | lr 0.000487052 | gnorm 0.209 | clip 0 | loss_scale 1 | train_wall 1495 | gb_free 23.4 | wall 15667 epoch 010 | loss 3.825 | nll_loss 2.302 | ppl 4.93 | wps 528961 | ups 1.07 | wpb 495129 | bsz 16507.1 | num_updates 16862 | lr 0.000487052 | gnorm 0.209 | clip 0 | loss_scale 1 | train_wall 1495 | gb_free 23.4 | wall 15667 epoch 010 | loss 3.825 | nll_loss 2.302 | ppl 4.93 | wps 528961 | ups 1.07 | wpb 495129 | bsz 16507.1 | num_updates 16862 | lr 0.000487052 | gnorm 0.209 | clip 0 | loss_scale 1 | train_wall 1495 | gb_free 23.4 | wall 15667 epoch 010 | loss 3.825 | nll_loss 2.302 | ppl 4.93 | wps 528961 | ups 1.07 | wpb 495129 | bsz 16507.1 | num_updates 16862 | lr 0.000487052 | gnorm 0.209 | clip 0 | loss_scale 1 | train_wall 1495 | gb_free 23.4 | wall 15667 Start iterating over samples epoch 011: 38 / 1689 loss=3.811, nll_loss=2.287, ppl=4.88, wps=540458, ups=1.1, wpb=490748, bsz=16033.4, num_updates=16900, lr=0.000486504, gnorm=0.204, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=15703 epoch 011: 38 / 1689 loss=3.811, nll_loss=2.287, ppl=4.88, wps=540458, ups=1.1, wpb=490748, bsz=16033.4, num_updates=16900, lr=0.000486504, gnorm=0.204, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=15703 epoch 011: 38 / 1689 loss=3.811, nll_loss=2.287, ppl=4.88, wps=540458, ups=1.1, wpb=490748, bsz=16033.4, num_updates=16900, lr=0.000486504, gnorm=0.204, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=15703 epoch 011: 38 / 1689 loss=3.811, nll_loss=2.287, ppl=4.88, wps=540458, ups=1.1, wpb=490748, bsz=16033.4, num_updates=16900, lr=0.000486504, gnorm=0.204, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=15703 epoch 011: 38 / 1689 loss=3.811, nll_loss=2.287, ppl=4.88, wps=540458, ups=1.1, wpb=490748, bsz=16033.4, num_updates=16900, lr=0.000486504, gnorm=0.204, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=15703 epoch 011: 38 / 1689 loss=3.811, nll_loss=2.287, ppl=4.88, wps=540458, ups=1.1, wpb=490748, bsz=16033.4, num_updates=16900, lr=0.000486504, gnorm=0.204, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=15703 epoch 011: 38 / 1689 loss=3.811, nll_loss=2.287, ppl=4.88, wps=540458, ups=1.1, wpb=490748, bsz=16033.4, num_updates=16900, lr=0.000486504, gnorm=0.204, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=15703 epoch 011: 38 / 1689 loss=3.811, nll_loss=2.287, ppl=4.88, wps=540458, ups=1.1, wpb=490748, bsz=16033.4, num_updates=16900, lr=0.000486504, gnorm=0.204, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=15703 epoch 011: 38 / 1689 loss=3.811, nll_loss=2.287, ppl=4.88, wps=540458, ups=1.1, wpb=490748, bsz=16033.4, num_updates=16900, lr=0.000486504, gnorm=0.204, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=15703 epoch 011: 38 / 1689 loss=3.811, nll_loss=2.287, ppl=4.88, wps=540458, ups=1.1, wpb=490748, bsz=16033.4, num_updates=16900, lr=0.000486504, gnorm=0.204, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=15703 epoch 011: 38 / 1689 loss=3.811, nll_loss=2.287, ppl=4.88, wps=540458, ups=1.1, wpb=490748, bsz=16033.4, num_updates=16900, lr=0.000486504, gnorm=0.204, clip=0, loss_scale=1, train_wall=87, gb_free=21.8, wall=15703 epoch 011: 138 / 1689 loss=3.802, nll_loss=2.276, ppl=4.84, wps=550847, ups=1.11, wpb=496248, bsz=16711, num_updates=17000, lr=0.000485071, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=15793 epoch 011: 138 / 1689 loss=3.802, nll_loss=2.276, ppl=4.84, wps=550847, ups=1.11, wpb=496248, bsz=16711, num_updates=17000, lr=0.000485071, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=15793 epoch 011: 138 / 1689 loss=3.802, nll_loss=2.276, ppl=4.84, wps=550847, ups=1.11, wpb=496248, bsz=16711, num_updates=17000, lr=0.000485071, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=15793 epoch 011: 138 / 1689 loss=3.802, nll_loss=2.276, ppl=4.84, wps=550847, ups=1.11, wpb=496248, bsz=16711, num_updates=17000, lr=0.000485071, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=15793 epoch 011: 138 / 1689 loss=3.802, nll_loss=2.276, ppl=4.84, wps=550847, ups=1.11, wpb=496248, bsz=16711, num_updates=17000, lr=0.000485071, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=15793 epoch 011: 138 / 1689 loss=3.802, nll_loss=2.276, ppl=4.84, wps=550847, ups=1.11, wpb=496248, bsz=16711, num_updates=17000, lr=0.000485071, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=15793 epoch 011: 138 / 1689 loss=3.802, nll_loss=2.276, ppl=4.84, wps=550847, ups=1.11, wpb=496248, bsz=16711, num_updates=17000, lr=0.000485071, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=15793 epoch 011: 138 / 1689 loss=3.802, nll_loss=2.276, ppl=4.84, wps=550847, ups=1.11, wpb=496248, bsz=16711, num_updates=17000, lr=0.000485071, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=15793 epoch 011: 138 / 1689 loss=3.802, nll_loss=2.276, ppl=4.84, wps=550847, ups=1.11, wpb=496248, bsz=16711, num_updates=17000, lr=0.000485071, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=15793 epoch 011: 138 / 1689 loss=3.802, nll_loss=2.276, ppl=4.84, wps=550847, ups=1.11, wpb=496248, bsz=16711, num_updates=17000, lr=0.000485071, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=15793 epoch 011: 138 / 1689 loss=3.802, nll_loss=2.276, ppl=4.84, wps=550847, ups=1.11, wpb=496248, bsz=16711, num_updates=17000, lr=0.000485071, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=15793 begin validation on "valid" subset epoch 011 | valid on 'valid' subset | loss 3.811 | nll_loss 2.247 | ppl 4.75 | wps 0 | wpb 44526 | bsz 2008 | num_updates 17000 | best_loss 3.802 epoch 011 | valid on 'valid' subset | loss 3.811 | nll_loss 2.247 | ppl 4.75 | wps 0 | wpb 44526 | bsz 2008 | num_updates 17000 | best_loss 3.802 epoch 011 | valid on 'valid' subset | loss 3.811 | nll_loss 2.247 | ppl 4.75 | wps 0 | wpb 44526 | bsz 2008 | num_updates 17000 | best_loss 3.802 epoch 011 | valid on 'valid' subset | loss 3.811 | nll_loss 2.247 | ppl 4.75 | wps 0 | wpb 44526 | bsz 2008 | num_updates 17000 | best_loss 3.802 epoch 011 | valid on 'valid' subset | loss 3.811 | nll_loss 2.247 | ppl 4.75 | wps 0 | wpb 44526 | bsz 2008 | num_updates 17000 | best_loss 3.802 epoch 011 | valid on 'valid' subset | loss 3.811 | nll_loss 2.247 | ppl 4.75 | wps 0 | wpb 44526 | bsz 2008 | num_updates 17000 | best_loss 3.802 epoch 011 | valid on 'valid' subset | loss 3.811 | nll_loss 2.247 | ppl 4.75 | wps 0 | wpb 44526 | bsz 2008 | num_updates 17000 | best_loss 3.802 epoch 011 | valid on 'valid' subset | loss 3.811 | nll_loss 2.247 | ppl 4.75 | wps 0 | wpb 44526 | bsz 2008 | num_updates 17000 | best_loss 3.802 epoch 011 | valid on 'valid' subset | loss 3.811 | nll_loss 2.247 | ppl 4.75 | wps 0 | wpb 44526 | bsz 2008 | num_updates 17000 | best_loss 3.802 epoch 011 | valid on 'valid' subset | loss 3.811 | nll_loss 2.247 | ppl 4.75 | wps 0 | wpb 44526 | bsz 2008 | num_updates 17000 | best_loss 3.802 epoch 011 | valid on 'valid' subset | loss 3.811 | nll_loss 2.247 | ppl 4.75 | wps 0 | wpb 44526 | bsz 2008 | num_updates 17000 | best_loss 3.802 epoch 011: 238 / 1689 loss=3.8, nll_loss=2.274, ppl=4.84, wps=480302, ups=0.97, wpb=495591, bsz=16546.2, num_updates=17100, lr=0.000483651, gnorm=0.203, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=15896 epoch 011: 238 / 1689 loss=3.8, nll_loss=2.274, ppl=4.84, wps=480302, ups=0.97, wpb=495591, bsz=16546.2, num_updates=17100, lr=0.000483651, gnorm=0.203, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=15896 epoch 011: 238 / 1689 loss=3.8, nll_loss=2.274, ppl=4.84, wps=480302, ups=0.97, wpb=495591, bsz=16546.2, num_updates=17100, lr=0.000483651, gnorm=0.203, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=15896 epoch 011: 238 / 1689 loss=3.8, nll_loss=2.274, ppl=4.84, wps=480302, ups=0.97, wpb=495591, bsz=16546.2, num_updates=17100, lr=0.000483651, gnorm=0.203, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=15896 epoch 011: 238 / 1689 loss=3.8, nll_loss=2.274, ppl=4.84, wps=480302, ups=0.97, wpb=495591, bsz=16546.2, num_updates=17100, lr=0.000483651, gnorm=0.203, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=15896 epoch 011: 238 / 1689 loss=3.8, nll_loss=2.274, ppl=4.84, wps=480302, ups=0.97, wpb=495591, bsz=16546.2, num_updates=17100, lr=0.000483651, gnorm=0.203, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=15896 epoch 011: 238 / 1689 loss=3.8, nll_loss=2.274, ppl=4.84, wps=480302, ups=0.97, wpb=495591, bsz=16546.2, num_updates=17100, lr=0.000483651, gnorm=0.203, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=15896 epoch 011: 238 / 1689 loss=3.8, nll_loss=2.274, ppl=4.84, wps=480302, ups=0.97, wpb=495591, bsz=16546.2, num_updates=17100, lr=0.000483651, gnorm=0.203, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=15896 epoch 011: 238 / 1689 loss=3.8, nll_loss=2.274, ppl=4.84, wps=480302, ups=0.97, wpb=495591, bsz=16546.2, num_updates=17100, lr=0.000483651, gnorm=0.203, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=15896 epoch 011: 238 / 1689 loss=3.8, nll_loss=2.274, ppl=4.84, wps=480302, ups=0.97, wpb=495591, bsz=16546.2, num_updates=17100, lr=0.000483651, gnorm=0.203, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=15896 epoch 011: 238 / 1689 loss=3.8, nll_loss=2.274, ppl=4.84, wps=480302, ups=0.97, wpb=495591, bsz=16546.2, num_updates=17100, lr=0.000483651, gnorm=0.203, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=15896 epoch 011: 338 / 1689 loss=3.808, nll_loss=2.283, ppl=4.87, wps=552535, ups=1.11, wpb=496032, bsz=16920.6, num_updates=17200, lr=0.000482243, gnorm=0.205, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=15986 epoch 011: 338 / 1689 loss=3.808, nll_loss=2.283, ppl=4.87, wps=552535, ups=1.11, wpb=496032, bsz=16920.6, num_updates=17200, lr=0.000482243, gnorm=0.205, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=15986 epoch 011: 338 / 1689 loss=3.808, nll_loss=2.283, ppl=4.87, wps=552535, ups=1.11, wpb=496032, bsz=16920.6, num_updates=17200, lr=0.000482243, gnorm=0.205, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=15986 epoch 011: 338 / 1689 loss=3.808, nll_loss=2.283, ppl=4.87, wps=552535, ups=1.11, wpb=496032, bsz=16920.6, num_updates=17200, lr=0.000482243, gnorm=0.205, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=15986 epoch 011: 338 / 1689 loss=3.808, nll_loss=2.283, ppl=4.87, wps=552535, ups=1.11, wpb=496032, bsz=16920.6, num_updates=17200, lr=0.000482243, gnorm=0.205, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=15986 epoch 011: 338 / 1689 loss=3.808, nll_loss=2.283, ppl=4.87, wps=552535, ups=1.11, wpb=496032, bsz=16920.6, num_updates=17200, lr=0.000482243, gnorm=0.205, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=15986 epoch 011: 338 / 1689 loss=3.808, nll_loss=2.283, ppl=4.87, wps=552535, ups=1.11, wpb=496032, bsz=16920.6, num_updates=17200, lr=0.000482243, gnorm=0.205, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=15986 epoch 011: 338 / 1689 loss=3.808, nll_loss=2.283, ppl=4.87, wps=552535, ups=1.11, wpb=496032, bsz=16920.6, num_updates=17200, lr=0.000482243, gnorm=0.205, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=15986 epoch 011: 338 / 1689 loss=3.808, nll_loss=2.283, ppl=4.87, wps=552535, ups=1.11, wpb=496032, bsz=16920.6, num_updates=17200, lr=0.000482243, gnorm=0.205, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=15986 epoch 011: 338 / 1689 loss=3.808, nll_loss=2.283, ppl=4.87, wps=552535, ups=1.11, wpb=496032, bsz=16920.6, num_updates=17200, lr=0.000482243, gnorm=0.205, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=15986 epoch 011: 338 / 1689 loss=3.808, nll_loss=2.283, ppl=4.87, wps=552535, ups=1.11, wpb=496032, bsz=16920.6, num_updates=17200, lr=0.000482243, gnorm=0.205, clip=0, loss_scale=1, train_wall=88, gb_free=22.1, wall=15986 epoch 011: 438 / 1689 loss=3.817, nll_loss=2.293, ppl=4.9, wps=556201, ups=1.13, wpb=493742, bsz=16387.4, num_updates=17300, lr=0.000480847, gnorm=0.208, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=16075 epoch 011: 438 / 1689 loss=3.817, nll_loss=2.293, ppl=4.9, wps=556201, ups=1.13, wpb=493742, bsz=16387.4, num_updates=17300, lr=0.000480847, gnorm=0.208, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=16075 epoch 011: 438 / 1689 loss=3.817, nll_loss=2.293, ppl=4.9, wps=556201, ups=1.13, wpb=493742, bsz=16387.4, num_updates=17300, lr=0.000480847, gnorm=0.208, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=16075 epoch 011: 438 / 1689 loss=3.817, nll_loss=2.293, ppl=4.9, wps=556201, ups=1.13, wpb=493742, bsz=16387.4, num_updates=17300, lr=0.000480847, gnorm=0.208, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=16075 epoch 011: 438 / 1689 loss=3.817, nll_loss=2.293, ppl=4.9, wps=556201, ups=1.13, wpb=493742, bsz=16387.4, num_updates=17300, lr=0.000480847, gnorm=0.208, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=16075 epoch 011: 438 / 1689 loss=3.817, nll_loss=2.293, ppl=4.9, wps=556201, ups=1.13, wpb=493742, bsz=16387.4, num_updates=17300, lr=0.000480847, gnorm=0.208, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=16075 epoch 011: 438 / 1689 loss=3.817, nll_loss=2.293, ppl=4.9, wps=556201, ups=1.13, wpb=493742, bsz=16387.4, num_updates=17300, lr=0.000480847, gnorm=0.208, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=16075 epoch 011: 438 / 1689 loss=3.817, nll_loss=2.293, ppl=4.9, wps=556201, ups=1.13, wpb=493742, bsz=16387.4, num_updates=17300, lr=0.000480847, gnorm=0.208, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=16075 epoch 011: 438 / 1689 loss=3.817, nll_loss=2.293, ppl=4.9, wps=556201, ups=1.13, wpb=493742, bsz=16387.4, num_updates=17300, lr=0.000480847, gnorm=0.208, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=16075 epoch 011: 438 / 1689 loss=3.817, nll_loss=2.293, ppl=4.9, wps=556201, ups=1.13, wpb=493742, bsz=16387.4, num_updates=17300, lr=0.000480847, gnorm=0.208, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=16075 epoch 011: 438 / 1689 loss=3.817, nll_loss=2.293, ppl=4.9, wps=556201, ups=1.13, wpb=493742, bsz=16387.4, num_updates=17300, lr=0.000480847, gnorm=0.208, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=16075 epoch 011: 538 / 1689 loss=3.817, nll_loss=2.293, ppl=4.9, wps=552062, ups=1.12, wpb=493661, bsz=16641.8, num_updates=17400, lr=0.000479463, gnorm=0.204, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=16164 epoch 011: 538 / 1689 loss=3.817, nll_loss=2.293, ppl=4.9, wps=552062, ups=1.12, wpb=493661, bsz=16641.8, num_updates=17400, lr=0.000479463, gnorm=0.204, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=16164 epoch 011: 538 / 1689 loss=3.817, nll_loss=2.293, ppl=4.9, wps=552062, ups=1.12, wpb=493661, bsz=16641.8, num_updates=17400, lr=0.000479463, gnorm=0.204, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=16164 epoch 011: 538 / 1689 loss=3.817, nll_loss=2.293, ppl=4.9, wps=552062, ups=1.12, wpb=493661, bsz=16641.8, num_updates=17400, lr=0.000479463, gnorm=0.204, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=16164 epoch 011: 538 / 1689 loss=3.817, nll_loss=2.293, ppl=4.9, wps=552062, ups=1.12, wpb=493661, bsz=16641.8, num_updates=17400, lr=0.000479463, gnorm=0.204, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=16164 epoch 011: 538 / 1689 loss=3.817, nll_loss=2.293, ppl=4.9, wps=552062, ups=1.12, wpb=493661, bsz=16641.8, num_updates=17400, lr=0.000479463, gnorm=0.204, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=16164 epoch 011: 538 / 1689 loss=3.817, nll_loss=2.293, ppl=4.9, wps=552062, ups=1.12, wpb=493661, bsz=16641.8, num_updates=17400, lr=0.000479463, gnorm=0.204, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=16164 epoch 011: 538 / 1689 loss=3.817, nll_loss=2.293, ppl=4.9, wps=552062, ups=1.12, wpb=493661, bsz=16641.8, num_updates=17400, lr=0.000479463, gnorm=0.204, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=16164 epoch 011: 538 / 1689 loss=3.817, nll_loss=2.293, ppl=4.9, wps=552062, ups=1.12, wpb=493661, bsz=16641.8, num_updates=17400, lr=0.000479463, gnorm=0.204, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=16164 epoch 011: 538 / 1689 loss=3.817, nll_loss=2.293, ppl=4.9, wps=552062, ups=1.12, wpb=493661, bsz=16641.8, num_updates=17400, lr=0.000479463, gnorm=0.204, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=16164 epoch 011: 538 / 1689 loss=3.817, nll_loss=2.293, ppl=4.9, wps=552062, ups=1.12, wpb=493661, bsz=16641.8, num_updates=17400, lr=0.000479463, gnorm=0.204, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=16164 epoch 011: 638 / 1689 loss=3.81, nll_loss=2.285, ppl=4.87, wps=558678, ups=1.12, wpb=496616, bsz=16252.6, num_updates=17500, lr=0.000478091, gnorm=0.203, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=16253 epoch 011: 638 / 1689 loss=3.81, nll_loss=2.285, ppl=4.87, wps=558678, ups=1.12, wpb=496616, bsz=16252.6, num_updates=17500, lr=0.000478091, gnorm=0.203, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=16253 epoch 011: 638 / 1689 loss=3.81, nll_loss=2.285, ppl=4.87, wps=558678, ups=1.12, wpb=496616, bsz=16252.6, num_updates=17500, lr=0.000478091, gnorm=0.203, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=16253 epoch 011: 638 / 1689 loss=3.81, nll_loss=2.285, ppl=4.87, wps=558678, ups=1.12, wpb=496616, bsz=16252.6, num_updates=17500, lr=0.000478091, gnorm=0.203, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=16253 epoch 011: 638 / 1689 loss=3.81, nll_loss=2.285, ppl=4.87, wps=558678, ups=1.12, wpb=496616, bsz=16252.6, num_updates=17500, lr=0.000478091, gnorm=0.203, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=16253 epoch 011: 638 / 1689 loss=3.81, nll_loss=2.285, ppl=4.87, wps=558678, ups=1.12, wpb=496616, bsz=16252.6, num_updates=17500, lr=0.000478091, gnorm=0.203, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=16253 epoch 011: 638 / 1689 loss=3.81, nll_loss=2.285, ppl=4.87, wps=558678, ups=1.12, wpb=496616, bsz=16252.6, num_updates=17500, lr=0.000478091, gnorm=0.203, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=16253 epoch 011: 638 / 1689 loss=3.81, nll_loss=2.285, ppl=4.87, wps=558678, ups=1.12, wpb=496616, bsz=16252.6, num_updates=17500, lr=0.000478091, gnorm=0.203, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=16253 epoch 011: 638 / 1689 loss=3.81, nll_loss=2.285, ppl=4.87, wps=558678, ups=1.12, wpb=496616, bsz=16252.6, num_updates=17500, lr=0.000478091, gnorm=0.203, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=16253 epoch 011: 638 / 1689 loss=3.81, nll_loss=2.285, ppl=4.87, wps=558678, ups=1.12, wpb=496616, bsz=16252.6, num_updates=17500, lr=0.000478091, gnorm=0.203, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=16253 epoch 011: 638 / 1689 loss=3.81, nll_loss=2.285, ppl=4.87, wps=558678, ups=1.12, wpb=496616, bsz=16252.6, num_updates=17500, lr=0.000478091, gnorm=0.203, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=16253 epoch 011: 738 / 1689 loss=3.81, nll_loss=2.286, ppl=4.88, wps=550290, ups=1.11, wpb=494684, bsz=16508.2, num_updates=17600, lr=0.000476731, gnorm=0.212, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=16343 epoch 011: 738 / 1689 loss=3.81, nll_loss=2.286, ppl=4.88, wps=550290, ups=1.11, wpb=494684, bsz=16508.2, num_updates=17600, lr=0.000476731, gnorm=0.212, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=16343 epoch 011: 738 / 1689 loss=3.81, nll_loss=2.286, ppl=4.88, wps=550290, ups=1.11, wpb=494684, bsz=16508.2, num_updates=17600, lr=0.000476731, gnorm=0.212, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=16343 epoch 011: 738 / 1689 loss=3.81, nll_loss=2.286, ppl=4.88, wps=550290, ups=1.11, wpb=494684, bsz=16508.2, num_updates=17600, lr=0.000476731, gnorm=0.212, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=16343 epoch 011: 738 / 1689 loss=3.81, nll_loss=2.286, ppl=4.88, wps=550290, ups=1.11, wpb=494684, bsz=16508.2, num_updates=17600, lr=0.000476731, gnorm=0.212, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=16343 epoch 011: 738 / 1689 loss=3.81, nll_loss=2.286, ppl=4.88, wps=550290, ups=1.11, wpb=494684, bsz=16508.2, num_updates=17600, lr=0.000476731, gnorm=0.212, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=16343 epoch 011: 738 / 1689 loss=3.81, nll_loss=2.286, ppl=4.88, wps=550290, ups=1.11, wpb=494684, bsz=16508.2, num_updates=17600, lr=0.000476731, gnorm=0.212, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=16343 epoch 011: 738 / 1689 loss=3.81, nll_loss=2.286, ppl=4.88, wps=550290, ups=1.11, wpb=494684, bsz=16508.2, num_updates=17600, lr=0.000476731, gnorm=0.212, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=16343 epoch 011: 738 / 1689 loss=3.81, nll_loss=2.286, ppl=4.88, wps=550290, ups=1.11, wpb=494684, bsz=16508.2, num_updates=17600, lr=0.000476731, gnorm=0.212, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=16343 epoch 011: 738 / 1689 loss=3.81, nll_loss=2.286, ppl=4.88, wps=550290, ups=1.11, wpb=494684, bsz=16508.2, num_updates=17600, lr=0.000476731, gnorm=0.212, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=16343 epoch 011: 738 / 1689 loss=3.81, nll_loss=2.286, ppl=4.88, wps=550290, ups=1.11, wpb=494684, bsz=16508.2, num_updates=17600, lr=0.000476731, gnorm=0.212, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=16343 epoch 011: 838 / 1689 loss=3.806, nll_loss=2.281, ppl=4.86, wps=552774, ups=1.11, wpb=495861, bsz=16592.2, num_updates=17700, lr=0.000475383, gnorm=0.205, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=16433 epoch 011: 838 / 1689 loss=3.806, nll_loss=2.281, ppl=4.86, wps=552774, ups=1.11, wpb=495861, bsz=16592.2, num_updates=17700, lr=0.000475383, gnorm=0.205, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=16433 epoch 011: 838 / 1689 loss=3.806, nll_loss=2.281, ppl=4.86, wps=552774, ups=1.11, wpb=495861, bsz=16592.2, num_updates=17700, lr=0.000475383, gnorm=0.205, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=16433 epoch 011: 838 / 1689 loss=3.806, nll_loss=2.281, ppl=4.86, wps=552774, ups=1.11, wpb=495861, bsz=16592.2, num_updates=17700, lr=0.000475383, gnorm=0.205, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=16433 epoch 011: 838 / 1689 loss=3.806, nll_loss=2.281, ppl=4.86, wps=552774, ups=1.11, wpb=495861, bsz=16592.2, num_updates=17700, lr=0.000475383, gnorm=0.205, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=16433 epoch 011: 838 / 1689 loss=3.806, nll_loss=2.281, ppl=4.86, wps=552774, ups=1.11, wpb=495861, bsz=16592.2, num_updates=17700, lr=0.000475383, gnorm=0.205, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=16433 epoch 011: 838 / 1689 loss=3.806, nll_loss=2.281, ppl=4.86, wps=552774, ups=1.11, wpb=495861, bsz=16592.2, num_updates=17700, lr=0.000475383, gnorm=0.205, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=16433 epoch 011: 838 / 1689 loss=3.806, nll_loss=2.281, ppl=4.86, wps=552774, ups=1.11, wpb=495861, bsz=16592.2, num_updates=17700, lr=0.000475383, gnorm=0.205, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=16433 epoch 011: 838 / 1689 loss=3.806, nll_loss=2.281, ppl=4.86, wps=552774, ups=1.11, wpb=495861, bsz=16592.2, num_updates=17700, lr=0.000475383, gnorm=0.205, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=16433 epoch 011: 838 / 1689 loss=3.806, nll_loss=2.281, ppl=4.86, wps=552774, ups=1.11, wpb=495861, bsz=16592.2, num_updates=17700, lr=0.000475383, gnorm=0.205, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=16433 epoch 011: 838 / 1689 loss=3.806, nll_loss=2.281, ppl=4.86, wps=552774, ups=1.11, wpb=495861, bsz=16592.2, num_updates=17700, lr=0.000475383, gnorm=0.205, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=16433 epoch 011: 938 / 1689 loss=3.803, nll_loss=2.277, ppl=4.85, wps=557791, ups=1.12, wpb=496918, bsz=16489, num_updates=17800, lr=0.000474045, gnorm=0.196, clip=0, loss_scale=4, train_wall=88, gb_free=20.9, wall=16522 epoch 011: 938 / 1689 loss=3.803, nll_loss=2.277, ppl=4.85, wps=557791, ups=1.12, wpb=496918, bsz=16489, num_updates=17800, lr=0.000474045, gnorm=0.196, clip=0, loss_scale=4, train_wall=88, gb_free=20.9, wall=16522 epoch 011: 938 / 1689 loss=3.803, nll_loss=2.277, ppl=4.85, wps=557791, ups=1.12, wpb=496918, bsz=16489, num_updates=17800, lr=0.000474045, gnorm=0.196, clip=0, loss_scale=4, train_wall=88, gb_free=20.9, wall=16522 epoch 011: 938 / 1689 loss=3.803, nll_loss=2.277, ppl=4.85, wps=557791, ups=1.12, wpb=496918, bsz=16489, num_updates=17800, lr=0.000474045, gnorm=0.196, clip=0, loss_scale=4, train_wall=88, gb_free=20.9, wall=16522 epoch 011: 938 / 1689 loss=3.803, nll_loss=2.277, ppl=4.85, wps=557791, ups=1.12, wpb=496918, bsz=16489, num_updates=17800, lr=0.000474045, gnorm=0.196, clip=0, loss_scale=4, train_wall=88, gb_free=20.9, wall=16522 epoch 011: 938 / 1689 loss=3.803, nll_loss=2.277, ppl=4.85, wps=557791, ups=1.12, wpb=496918, bsz=16489, num_updates=17800, lr=0.000474045, gnorm=0.196, clip=0, loss_scale=4, train_wall=88, gb_free=20.9, wall=16522 epoch 011: 938 / 1689 loss=3.803, nll_loss=2.277, ppl=4.85, wps=557791, ups=1.12, wpb=496918, bsz=16489, num_updates=17800, lr=0.000474045, gnorm=0.196, clip=0, loss_scale=4, train_wall=88, gb_free=20.9, wall=16522 epoch 011: 938 / 1689 loss=3.803, nll_loss=2.277, ppl=4.85, wps=557791, ups=1.12, wpb=496918, bsz=16489, num_updates=17800, lr=0.000474045, gnorm=0.196, clip=0, loss_scale=4, train_wall=88, gb_free=20.9, wall=16522 epoch 011: 938 / 1689 loss=3.803, nll_loss=2.277, ppl=4.85, wps=557791, ups=1.12, wpb=496918, bsz=16489, num_updates=17800, lr=0.000474045, gnorm=0.196, clip=0, loss_scale=4, train_wall=88, gb_free=20.9, wall=16522 epoch 011: 938 / 1689 loss=3.803, nll_loss=2.277, ppl=4.85, wps=557791, ups=1.12, wpb=496918, bsz=16489, num_updates=17800, lr=0.000474045, gnorm=0.196, clip=0, loss_scale=4, train_wall=88, gb_free=20.9, wall=16522 epoch 011: 938 / 1689 loss=3.803, nll_loss=2.277, ppl=4.85, wps=557791, ups=1.12, wpb=496918, bsz=16489, num_updates=17800, lr=0.000474045, gnorm=0.196, clip=0, loss_scale=4, train_wall=88, gb_free=20.9, wall=16522 epoch 011: 1038 / 1689 loss=3.807, nll_loss=2.282, ppl=4.86, wps=552221, ups=1.12, wpb=493289, bsz=16783, num_updates=17900, lr=0.000472719, gnorm=0.197, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=16611 epoch 011: 1038 / 1689 loss=3.807, nll_loss=2.282, ppl=4.86, wps=552221, ups=1.12, wpb=493289, bsz=16783, num_updates=17900, lr=0.000472719, gnorm=0.197, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=16611 epoch 011: 1038 / 1689 loss=3.807, nll_loss=2.282, ppl=4.86, wps=552221, ups=1.12, wpb=493289, bsz=16783, num_updates=17900, lr=0.000472719, gnorm=0.197, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=16611 epoch 011: 1038 / 1689 loss=3.807, nll_loss=2.282, ppl=4.86, wps=552221, ups=1.12, wpb=493289, bsz=16783, num_updates=17900, lr=0.000472719, gnorm=0.197, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=16611 epoch 011: 1038 / 1689 loss=3.807, nll_loss=2.282, ppl=4.86, wps=552221, ups=1.12, wpb=493289, bsz=16783, num_updates=17900, lr=0.000472719, gnorm=0.197, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=16611 epoch 011: 1038 / 1689 loss=3.807, nll_loss=2.282, ppl=4.86, wps=552221, ups=1.12, wpb=493289, bsz=16783, num_updates=17900, lr=0.000472719, gnorm=0.197, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=16611 epoch 011: 1038 / 1689 loss=3.807, nll_loss=2.282, ppl=4.86, wps=552221, ups=1.12, wpb=493289, bsz=16783, num_updates=17900, lr=0.000472719, gnorm=0.197, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=16611 epoch 011: 1038 / 1689 loss=3.807, nll_loss=2.282, ppl=4.86, wps=552221, ups=1.12, wpb=493289, bsz=16783, num_updates=17900, lr=0.000472719, gnorm=0.197, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=16611 epoch 011: 1038 / 1689 loss=3.807, nll_loss=2.282, ppl=4.86, wps=552221, ups=1.12, wpb=493289, bsz=16783, num_updates=17900, lr=0.000472719, gnorm=0.197, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=16611 epoch 011: 1038 / 1689 loss=3.807, nll_loss=2.282, ppl=4.86, wps=552221, ups=1.12, wpb=493289, bsz=16783, num_updates=17900, lr=0.000472719, gnorm=0.197, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=16611 epoch 011: 1038 / 1689 loss=3.807, nll_loss=2.282, ppl=4.86, wps=552221, ups=1.12, wpb=493289, bsz=16783, num_updates=17900, lr=0.000472719, gnorm=0.197, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=16611 epoch 011: 1139 / 1689 loss=3.812, nll_loss=2.288, ppl=4.88, wps=544106, ups=1.1, wpb=496095, bsz=16592.9, num_updates=18000, lr=0.000471405, gnorm=0.213, clip=0, loss_scale=2, train_wall=90, gb_free=21.2, wall=16702 epoch 011: 1139 / 1689 loss=3.812, nll_loss=2.288, ppl=4.88, wps=544106, ups=1.1, wpb=496095, bsz=16592.9, num_updates=18000, lr=0.000471405, gnorm=0.213, clip=0, loss_scale=2, train_wall=90, gb_free=21.2, wall=16702 epoch 011: 1139 / 1689 loss=3.812, nll_loss=2.288, ppl=4.88, wps=544106, ups=1.1, wpb=496095, bsz=16592.9, num_updates=18000, lr=0.000471405, gnorm=0.213, clip=0, loss_scale=2, train_wall=90, gb_free=21.2, wall=16702 epoch 011: 1139 / 1689 loss=3.812, nll_loss=2.288, ppl=4.88, wps=544106, ups=1.1, wpb=496095, bsz=16592.9, num_updates=18000, lr=0.000471405, gnorm=0.213, clip=0, loss_scale=2, train_wall=90, gb_free=21.2, wall=16702 epoch 011: 1139 / 1689 loss=3.812, nll_loss=2.288, ppl=4.88, wps=544106, ups=1.1, wpb=496095, bsz=16592.9, num_updates=18000, lr=0.000471405, gnorm=0.213, clip=0, loss_scale=2, train_wall=90, gb_free=21.2, wall=16702 epoch 011: 1139 / 1689 loss=3.812, nll_loss=2.288, ppl=4.88, wps=544106, ups=1.1, wpb=496095, bsz=16592.9, num_updates=18000, lr=0.000471405, gnorm=0.213, clip=0, loss_scale=2, train_wall=90, gb_free=21.2, wall=16702 epoch 011: 1139 / 1689 loss=3.812, nll_loss=2.288, ppl=4.88, wps=544106, ups=1.1, wpb=496095, bsz=16592.9, num_updates=18000, lr=0.000471405, gnorm=0.213, clip=0, loss_scale=2, train_wall=90, gb_free=21.2, wall=16702 epoch 011: 1139 / 1689 loss=3.812, nll_loss=2.288, ppl=4.88, wps=544106, ups=1.1, wpb=496095, bsz=16592.9, num_updates=18000, lr=0.000471405, gnorm=0.213, clip=0, loss_scale=2, train_wall=90, gb_free=21.2, wall=16702 epoch 011: 1139 / 1689 loss=3.812, nll_loss=2.288, ppl=4.88, wps=544106, ups=1.1, wpb=496095, bsz=16592.9, num_updates=18000, lr=0.000471405, gnorm=0.213, clip=0, loss_scale=2, train_wall=90, gb_free=21.2, wall=16702 epoch 011: 1139 / 1689 loss=3.812, nll_loss=2.288, ppl=4.88, wps=544106, ups=1.1, wpb=496095, bsz=16592.9, num_updates=18000, lr=0.000471405, gnorm=0.213, clip=0, loss_scale=2, train_wall=90, gb_free=21.2, wall=16702 epoch 011: 1139 / 1689 loss=3.812, nll_loss=2.288, ppl=4.88, wps=544106, ups=1.1, wpb=496095, bsz=16592.9, num_updates=18000, lr=0.000471405, gnorm=0.213, clip=0, loss_scale=2, train_wall=90, gb_free=21.2, wall=16702 begin validation on "valid" subset epoch 011 | valid on 'valid' subset | loss 3.803 | nll_loss 2.231 | ppl 4.69 | wps 0 | wpb 44526 | bsz 2008 | num_updates 18000 | best_loss 3.802 epoch 011 | valid on 'valid' subset | loss 3.803 | nll_loss 2.231 | ppl 4.69 | wps 0 | wpb 44526 | bsz 2008 | num_updates 18000 | best_loss 3.802 epoch 011 | valid on 'valid' subset | loss 3.803 | nll_loss 2.231 | ppl 4.69 | wps 0 | wpb 44526 | bsz 2008 | num_updates 18000 | best_loss 3.802 epoch 011 | valid on 'valid' subset | loss 3.803 | nll_loss 2.231 | ppl 4.69 | wps 0 | wpb 44526 | bsz 2008 | num_updates 18000 | best_loss 3.802 epoch 011 | valid on 'valid' subset | loss 3.803 | nll_loss 2.231 | ppl 4.69 | wps 0 | wpb 44526 | bsz 2008 | num_updates 18000 | best_loss 3.802 epoch 011 | valid on 'valid' subset | loss 3.803 | nll_loss 2.231 | ppl 4.69 | wps 0 | wpb 44526 | bsz 2008 | num_updates 18000 | best_loss 3.802 epoch 011 | valid on 'valid' subset | loss 3.803 | nll_loss 2.231 | ppl 4.69 | wps 0 | wpb 44526 | bsz 2008 | num_updates 18000 | best_loss 3.802 epoch 011 | valid on 'valid' subset | loss 3.803 | nll_loss 2.231 | ppl 4.69 | wps 0 | wpb 44526 | bsz 2008 | num_updates 18000 | best_loss 3.802 epoch 011 | valid on 'valid' subset | loss 3.803 | nll_loss 2.231 | ppl 4.69 | wps 0 | wpb 44526 | bsz 2008 | num_updates 18000 | best_loss 3.802 epoch 011 | valid on 'valid' subset | loss 3.803 | nll_loss 2.231 | ppl 4.69 | wps 0 | wpb 44526 | bsz 2008 | num_updates 18000 | best_loss 3.802 epoch 011 | valid on 'valid' subset | loss 3.803 | nll_loss 2.231 | ppl 4.69 | wps 0 | wpb 44526 | bsz 2008 | num_updates 18000 | best_loss 3.802 epoch 011: 1240 / 1689 loss=3.807, nll_loss=2.282, ppl=4.86, wps=483648, ups=0.98, wpb=495201, bsz=16734, num_updates=18100, lr=0.0004701, gnorm=0.211, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=16805 epoch 011: 1240 / 1689 loss=3.807, nll_loss=2.282, ppl=4.86, wps=483648, ups=0.98, wpb=495201, bsz=16734, num_updates=18100, lr=0.0004701, gnorm=0.211, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=16805 epoch 011: 1240 / 1689 loss=3.807, nll_loss=2.282, ppl=4.86, wps=483648, ups=0.98, wpb=495201, bsz=16734, num_updates=18100, lr=0.0004701, gnorm=0.211, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=16805 epoch 011: 1240 / 1689 loss=3.807, nll_loss=2.282, ppl=4.86, wps=483648, ups=0.98, wpb=495201, bsz=16734, num_updates=18100, lr=0.0004701, gnorm=0.211, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=16805 epoch 011: 1240 / 1689 loss=3.807, nll_loss=2.282, ppl=4.86, wps=483648, ups=0.98, wpb=495201, bsz=16734, num_updates=18100, lr=0.0004701, gnorm=0.211, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=16805 epoch 011: 1240 / 1689 loss=3.807, nll_loss=2.282, ppl=4.86, wps=483648, ups=0.98, wpb=495201, bsz=16734, num_updates=18100, lr=0.0004701, gnorm=0.211, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=16805 epoch 011: 1240 / 1689 loss=3.807, nll_loss=2.282, ppl=4.86, wps=483648, ups=0.98, wpb=495201, bsz=16734, num_updates=18100, lr=0.0004701, gnorm=0.211, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=16805 epoch 011: 1240 / 1689 loss=3.807, nll_loss=2.282, ppl=4.86, wps=483648, ups=0.98, wpb=495201, bsz=16734, num_updates=18100, lr=0.0004701, gnorm=0.211, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=16805 epoch 011: 1240 / 1689 loss=3.807, nll_loss=2.282, ppl=4.86, wps=483648, ups=0.98, wpb=495201, bsz=16734, num_updates=18100, lr=0.0004701, gnorm=0.211, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=16805 epoch 011: 1240 / 1689 loss=3.807, nll_loss=2.282, ppl=4.86, wps=483648, ups=0.98, wpb=495201, bsz=16734, num_updates=18100, lr=0.0004701, gnorm=0.211, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=16805 epoch 011: 1240 / 1689 loss=3.807, nll_loss=2.282, ppl=4.86, wps=483648, ups=0.98, wpb=495201, bsz=16734, num_updates=18100, lr=0.0004701, gnorm=0.211, clip=0, loss_scale=1, train_wall=89, gb_free=22.2, wall=16805 epoch 011: 1340 / 1689 loss=3.811, nll_loss=2.287, ppl=4.88, wps=556139, ups=1.12, wpb=495931, bsz=16401, num_updates=18200, lr=0.000468807, gnorm=0.202, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=16894 epoch 011: 1340 / 1689 loss=3.811, nll_loss=2.287, ppl=4.88, wps=556139, ups=1.12, wpb=495931, bsz=16401, num_updates=18200, lr=0.000468807, gnorm=0.202, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=16894 epoch 011: 1340 / 1689 loss=3.811, nll_loss=2.287, ppl=4.88, wps=556139, ups=1.12, wpb=495931, bsz=16401, num_updates=18200, lr=0.000468807, gnorm=0.202, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=16894 epoch 011: 1340 / 1689 loss=3.811, nll_loss=2.287, ppl=4.88, wps=556139, ups=1.12, wpb=495931, bsz=16401, num_updates=18200, lr=0.000468807, gnorm=0.202, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=16894 epoch 011: 1340 / 1689 loss=3.811, nll_loss=2.287, ppl=4.88, wps=556139, ups=1.12, wpb=495931, bsz=16401, num_updates=18200, lr=0.000468807, gnorm=0.202, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=16894 epoch 011: 1340 / 1689 loss=3.811, nll_loss=2.287, ppl=4.88, wps=556139, ups=1.12, wpb=495931, bsz=16401, num_updates=18200, lr=0.000468807, gnorm=0.202, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=16894 epoch 011: 1340 / 1689 loss=3.811, nll_loss=2.287, ppl=4.88, wps=556139, ups=1.12, wpb=495931, bsz=16401, num_updates=18200, lr=0.000468807, gnorm=0.202, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=16894 epoch 011: 1340 / 1689 loss=3.811, nll_loss=2.287, ppl=4.88, wps=556139, ups=1.12, wpb=495931, bsz=16401, num_updates=18200, lr=0.000468807, gnorm=0.202, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=16894 epoch 011: 1340 / 1689 loss=3.811, nll_loss=2.287, ppl=4.88, wps=556139, ups=1.12, wpb=495931, bsz=16401, num_updates=18200, lr=0.000468807, gnorm=0.202, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=16894 epoch 011: 1340 / 1689 loss=3.811, nll_loss=2.287, ppl=4.88, wps=556139, ups=1.12, wpb=495931, bsz=16401, num_updates=18200, lr=0.000468807, gnorm=0.202, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=16894 epoch 011: 1340 / 1689 loss=3.811, nll_loss=2.287, ppl=4.88, wps=556139, ups=1.12, wpb=495931, bsz=16401, num_updates=18200, lr=0.000468807, gnorm=0.202, clip=0, loss_scale=1, train_wall=88, gb_free=21.4, wall=16894 epoch 011: 1440 / 1689 loss=3.811, nll_loss=2.287, ppl=4.88, wps=552847, ups=1.12, wpb=493728, bsz=16125.3, num_updates=18300, lr=0.000467525, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=20.5, wall=16983 epoch 011: 1440 / 1689 loss=3.811, nll_loss=2.287, ppl=4.88, wps=552847, ups=1.12, wpb=493728, bsz=16125.3, num_updates=18300, lr=0.000467525, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=20.5, wall=16983 epoch 011: 1440 / 1689 loss=3.811, nll_loss=2.287, ppl=4.88, wps=552847, ups=1.12, wpb=493728, bsz=16125.3, num_updates=18300, lr=0.000467525, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=20.5, wall=16983 epoch 011: 1440 / 1689 loss=3.811, nll_loss=2.287, ppl=4.88, wps=552847, ups=1.12, wpb=493728, bsz=16125.3, num_updates=18300, lr=0.000467525, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=20.5, wall=16983 epoch 011: 1440 / 1689 loss=3.811, nll_loss=2.287, ppl=4.88, wps=552847, ups=1.12, wpb=493728, bsz=16125.3, num_updates=18300, lr=0.000467525, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=20.5, wall=16983 epoch 011: 1440 / 1689 loss=3.811, nll_loss=2.287, ppl=4.88, wps=552847, ups=1.12, wpb=493728, bsz=16125.3, num_updates=18300, lr=0.000467525, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=20.5, wall=16983 epoch 011: 1440 / 1689 loss=3.811, nll_loss=2.287, ppl=4.88, wps=552847, ups=1.12, wpb=493728, bsz=16125.3, num_updates=18300, lr=0.000467525, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=20.5, wall=16983 epoch 011: 1440 / 1689 loss=3.811, nll_loss=2.287, ppl=4.88, wps=552847, ups=1.12, wpb=493728, bsz=16125.3, num_updates=18300, lr=0.000467525, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=20.5, wall=16983 epoch 011: 1440 / 1689 loss=3.811, nll_loss=2.287, ppl=4.88, wps=552847, ups=1.12, wpb=493728, bsz=16125.3, num_updates=18300, lr=0.000467525, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=20.5, wall=16983 epoch 011: 1440 / 1689 loss=3.811, nll_loss=2.287, ppl=4.88, wps=552847, ups=1.12, wpb=493728, bsz=16125.3, num_updates=18300, lr=0.000467525, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=20.5, wall=16983 epoch 011: 1440 / 1689 loss=3.811, nll_loss=2.287, ppl=4.88, wps=552847, ups=1.12, wpb=493728, bsz=16125.3, num_updates=18300, lr=0.000467525, gnorm=0.201, clip=0, loss_scale=1, train_wall=88, gb_free=20.5, wall=16983 epoch 011: 1540 / 1689 loss=3.812, nll_loss=2.288, ppl=4.88, wps=553235, ups=1.12, wpb=495044, bsz=16039.4, num_updates=18400, lr=0.000466252, gnorm=0.213, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=17073 epoch 011: 1540 / 1689 loss=3.812, nll_loss=2.288, ppl=4.88, wps=553235, ups=1.12, wpb=495044, bsz=16039.4, num_updates=18400, lr=0.000466252, gnorm=0.213, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=17073 epoch 011: 1540 / 1689 loss=3.812, nll_loss=2.288, ppl=4.88, wps=553235, ups=1.12, wpb=495044, bsz=16039.4, num_updates=18400, lr=0.000466252, gnorm=0.213, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=17073 epoch 011: 1540 / 1689 loss=3.812, nll_loss=2.288, ppl=4.88, wps=553235, ups=1.12, wpb=495044, bsz=16039.4, num_updates=18400, lr=0.000466252, gnorm=0.213, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=17073 epoch 011: 1540 / 1689 loss=3.812, nll_loss=2.288, ppl=4.88, wps=553235, ups=1.12, wpb=495044, bsz=16039.4, num_updates=18400, lr=0.000466252, gnorm=0.213, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=17073 epoch 011: 1540 / 1689 loss=3.812, nll_loss=2.288, ppl=4.88, wps=553235, ups=1.12, wpb=495044, bsz=16039.4, num_updates=18400, lr=0.000466252, gnorm=0.213, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=17073 epoch 011: 1540 / 1689 loss=3.812, nll_loss=2.288, ppl=4.88, wps=553235, ups=1.12, wpb=495044, bsz=16039.4, num_updates=18400, lr=0.000466252, gnorm=0.213, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=17073 epoch 011: 1540 / 1689 loss=3.812, nll_loss=2.288, ppl=4.88, wps=553235, ups=1.12, wpb=495044, bsz=16039.4, num_updates=18400, lr=0.000466252, gnorm=0.213, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=17073 epoch 011: 1540 / 1689 loss=3.812, nll_loss=2.288, ppl=4.88, wps=553235, ups=1.12, wpb=495044, bsz=16039.4, num_updates=18400, lr=0.000466252, gnorm=0.213, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=17073 epoch 011: 1540 / 1689 loss=3.812, nll_loss=2.288, ppl=4.88, wps=553235, ups=1.12, wpb=495044, bsz=16039.4, num_updates=18400, lr=0.000466252, gnorm=0.213, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=17073 epoch 011: 1540 / 1689 loss=3.812, nll_loss=2.288, ppl=4.88, wps=553235, ups=1.12, wpb=495044, bsz=16039.4, num_updates=18400, lr=0.000466252, gnorm=0.213, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=17073 epoch 011: 1640 / 1689 loss=3.805, nll_loss=2.28, ppl=4.86, wps=556142, ups=1.12, wpb=496810, bsz=16394.2, num_updates=18500, lr=0.000464991, gnorm=0.207, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=17162 epoch 011: 1640 / 1689 loss=3.805, nll_loss=2.28, ppl=4.86, wps=556142, ups=1.12, wpb=496810, bsz=16394.2, num_updates=18500, lr=0.000464991, gnorm=0.207, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=17162 epoch 011: 1640 / 1689 loss=3.805, nll_loss=2.28, ppl=4.86, wps=556142, ups=1.12, wpb=496810, bsz=16394.2, num_updates=18500, lr=0.000464991, gnorm=0.207, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=17162 epoch 011: 1640 / 1689 loss=3.805, nll_loss=2.28, ppl=4.86, wps=556142, ups=1.12, wpb=496810, bsz=16394.2, num_updates=18500, lr=0.000464991, gnorm=0.207, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=17162 epoch 011: 1640 / 1689 loss=3.805, nll_loss=2.28, ppl=4.86, wps=556142, ups=1.12, wpb=496810, bsz=16394.2, num_updates=18500, lr=0.000464991, gnorm=0.207, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=17162 epoch 011: 1640 / 1689 loss=3.805, nll_loss=2.28, ppl=4.86, wps=556142, ups=1.12, wpb=496810, bsz=16394.2, num_updates=18500, lr=0.000464991, gnorm=0.207, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=17162 epoch 011: 1640 / 1689 loss=3.805, nll_loss=2.28, ppl=4.86, wps=556142, ups=1.12, wpb=496810, bsz=16394.2, num_updates=18500, lr=0.000464991, gnorm=0.207, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=17162 epoch 011: 1640 / 1689 loss=3.805, nll_loss=2.28, ppl=4.86, wps=556142, ups=1.12, wpb=496810, bsz=16394.2, num_updates=18500, lr=0.000464991, gnorm=0.207, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=17162 epoch 011: 1640 / 1689 loss=3.805, nll_loss=2.28, ppl=4.86, wps=556142, ups=1.12, wpb=496810, bsz=16394.2, num_updates=18500, lr=0.000464991, gnorm=0.207, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=17162 epoch 011: 1640 / 1689 loss=3.805, nll_loss=2.28, ppl=4.86, wps=556142, ups=1.12, wpb=496810, bsz=16394.2, num_updates=18500, lr=0.000464991, gnorm=0.207, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=17162 epoch 011: 1640 / 1689 loss=3.805, nll_loss=2.28, ppl=4.86, wps=556142, ups=1.12, wpb=496810, bsz=16394.2, num_updates=18500, lr=0.000464991, gnorm=0.207, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=17162 end of epoch 11 (average epoch stats below) epoch 011 | loss 3.808 | nll_loss 2.284 | ppl 4.87 | wps 542772 | ups 1.1 | wpb 495118 | bsz 16505.4 | num_updates 18549 | lr 0.000464376 | gnorm 0.205 | clip 0 | loss_scale 2 | train_wall 1490 | gb_free 22.7 | wall 17205 epoch 011 | loss 3.808 | nll_loss 2.284 | ppl 4.87 | wps 542772 | ups 1.1 | wpb 495118 | bsz 16505.4 | num_updates 18549 | lr 0.000464376 | gnorm 0.205 | clip 0 | loss_scale 2 | train_wall 1490 | gb_free 22.7 | wall 17205 epoch 011 | loss 3.808 | nll_loss 2.284 | ppl 4.87 | wps 542772 | ups 1.1 | wpb 495118 | bsz 16505.4 | num_updates 18549 | lr 0.000464376 | gnorm 0.205 | clip 0 | loss_scale 2 | train_wall 1490 | gb_free 22.7 | wall 17205 epoch 011 | loss 3.808 | nll_loss 2.284 | ppl 4.87 | wps 542772 | ups 1.1 | wpb 495118 | bsz 16505.4 | num_updates 18549 | lr 0.000464376 | gnorm 0.205 | clip 0 | loss_scale 2 | train_wall 1490 | gb_free 22.7 | wall 17205 epoch 011 | loss 3.808 | nll_loss 2.284 | ppl 4.87 | wps 542772 | ups 1.1 | wpb 495118 | bsz 16505.4 | num_updates 18549 | lr 0.000464376 | gnorm 0.205 | clip 0 | loss_scale 2 | train_wall 1490 | gb_free 22.7 | wall 17205 epoch 011 | loss 3.808 | nll_loss 2.284 | ppl 4.87 | wps 542772 | ups 1.1 | wpb 495118 | bsz 16505.4 | num_updates 18549 | lr 0.000464376 | gnorm 0.205 | clip 0 | loss_scale 2 | train_wall 1490 | gb_free 22.7 | wall 17205 epoch 011 | loss 3.808 | nll_loss 2.284 | ppl 4.87 | wps 542772 | ups 1.1 | wpb 495118 | bsz 16505.4 | num_updates 18549 | lr 0.000464376 | gnorm 0.205 | clip 0 | loss_scale 2 | train_wall 1490 | gb_free 22.7 | wall 17205 epoch 011 | loss 3.808 | nll_loss 2.284 | ppl 4.87 | wps 542772 | ups 1.1 | wpb 495118 | bsz 16505.4 | num_updates 18549 | lr 0.000464376 | gnorm 0.205 | clip 0 | loss_scale 2 | train_wall 1490 | gb_free 22.7 | wall 17205 epoch 011 | loss 3.808 | nll_loss 2.284 | ppl 4.87 | wps 542772 | ups 1.1 | wpb 495118 | bsz 16505.4 | num_updates 18549 | lr 0.000464376 | gnorm 0.205 | clip 0 | loss_scale 2 | train_wall 1490 | gb_free 22.7 | wall 17205 epoch 011 | loss 3.808 | nll_loss 2.284 | ppl 4.87 | wps 542772 | ups 1.1 | wpb 495118 | bsz 16505.4 | num_updates 18549 | lr 0.000464376 | gnorm 0.205 | clip 0 | loss_scale 2 | train_wall 1490 | gb_free 22.7 | wall 17205 epoch 011 | loss 3.808 | nll_loss 2.284 | ppl 4.87 | wps 542772 | ups 1.1 | wpb 495118 | bsz 16505.4 | num_updates 18549 | lr 0.000464376 | gnorm 0.205 | clip 0 | loss_scale 2 | train_wall 1490 | gb_free 22.7 | wall 17205 Start iterating over samples epoch 012: 51 / 1689 loss=3.799, nll_loss=2.273, ppl=4.83, wps=545800, ups=1.11, wpb=491031, bsz=16377, num_updates=18600, lr=0.000463739, gnorm=0.196, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=17252 epoch 012: 51 / 1689 loss=3.799, nll_loss=2.273, ppl=4.83, wps=545800, ups=1.11, wpb=491031, bsz=16377, num_updates=18600, lr=0.000463739, gnorm=0.196, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=17252 epoch 012: 51 / 1689 loss=3.799, nll_loss=2.273, ppl=4.83, wps=545800, ups=1.11, wpb=491031, bsz=16377, num_updates=18600, lr=0.000463739, gnorm=0.196, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=17252 epoch 012: 51 / 1689 loss=3.799, nll_loss=2.273, ppl=4.83, wps=545800, ups=1.11, wpb=491031, bsz=16377, num_updates=18600, lr=0.000463739, gnorm=0.196, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=17252 epoch 012: 51 / 1689 loss=3.799, nll_loss=2.273, ppl=4.83, wps=545800, ups=1.11, wpb=491031, bsz=16377, num_updates=18600, lr=0.000463739, gnorm=0.196, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=17252 epoch 012: 51 / 1689 loss=3.799, nll_loss=2.273, ppl=4.83, wps=545800, ups=1.11, wpb=491031, bsz=16377, num_updates=18600, lr=0.000463739, gnorm=0.196, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=17252 epoch 012: 51 / 1689 loss=3.799, nll_loss=2.273, ppl=4.83, wps=545800, ups=1.11, wpb=491031, bsz=16377, num_updates=18600, lr=0.000463739, gnorm=0.196, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=17252 epoch 012: 51 / 1689 loss=3.799, nll_loss=2.273, ppl=4.83, wps=545800, ups=1.11, wpb=491031, bsz=16377, num_updates=18600, lr=0.000463739, gnorm=0.196, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=17252 epoch 012: 51 / 1689 loss=3.799, nll_loss=2.273, ppl=4.83, wps=545800, ups=1.11, wpb=491031, bsz=16377, num_updates=18600, lr=0.000463739, gnorm=0.196, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=17252 epoch 012: 51 / 1689 loss=3.799, nll_loss=2.273, ppl=4.83, wps=545800, ups=1.11, wpb=491031, bsz=16377, num_updates=18600, lr=0.000463739, gnorm=0.196, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=17252 epoch 012: 51 / 1689 loss=3.799, nll_loss=2.273, ppl=4.83, wps=545800, ups=1.11, wpb=491031, bsz=16377, num_updates=18600, lr=0.000463739, gnorm=0.196, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=17252 epoch 012: 51 / 1689 loss=3.799, nll_loss=2.273, ppl=4.83, wps=545800, ups=1.11, wpb=491031, bsz=16377, num_updates=18600, lr=0.000463739, gnorm=0.196, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=17252 epoch 012: 151 / 1689 loss=3.783, nll_loss=2.255, ppl=4.77, wps=545387, ups=1.1, wpb=496225, bsz=16312.6, num_updates=18700, lr=0.000462497, gnorm=0.201, clip=0, loss_scale=2, train_wall=90, gb_free=22.4, wall=17343 epoch 012: 151 / 1689 loss=3.783, nll_loss=2.255, ppl=4.77, wps=545387, ups=1.1, wpb=496225, bsz=16312.6, num_updates=18700, lr=0.000462497, gnorm=0.201, clip=0, loss_scale=2, train_wall=90, gb_free=22.4, wall=17343 epoch 012: 151 / 1689 loss=3.783, nll_loss=2.255, ppl=4.77, wps=545387, ups=1.1, wpb=496225, bsz=16312.6, num_updates=18700, lr=0.000462497, gnorm=0.201, clip=0, loss_scale=2, train_wall=90, gb_free=22.4, wall=17343 epoch 012: 151 / 1689 loss=3.783, nll_loss=2.255, ppl=4.77, wps=545387, ups=1.1, wpb=496225, bsz=16312.6, num_updates=18700, lr=0.000462497, gnorm=0.201, clip=0, loss_scale=2, train_wall=90, gb_free=22.4, wall=17343 epoch 012: 151 / 1689 loss=3.783, nll_loss=2.255, ppl=4.77, wps=545387, ups=1.1, wpb=496225, bsz=16312.6, num_updates=18700, lr=0.000462497, gnorm=0.201, clip=0, loss_scale=2, train_wall=90, gb_free=22.4, wall=17343 epoch 012: 151 / 1689 loss=3.783, nll_loss=2.255, ppl=4.77, wps=545387, ups=1.1, wpb=496225, bsz=16312.6, num_updates=18700, lr=0.000462497, gnorm=0.201, clip=0, loss_scale=2, train_wall=90, gb_free=22.4, wall=17343 epoch 012: 151 / 1689 loss=3.783, nll_loss=2.255, ppl=4.77, wps=545387, ups=1.1, wpb=496225, bsz=16312.6, num_updates=18700, lr=0.000462497, gnorm=0.201, clip=0, loss_scale=2, train_wall=90, gb_free=22.4, wall=17343 epoch 012: 151 / 1689 loss=3.783, nll_loss=2.255, ppl=4.77, wps=545387, ups=1.1, wpb=496225, bsz=16312.6, num_updates=18700, lr=0.000462497, gnorm=0.201, clip=0, loss_scale=2, train_wall=90, gb_free=22.4, wall=17343 epoch 012: 151 / 1689 loss=3.783, nll_loss=2.255, ppl=4.77, wps=545387, ups=1.1, wpb=496225, bsz=16312.6, num_updates=18700, lr=0.000462497, gnorm=0.201, clip=0, loss_scale=2, train_wall=90, gb_free=22.4, wall=17343 epoch 012: 151 / 1689 loss=3.783, nll_loss=2.255, ppl=4.77, wps=545387, ups=1.1, wpb=496225, bsz=16312.6, num_updates=18700, lr=0.000462497, gnorm=0.201, clip=0, loss_scale=2, train_wall=90, gb_free=22.4, wall=17343 epoch 012: 151 / 1689 loss=3.783, nll_loss=2.255, ppl=4.77, wps=545387, ups=1.1, wpb=496225, bsz=16312.6, num_updates=18700, lr=0.000462497, gnorm=0.201, clip=0, loss_scale=2, train_wall=90, gb_free=22.4, wall=17343 epoch 012: 151 / 1689 loss=3.783, nll_loss=2.255, ppl=4.77, wps=545387, ups=1.1, wpb=496225, bsz=16312.6, num_updates=18700, lr=0.000462497, gnorm=0.201, clip=0, loss_scale=2, train_wall=90, gb_free=22.4, wall=17343 epoch 012: 251 / 1689 loss=3.792, nll_loss=2.265, ppl=4.81, wps=545389, ups=1.1, wpb=494887, bsz=16667.9, num_updates=18800, lr=0.000461266, gnorm=0.198, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=17434 epoch 012: 251 / 1689 loss=3.792, nll_loss=2.265, ppl=4.81, wps=545389, ups=1.1, wpb=494887, bsz=16667.9, num_updates=18800, lr=0.000461266, gnorm=0.198, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=17434 epoch 012: 251 / 1689 loss=3.792, nll_loss=2.265, ppl=4.81, wps=545389, ups=1.1, wpb=494887, bsz=16667.9, num_updates=18800, lr=0.000461266, gnorm=0.198, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=17434 epoch 012: 251 / 1689 loss=3.792, nll_loss=2.265, ppl=4.81, wps=545389, ups=1.1, wpb=494887, bsz=16667.9, num_updates=18800, lr=0.000461266, gnorm=0.198, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=17434 epoch 012: 251 / 1689 loss=3.792, nll_loss=2.265, ppl=4.81, wps=545389, ups=1.1, wpb=494887, bsz=16667.9, num_updates=18800, lr=0.000461266, gnorm=0.198, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=17434 epoch 012: 251 / 1689 loss=3.792, nll_loss=2.265, ppl=4.81, wps=545389, ups=1.1, wpb=494887, bsz=16667.9, num_updates=18800, lr=0.000461266, gnorm=0.198, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=17434 epoch 012: 251 / 1689 loss=3.792, nll_loss=2.265, ppl=4.81, wps=545389, ups=1.1, wpb=494887, bsz=16667.9, num_updates=18800, lr=0.000461266, gnorm=0.198, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=17434 epoch 012: 251 / 1689 loss=3.792, nll_loss=2.265, ppl=4.81, wps=545389, ups=1.1, wpb=494887, bsz=16667.9, num_updates=18800, lr=0.000461266, gnorm=0.198, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=17434 epoch 012: 251 / 1689 loss=3.792, nll_loss=2.265, ppl=4.81, wps=545389, ups=1.1, wpb=494887, bsz=16667.9, num_updates=18800, lr=0.000461266, gnorm=0.198, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=17434 epoch 012: 251 / 1689 loss=3.792, nll_loss=2.265, ppl=4.81, wps=545389, ups=1.1, wpb=494887, bsz=16667.9, num_updates=18800, lr=0.000461266, gnorm=0.198, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=17434 epoch 012: 251 / 1689 loss=3.792, nll_loss=2.265, ppl=4.81, wps=545389, ups=1.1, wpb=494887, bsz=16667.9, num_updates=18800, lr=0.000461266, gnorm=0.198, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=17434 epoch 012: 251 / 1689 loss=3.792, nll_loss=2.265, ppl=4.81, wps=545389, ups=1.1, wpb=494887, bsz=16667.9, num_updates=18800, lr=0.000461266, gnorm=0.198, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=17434 epoch 012: 351 / 1689 loss=3.793, nll_loss=2.267, ppl=4.81, wps=545854, ups=1.1, wpb=494357, bsz=16474.9, num_updates=18900, lr=0.000460044, gnorm=0.206, clip=0, loss_scale=2, train_wall=89, gb_free=20.2, wall=17524 epoch 012: 351 / 1689 loss=3.793, nll_loss=2.267, ppl=4.81, wps=545854, ups=1.1, wpb=494357, bsz=16474.9, num_updates=18900, lr=0.000460044, gnorm=0.206, clip=0, loss_scale=2, train_wall=89, gb_free=20.2, wall=17524 epoch 012: 351 / 1689 loss=3.793, nll_loss=2.267, ppl=4.81, wps=545854, ups=1.1, wpb=494357, bsz=16474.9, num_updates=18900, lr=0.000460044, gnorm=0.206, clip=0, loss_scale=2, train_wall=89, gb_free=20.2, wall=17524 epoch 012: 351 / 1689 loss=3.793, nll_loss=2.267, ppl=4.81, wps=545854, ups=1.1, wpb=494357, bsz=16474.9, num_updates=18900, lr=0.000460044, gnorm=0.206, clip=0, loss_scale=2, train_wall=89, gb_free=20.2, wall=17524 epoch 012: 351 / 1689 loss=3.793, nll_loss=2.267, ppl=4.81, wps=545854, ups=1.1, wpb=494357, bsz=16474.9, num_updates=18900, lr=0.000460044, gnorm=0.206, clip=0, loss_scale=2, train_wall=89, gb_free=20.2, wall=17524 epoch 012: 351 / 1689 loss=3.793, nll_loss=2.267, ppl=4.81, wps=545854, ups=1.1, wpb=494357, bsz=16474.9, num_updates=18900, lr=0.000460044, gnorm=0.206, clip=0, loss_scale=2, train_wall=89, gb_free=20.2, wall=17524 epoch 012: 351 / 1689 loss=3.793, nll_loss=2.267, ppl=4.81, wps=545854, ups=1.1, wpb=494357, bsz=16474.9, num_updates=18900, lr=0.000460044, gnorm=0.206, clip=0, loss_scale=2, train_wall=89, gb_free=20.2, wall=17524 epoch 012: 351 / 1689 loss=3.793, nll_loss=2.267, ppl=4.81, wps=545854, ups=1.1, wpb=494357, bsz=16474.9, num_updates=18900, lr=0.000460044, gnorm=0.206, clip=0, loss_scale=2, train_wall=89, gb_free=20.2, wall=17524 epoch 012: 351 / 1689 loss=3.793, nll_loss=2.267, ppl=4.81, wps=545854, ups=1.1, wpb=494357, bsz=16474.9, num_updates=18900, lr=0.000460044, gnorm=0.206, clip=0, loss_scale=2, train_wall=89, gb_free=20.2, wall=17524 epoch 012: 351 / 1689 loss=3.793, nll_loss=2.267, ppl=4.81, wps=545854, ups=1.1, wpb=494357, bsz=16474.9, num_updates=18900, lr=0.000460044, gnorm=0.206, clip=0, loss_scale=2, train_wall=89, gb_free=20.2, wall=17524 epoch 012: 351 / 1689 loss=3.793, nll_loss=2.267, ppl=4.81, wps=545854, ups=1.1, wpb=494357, bsz=16474.9, num_updates=18900, lr=0.000460044, gnorm=0.206, clip=0, loss_scale=2, train_wall=89, gb_free=20.2, wall=17524 epoch 012: 351 / 1689 loss=3.793, nll_loss=2.267, ppl=4.81, wps=545854, ups=1.1, wpb=494357, bsz=16474.9, num_updates=18900, lr=0.000460044, gnorm=0.206, clip=0, loss_scale=2, train_wall=89, gb_free=20.2, wall=17524 epoch 012: 451 / 1689 loss=3.796, nll_loss=2.27, ppl=4.82, wps=547716, ups=1.11, wpb=494311, bsz=16398.4, num_updates=19000, lr=0.000458831, gnorm=0.199, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=17614 epoch 012: 451 / 1689 loss=3.796, nll_loss=2.27, ppl=4.82, wps=547716, ups=1.11, wpb=494311, bsz=16398.4, num_updates=19000, lr=0.000458831, gnorm=0.199, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=17614 epoch 012: 451 / 1689 loss=3.796, nll_loss=2.27, ppl=4.82, wps=547716, ups=1.11, wpb=494311, bsz=16398.4, num_updates=19000, lr=0.000458831, gnorm=0.199, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=17614 epoch 012: 451 / 1689 loss=3.796, nll_loss=2.27, ppl=4.82, wps=547716, ups=1.11, wpb=494311, bsz=16398.4, num_updates=19000, lr=0.000458831, gnorm=0.199, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=17614 epoch 012: 451 / 1689 loss=3.796, nll_loss=2.27, ppl=4.82, wps=547716, ups=1.11, wpb=494311, bsz=16398.4, num_updates=19000, lr=0.000458831, gnorm=0.199, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=17614 epoch 012: 451 / 1689 loss=3.796, nll_loss=2.27, ppl=4.82, wps=547716, ups=1.11, wpb=494311, bsz=16398.4, num_updates=19000, lr=0.000458831, gnorm=0.199, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=17614 epoch 012: 451 / 1689 loss=3.796, nll_loss=2.27, ppl=4.82, wps=547716, ups=1.11, wpb=494311, bsz=16398.4, num_updates=19000, lr=0.000458831, gnorm=0.199, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=17614 epoch 012: 451 / 1689 loss=3.796, nll_loss=2.27, ppl=4.82, wps=547716, ups=1.11, wpb=494311, bsz=16398.4, num_updates=19000, lr=0.000458831, gnorm=0.199, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=17614 epoch 012: 451 / 1689 loss=3.796, nll_loss=2.27, ppl=4.82, wps=547716, ups=1.11, wpb=494311, bsz=16398.4, num_updates=19000, lr=0.000458831, gnorm=0.199, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=17614 epoch 012: 451 / 1689 loss=3.796, nll_loss=2.27, ppl=4.82, wps=547716, ups=1.11, wpb=494311, bsz=16398.4, num_updates=19000, lr=0.000458831, gnorm=0.199, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=17614 epoch 012: 451 / 1689 loss=3.796, nll_loss=2.27, ppl=4.82, wps=547716, ups=1.11, wpb=494311, bsz=16398.4, num_updates=19000, lr=0.000458831, gnorm=0.199, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=17614 epoch 012: 451 / 1689 loss=3.796, nll_loss=2.27, ppl=4.82, wps=547716, ups=1.11, wpb=494311, bsz=16398.4, num_updates=19000, lr=0.000458831, gnorm=0.199, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=17614 begin validation on "valid" subset epoch 012 | valid on 'valid' subset | loss 3.8 | nll_loss 2.228 | ppl 4.69 | wps 0 | wpb 44526 | bsz 2008 | num_updates 19000 | best_loss 3.8 epoch 012 | valid on 'valid' subset | loss 3.8 | nll_loss 2.228 | ppl 4.69 | wps 0 | wpb 44526 | bsz 2008 | num_updates 19000 | best_loss 3.8 epoch 012 | valid on 'valid' subset | loss 3.8 | nll_loss 2.228 | ppl 4.69 | wps 0 | wpb 44526 | bsz 2008 | num_updates 19000 | best_loss 3.8 epoch 012 | valid on 'valid' subset | loss 3.8 | nll_loss 2.228 | ppl 4.69 | wps 0 | wpb 44526 | bsz 2008 | num_updates 19000 | best_loss 3.8 epoch 012 | valid on 'valid' subset | loss 3.8 | nll_loss 2.228 | ppl 4.69 | wps 0 | wpb 44526 | bsz 2008 | num_updates 19000 | best_loss 3.8 epoch 012 | valid on 'valid' subset | loss 3.8 | nll_loss 2.228 | ppl 4.69 | wps 0 | wpb 44526 | bsz 2008 | num_updates 19000 | best_loss 3.8 epoch 012 | valid on 'valid' subset | loss 3.8 | nll_loss 2.228 | ppl 4.69 | wps 0 | wpb 44526 | bsz 2008 | num_updates 19000 | best_loss 3.8 epoch 012 | valid on 'valid' subset | loss 3.8 | nll_loss 2.228 | ppl 4.69 | wps 0 | wpb 44526 | bsz 2008 | num_updates 19000 | best_loss 3.8 epoch 012 | valid on 'valid' subset | loss 3.8 | nll_loss 2.228 | ppl 4.69 | wps 0 | wpb 44526 | bsz 2008 | num_updates 19000 | best_loss 3.8 epoch 012 | valid on 'valid' subset | loss 3.8 | nll_loss 2.228 | ppl 4.69 | wps 0 | wpb 44526 | bsz 2008 | num_updates 19000 | best_loss 3.8 epoch 012 | valid on 'valid' subset | loss 3.8 | nll_loss 2.228 | ppl 4.69 | wps 0 | wpb 44526 | bsz 2008 | num_updates 19000 | best_loss 3.8 epoch 012 | valid on 'valid' subset | loss 3.8 | nll_loss 2.228 | ppl 4.69 | wps 0 | wpb 44526 | bsz 2008 | num_updates 19000 | best_loss 3.8 epoch 012: 551 / 1689 loss=3.793, nll_loss=2.267, ppl=4.81, wps=384258, ups=0.78, wpb=495698, bsz=16475.9, num_updates=19100, lr=0.000457629, gnorm=0.185, clip=0, loss_scale=4, train_wall=88, gb_free=20.6, wall=17743 epoch 012: 551 / 1689 loss=3.793, nll_loss=2.267, ppl=4.81, wps=384258, ups=0.78, wpb=495698, bsz=16475.9, num_updates=19100, lr=0.000457629, gnorm=0.185, clip=0, loss_scale=4, train_wall=88, gb_free=20.6, wall=17743 epoch 012: 551 / 1689 loss=3.793, nll_loss=2.267, ppl=4.81, wps=384258, ups=0.78, wpb=495698, bsz=16475.9, num_updates=19100, lr=0.000457629, gnorm=0.185, clip=0, loss_scale=4, train_wall=88, gb_free=20.6, wall=17743 epoch 012: 551 / 1689 loss=3.793, nll_loss=2.267, ppl=4.81, wps=384258, ups=0.78, wpb=495698, bsz=16475.9, num_updates=19100, lr=0.000457629, gnorm=0.185, clip=0, loss_scale=4, train_wall=88, gb_free=20.6, wall=17743 epoch 012: 551 / 1689 loss=3.793, nll_loss=2.267, ppl=4.81, wps=384258, ups=0.78, wpb=495698, bsz=16475.9, num_updates=19100, lr=0.000457629, gnorm=0.185, clip=0, loss_scale=4, train_wall=88, gb_free=20.6, wall=17743 epoch 012: 551 / 1689 loss=3.793, nll_loss=2.267, ppl=4.81, wps=384258, ups=0.78, wpb=495698, bsz=16475.9, num_updates=19100, lr=0.000457629, gnorm=0.185, clip=0, loss_scale=4, train_wall=88, gb_free=20.6, wall=17743 epoch 012: 551 / 1689 loss=3.793, nll_loss=2.267, ppl=4.81, wps=384258, ups=0.78, wpb=495698, bsz=16475.9, num_updates=19100, lr=0.000457629, gnorm=0.185, clip=0, loss_scale=4, train_wall=88, gb_free=20.6, wall=17743 epoch 012: 551 / 1689 loss=3.793, nll_loss=2.267, ppl=4.81, wps=384258, ups=0.78, wpb=495698, bsz=16475.9, num_updates=19100, lr=0.000457629, gnorm=0.185, clip=0, loss_scale=4, train_wall=88, gb_free=20.6, wall=17743 epoch 012: 551 / 1689 loss=3.793, nll_loss=2.267, ppl=4.81, wps=384258, ups=0.78, wpb=495698, bsz=16475.9, num_updates=19100, lr=0.000457629, gnorm=0.185, clip=0, loss_scale=4, train_wall=88, gb_free=20.6, wall=17743 epoch 012: 551 / 1689 loss=3.793, nll_loss=2.267, ppl=4.81, wps=384258, ups=0.78, wpb=495698, bsz=16475.9, num_updates=19100, lr=0.000457629, gnorm=0.185, clip=0, loss_scale=4, train_wall=88, gb_free=20.6, wall=17743 epoch 012: 551 / 1689 loss=3.793, nll_loss=2.267, ppl=4.81, wps=384258, ups=0.78, wpb=495698, bsz=16475.9, num_updates=19100, lr=0.000457629, gnorm=0.185, clip=0, loss_scale=4, train_wall=88, gb_free=20.6, wall=17743 epoch 012: 551 / 1689 loss=3.793, nll_loss=2.267, ppl=4.81, wps=384258, ups=0.78, wpb=495698, bsz=16475.9, num_updates=19100, lr=0.000457629, gnorm=0.185, clip=0, loss_scale=4, train_wall=88, gb_free=20.6, wall=17743 epoch 012: 651 / 1689 loss=3.789, nll_loss=2.263, ppl=4.8, wps=560405, ups=1.13, wpb=496937, bsz=16718.6, num_updates=19200, lr=0.000456435, gnorm=0.196, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=17832 epoch 012: 651 / 1689 loss=3.789, nll_loss=2.263, ppl=4.8, wps=560405, ups=1.13, wpb=496937, bsz=16718.6, num_updates=19200, lr=0.000456435, gnorm=0.196, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=17832 epoch 012: 651 / 1689 loss=3.789, nll_loss=2.263, ppl=4.8, wps=560405, ups=1.13, wpb=496937, bsz=16718.6, num_updates=19200, lr=0.000456435, gnorm=0.196, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=17832 epoch 012: 651 / 1689 loss=3.789, nll_loss=2.263, ppl=4.8, wps=560405, ups=1.13, wpb=496937, bsz=16718.6, num_updates=19200, lr=0.000456435, gnorm=0.196, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=17832 epoch 012: 651 / 1689 loss=3.789, nll_loss=2.263, ppl=4.8, wps=560405, ups=1.13, wpb=496937, bsz=16718.6, num_updates=19200, lr=0.000456435, gnorm=0.196, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=17832 epoch 012: 651 / 1689 loss=3.789, nll_loss=2.263, ppl=4.8, wps=560405, ups=1.13, wpb=496937, bsz=16718.6, num_updates=19200, lr=0.000456435, gnorm=0.196, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=17832 epoch 012: 651 / 1689 loss=3.789, nll_loss=2.263, ppl=4.8, wps=560405, ups=1.13, wpb=496937, bsz=16718.6, num_updates=19200, lr=0.000456435, gnorm=0.196, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=17832 epoch 012: 651 / 1689 loss=3.789, nll_loss=2.263, ppl=4.8, wps=560405, ups=1.13, wpb=496937, bsz=16718.6, num_updates=19200, lr=0.000456435, gnorm=0.196, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=17832 epoch 012: 651 / 1689 loss=3.789, nll_loss=2.263, ppl=4.8, wps=560405, ups=1.13, wpb=496937, bsz=16718.6, num_updates=19200, lr=0.000456435, gnorm=0.196, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=17832 epoch 012: 651 / 1689 loss=3.789, nll_loss=2.263, ppl=4.8, wps=560405, ups=1.13, wpb=496937, bsz=16718.6, num_updates=19200, lr=0.000456435, gnorm=0.196, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=17832 epoch 012: 651 / 1689 loss=3.789, nll_loss=2.263, ppl=4.8, wps=560405, ups=1.13, wpb=496937, bsz=16718.6, num_updates=19200, lr=0.000456435, gnorm=0.196, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=17832 epoch 012: 651 / 1689 loss=3.789, nll_loss=2.263, ppl=4.8, wps=560405, ups=1.13, wpb=496937, bsz=16718.6, num_updates=19200, lr=0.000456435, gnorm=0.196, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=17832 epoch 012: 751 / 1689 loss=3.797, nll_loss=2.271, ppl=4.83, wps=559508, ups=1.13, wpb=496388, bsz=16486.8, num_updates=19300, lr=0.000455251, gnorm=0.21, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=17921 epoch 012: 751 / 1689 loss=3.797, nll_loss=2.271, ppl=4.83, wps=559508, ups=1.13, wpb=496388, bsz=16486.8, num_updates=19300, lr=0.000455251, gnorm=0.21, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=17921 epoch 012: 751 / 1689 loss=3.797, nll_loss=2.271, ppl=4.83, wps=559508, ups=1.13, wpb=496388, bsz=16486.8, num_updates=19300, lr=0.000455251, gnorm=0.21, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=17921 epoch 012: 751 / 1689 loss=3.797, nll_loss=2.271, ppl=4.83, wps=559508, ups=1.13, wpb=496388, bsz=16486.8, num_updates=19300, lr=0.000455251, gnorm=0.21, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=17921 epoch 012: 751 / 1689 loss=3.797, nll_loss=2.271, ppl=4.83, wps=559508, ups=1.13, wpb=496388, bsz=16486.8, num_updates=19300, lr=0.000455251, gnorm=0.21, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=17921 epoch 012: 751 / 1689 loss=3.797, nll_loss=2.271, ppl=4.83, wps=559508, ups=1.13, wpb=496388, bsz=16486.8, num_updates=19300, lr=0.000455251, gnorm=0.21, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=17921 epoch 012: 751 / 1689 loss=3.797, nll_loss=2.271, ppl=4.83, wps=559508, ups=1.13, wpb=496388, bsz=16486.8, num_updates=19300, lr=0.000455251, gnorm=0.21, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=17921 epoch 012: 751 / 1689 loss=3.797, nll_loss=2.271, ppl=4.83, wps=559508, ups=1.13, wpb=496388, bsz=16486.8, num_updates=19300, lr=0.000455251, gnorm=0.21, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=17921 epoch 012: 751 / 1689 loss=3.797, nll_loss=2.271, ppl=4.83, wps=559508, ups=1.13, wpb=496388, bsz=16486.8, num_updates=19300, lr=0.000455251, gnorm=0.21, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=17921 epoch 012: 751 / 1689 loss=3.797, nll_loss=2.271, ppl=4.83, wps=559508, ups=1.13, wpb=496388, bsz=16486.8, num_updates=19300, lr=0.000455251, gnorm=0.21, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=17921 epoch 012: 751 / 1689 loss=3.797, nll_loss=2.271, ppl=4.83, wps=559508, ups=1.13, wpb=496388, bsz=16486.8, num_updates=19300, lr=0.000455251, gnorm=0.21, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=17921 epoch 012: 751 / 1689 loss=3.797, nll_loss=2.271, ppl=4.83, wps=559508, ups=1.13, wpb=496388, bsz=16486.8, num_updates=19300, lr=0.000455251, gnorm=0.21, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=17921 epoch 012: 851 / 1689 loss=3.787, nll_loss=2.261, ppl=4.79, wps=554663, ups=1.12, wpb=495583, bsz=16763.2, num_updates=19400, lr=0.000454077, gnorm=0.2, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=18010 epoch 012: 851 / 1689 loss=3.787, nll_loss=2.261, ppl=4.79, wps=554663, ups=1.12, wpb=495583, bsz=16763.2, num_updates=19400, lr=0.000454077, gnorm=0.2, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=18010 epoch 012: 851 / 1689 loss=3.787, nll_loss=2.261, ppl=4.79, wps=554663, ups=1.12, wpb=495583, bsz=16763.2, num_updates=19400, lr=0.000454077, gnorm=0.2, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=18010 epoch 012: 851 / 1689 loss=3.787, nll_loss=2.261, ppl=4.79, wps=554663, ups=1.12, wpb=495583, bsz=16763.2, num_updates=19400, lr=0.000454077, gnorm=0.2, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=18010 epoch 012: 851 / 1689 loss=3.787, nll_loss=2.261, ppl=4.79, wps=554663, ups=1.12, wpb=495583, bsz=16763.2, num_updates=19400, lr=0.000454077, gnorm=0.2, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=18010 epoch 012: 851 / 1689 loss=3.787, nll_loss=2.261, ppl=4.79, wps=554663, ups=1.12, wpb=495583, bsz=16763.2, num_updates=19400, lr=0.000454077, gnorm=0.2, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=18010 epoch 012: 851 / 1689 loss=3.787, nll_loss=2.261, ppl=4.79, wps=554663, ups=1.12, wpb=495583, bsz=16763.2, num_updates=19400, lr=0.000454077, gnorm=0.2, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=18010 epoch 012: 851 / 1689 loss=3.787, nll_loss=2.261, ppl=4.79, wps=554663, ups=1.12, wpb=495583, bsz=16763.2, num_updates=19400, lr=0.000454077, gnorm=0.2, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=18010 epoch 012: 851 / 1689 loss=3.787, nll_loss=2.261, ppl=4.79, wps=554663, ups=1.12, wpb=495583, bsz=16763.2, num_updates=19400, lr=0.000454077, gnorm=0.2, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=18010 epoch 012: 851 / 1689 loss=3.787, nll_loss=2.261, ppl=4.79, wps=554663, ups=1.12, wpb=495583, bsz=16763.2, num_updates=19400, lr=0.000454077, gnorm=0.2, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=18010 epoch 012: 851 / 1689 loss=3.787, nll_loss=2.261, ppl=4.79, wps=554663, ups=1.12, wpb=495583, bsz=16763.2, num_updates=19400, lr=0.000454077, gnorm=0.2, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=18010 epoch 012: 851 / 1689 loss=3.787, nll_loss=2.261, ppl=4.79, wps=554663, ups=1.12, wpb=495583, bsz=16763.2, num_updates=19400, lr=0.000454077, gnorm=0.2, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=18010 epoch 012: 951 / 1689 loss=3.798, nll_loss=2.273, ppl=4.83, wps=552506, ups=1.12, wpb=495031, bsz=16629.3, num_updates=19500, lr=0.000452911, gnorm=0.196, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=18100 epoch 012: 951 / 1689 loss=3.798, nll_loss=2.273, ppl=4.83, wps=552506, ups=1.12, wpb=495031, bsz=16629.3, num_updates=19500, lr=0.000452911, gnorm=0.196, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=18100 epoch 012: 951 / 1689 loss=3.798, nll_loss=2.273, ppl=4.83, wps=552506, ups=1.12, wpb=495031, bsz=16629.3, num_updates=19500, lr=0.000452911, gnorm=0.196, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=18100 epoch 012: 951 / 1689 loss=3.798, nll_loss=2.273, ppl=4.83, wps=552506, ups=1.12, wpb=495031, bsz=16629.3, num_updates=19500, lr=0.000452911, gnorm=0.196, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=18100 epoch 012: 951 / 1689 loss=3.798, nll_loss=2.273, ppl=4.83, wps=552506, ups=1.12, wpb=495031, bsz=16629.3, num_updates=19500, lr=0.000452911, gnorm=0.196, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=18100 epoch 012: 951 / 1689 loss=3.798, nll_loss=2.273, ppl=4.83, wps=552506, ups=1.12, wpb=495031, bsz=16629.3, num_updates=19500, lr=0.000452911, gnorm=0.196, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=18100 epoch 012: 951 / 1689 loss=3.798, nll_loss=2.273, ppl=4.83, wps=552506, ups=1.12, wpb=495031, bsz=16629.3, num_updates=19500, lr=0.000452911, gnorm=0.196, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=18100 epoch 012: 951 / 1689 loss=3.798, nll_loss=2.273, ppl=4.83, wps=552506, ups=1.12, wpb=495031, bsz=16629.3, num_updates=19500, lr=0.000452911, gnorm=0.196, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=18100 epoch 012: 951 / 1689 loss=3.798, nll_loss=2.273, ppl=4.83, wps=552506, ups=1.12, wpb=495031, bsz=16629.3, num_updates=19500, lr=0.000452911, gnorm=0.196, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=18100 epoch 012: 951 / 1689 loss=3.798, nll_loss=2.273, ppl=4.83, wps=552506, ups=1.12, wpb=495031, bsz=16629.3, num_updates=19500, lr=0.000452911, gnorm=0.196, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=18100 epoch 012: 951 / 1689 loss=3.798, nll_loss=2.273, ppl=4.83, wps=552506, ups=1.12, wpb=495031, bsz=16629.3, num_updates=19500, lr=0.000452911, gnorm=0.196, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=18100 epoch 012: 951 / 1689 loss=3.798, nll_loss=2.273, ppl=4.83, wps=552506, ups=1.12, wpb=495031, bsz=16629.3, num_updates=19500, lr=0.000452911, gnorm=0.196, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=18100 epoch 012: 1052 / 1689 loss=3.794, nll_loss=2.268, ppl=4.82, wps=547381, ups=1.1, wpb=495575, bsz=16062.1, num_updates=19600, lr=0.000451754, gnorm=0.198, clip=0, loss_scale=4, train_wall=89, gb_free=22.6, wall=18190 epoch 012: 1052 / 1689 loss=3.794, nll_loss=2.268, ppl=4.82, wps=547381, ups=1.1, wpb=495575, bsz=16062.1, num_updates=19600, lr=0.000451754, gnorm=0.198, clip=0, loss_scale=4, train_wall=89, gb_free=22.6, wall=18190 epoch 012: 1052 / 1689 loss=3.794, nll_loss=2.268, ppl=4.82, wps=547381, ups=1.1, wpb=495575, bsz=16062.1, num_updates=19600, lr=0.000451754, gnorm=0.198, clip=0, loss_scale=4, train_wall=89, gb_free=22.6, wall=18190 epoch 012: 1052 / 1689 loss=3.794, nll_loss=2.268, ppl=4.82, wps=547381, ups=1.1, wpb=495575, bsz=16062.1, num_updates=19600, lr=0.000451754, gnorm=0.198, clip=0, loss_scale=4, train_wall=89, gb_free=22.6, wall=18190 epoch 012: 1052 / 1689 loss=3.794, nll_loss=2.268, ppl=4.82, wps=547381, ups=1.1, wpb=495575, bsz=16062.1, num_updates=19600, lr=0.000451754, gnorm=0.198, clip=0, loss_scale=4, train_wall=89, gb_free=22.6, wall=18190 epoch 012: 1052 / 1689 loss=3.794, nll_loss=2.268, ppl=4.82, wps=547381, ups=1.1, wpb=495575, bsz=16062.1, num_updates=19600, lr=0.000451754, gnorm=0.198, clip=0, loss_scale=4, train_wall=89, gb_free=22.6, wall=18190 epoch 012: 1052 / 1689 loss=3.794, nll_loss=2.268, ppl=4.82, wps=547381, ups=1.1, wpb=495575, bsz=16062.1, num_updates=19600, lr=0.000451754, gnorm=0.198, clip=0, loss_scale=4, train_wall=89, gb_free=22.6, wall=18190 epoch 012: 1052 / 1689 loss=3.794, nll_loss=2.268, ppl=4.82, wps=547381, ups=1.1, wpb=495575, bsz=16062.1, num_updates=19600, lr=0.000451754, gnorm=0.198, clip=0, loss_scale=4, train_wall=89, gb_free=22.6, wall=18190 epoch 012: 1052 / 1689 loss=3.794, nll_loss=2.268, ppl=4.82, wps=547381, ups=1.1, wpb=495575, bsz=16062.1, num_updates=19600, lr=0.000451754, gnorm=0.198, clip=0, loss_scale=4, train_wall=89, gb_free=22.6, wall=18190 epoch 012: 1052 / 1689 loss=3.794, nll_loss=2.268, ppl=4.82, wps=547381, ups=1.1, wpb=495575, bsz=16062.1, num_updates=19600, lr=0.000451754, gnorm=0.198, clip=0, loss_scale=4, train_wall=89, gb_free=22.6, wall=18190 epoch 012: 1052 / 1689 loss=3.794, nll_loss=2.268, ppl=4.82, wps=547381, ups=1.1, wpb=495575, bsz=16062.1, num_updates=19600, lr=0.000451754, gnorm=0.198, clip=0, loss_scale=4, train_wall=89, gb_free=22.6, wall=18190 epoch 012: 1052 / 1689 loss=3.794, nll_loss=2.268, ppl=4.82, wps=547381, ups=1.1, wpb=495575, bsz=16062.1, num_updates=19600, lr=0.000451754, gnorm=0.198, clip=0, loss_scale=4, train_wall=89, gb_free=22.6, wall=18190 epoch 012: 1152 / 1689 loss=3.793, nll_loss=2.268, ppl=4.82, wps=552013, ups=1.11, wpb=495843, bsz=16435.8, num_updates=19700, lr=0.000450606, gnorm=0.196, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=18280 epoch 012: 1152 / 1689 loss=3.793, nll_loss=2.268, ppl=4.82, wps=552013, ups=1.11, wpb=495843, bsz=16435.8, num_updates=19700, lr=0.000450606, gnorm=0.196, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=18280 epoch 012: 1152 / 1689 loss=3.793, nll_loss=2.268, ppl=4.82, wps=552013, ups=1.11, wpb=495843, bsz=16435.8, num_updates=19700, lr=0.000450606, gnorm=0.196, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=18280 epoch 012: 1152 / 1689 loss=3.793, nll_loss=2.268, ppl=4.82, wps=552013, ups=1.11, wpb=495843, bsz=16435.8, num_updates=19700, lr=0.000450606, gnorm=0.196, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=18280 epoch 012: 1152 / 1689 loss=3.793, nll_loss=2.268, ppl=4.82, wps=552013, ups=1.11, wpb=495843, bsz=16435.8, num_updates=19700, lr=0.000450606, gnorm=0.196, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=18280 epoch 012: 1152 / 1689 loss=3.793, nll_loss=2.268, ppl=4.82, wps=552013, ups=1.11, wpb=495843, bsz=16435.8, num_updates=19700, lr=0.000450606, gnorm=0.196, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=18280 epoch 012: 1152 / 1689 loss=3.793, nll_loss=2.268, ppl=4.82, wps=552013, ups=1.11, wpb=495843, bsz=16435.8, num_updates=19700, lr=0.000450606, gnorm=0.196, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=18280 epoch 012: 1152 / 1689 loss=3.793, nll_loss=2.268, ppl=4.82, wps=552013, ups=1.11, wpb=495843, bsz=16435.8, num_updates=19700, lr=0.000450606, gnorm=0.196, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=18280 epoch 012: 1152 / 1689 loss=3.793, nll_loss=2.268, ppl=4.82, wps=552013, ups=1.11, wpb=495843, bsz=16435.8, num_updates=19700, lr=0.000450606, gnorm=0.196, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=18280 epoch 012: 1152 / 1689 loss=3.793, nll_loss=2.268, ppl=4.82, wps=552013, ups=1.11, wpb=495843, bsz=16435.8, num_updates=19700, lr=0.000450606, gnorm=0.196, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=18280 epoch 012: 1152 / 1689 loss=3.793, nll_loss=2.268, ppl=4.82, wps=552013, ups=1.11, wpb=495843, bsz=16435.8, num_updates=19700, lr=0.000450606, gnorm=0.196, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=18280 epoch 012: 1152 / 1689 loss=3.793, nll_loss=2.268, ppl=4.82, wps=552013, ups=1.11, wpb=495843, bsz=16435.8, num_updates=19700, lr=0.000450606, gnorm=0.196, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=18280 epoch 012: 1252 / 1689 loss=3.795, nll_loss=2.269, ppl=4.82, wps=546199, ups=1.1, wpb=494580, bsz=16726.7, num_updates=19800, lr=0.000449467, gnorm=0.2, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=18371 epoch 012: 1252 / 1689 loss=3.795, nll_loss=2.269, ppl=4.82, wps=546199, ups=1.1, wpb=494580, bsz=16726.7, num_updates=19800, lr=0.000449467, gnorm=0.2, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=18371 epoch 012: 1252 / 1689 loss=3.795, nll_loss=2.269, ppl=4.82, wps=546199, ups=1.1, wpb=494580, bsz=16726.7, num_updates=19800, lr=0.000449467, gnorm=0.2, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=18371 epoch 012: 1252 / 1689 loss=3.795, nll_loss=2.269, ppl=4.82, wps=546199, ups=1.1, wpb=494580, bsz=16726.7, num_updates=19800, lr=0.000449467, gnorm=0.2, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=18371 epoch 012: 1252 / 1689 loss=3.795, nll_loss=2.269, ppl=4.82, wps=546199, ups=1.1, wpb=494580, bsz=16726.7, num_updates=19800, lr=0.000449467, gnorm=0.2, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=18371 epoch 012: 1252 / 1689 loss=3.795, nll_loss=2.269, ppl=4.82, wps=546199, ups=1.1, wpb=494580, bsz=16726.7, num_updates=19800, lr=0.000449467, gnorm=0.2, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=18371 epoch 012: 1252 / 1689 loss=3.795, nll_loss=2.269, ppl=4.82, wps=546199, ups=1.1, wpb=494580, bsz=16726.7, num_updates=19800, lr=0.000449467, gnorm=0.2, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=18371 epoch 012: 1252 / 1689 loss=3.795, nll_loss=2.269, ppl=4.82, wps=546199, ups=1.1, wpb=494580, bsz=16726.7, num_updates=19800, lr=0.000449467, gnorm=0.2, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=18371 epoch 012: 1252 / 1689 loss=3.795, nll_loss=2.269, ppl=4.82, wps=546199, ups=1.1, wpb=494580, bsz=16726.7, num_updates=19800, lr=0.000449467, gnorm=0.2, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=18371 epoch 012: 1252 / 1689 loss=3.795, nll_loss=2.269, ppl=4.82, wps=546199, ups=1.1, wpb=494580, bsz=16726.7, num_updates=19800, lr=0.000449467, gnorm=0.2, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=18371 epoch 012: 1252 / 1689 loss=3.795, nll_loss=2.269, ppl=4.82, wps=546199, ups=1.1, wpb=494580, bsz=16726.7, num_updates=19800, lr=0.000449467, gnorm=0.2, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=18371 epoch 012: 1252 / 1689 loss=3.795, nll_loss=2.269, ppl=4.82, wps=546199, ups=1.1, wpb=494580, bsz=16726.7, num_updates=19800, lr=0.000449467, gnorm=0.2, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=18371 epoch 012: 1352 / 1689 loss=3.797, nll_loss=2.272, ppl=4.83, wps=543009, ups=1.1, wpb=495122, bsz=16927.9, num_updates=19900, lr=0.000448336, gnorm=0.196, clip=0, loss_scale=4, train_wall=90, gb_free=21.1, wall=18462 epoch 012: 1352 / 1689 loss=3.797, nll_loss=2.272, ppl=4.83, wps=543009, ups=1.1, wpb=495122, bsz=16927.9, num_updates=19900, lr=0.000448336, gnorm=0.196, clip=0, loss_scale=4, train_wall=90, gb_free=21.1, wall=18462 epoch 012: 1352 / 1689 loss=3.797, nll_loss=2.272, ppl=4.83, wps=543009, ups=1.1, wpb=495122, bsz=16927.9, num_updates=19900, lr=0.000448336, gnorm=0.196, clip=0, loss_scale=4, train_wall=90, gb_free=21.1, wall=18462 epoch 012: 1352 / 1689 loss=3.797, nll_loss=2.272, ppl=4.83, wps=543009, ups=1.1, wpb=495122, bsz=16927.9, num_updates=19900, lr=0.000448336, gnorm=0.196, clip=0, loss_scale=4, train_wall=90, gb_free=21.1, wall=18462 epoch 012: 1352 / 1689 loss=3.797, nll_loss=2.272, ppl=4.83, wps=543009, ups=1.1, wpb=495122, bsz=16927.9, num_updates=19900, lr=0.000448336, gnorm=0.196, clip=0, loss_scale=4, train_wall=90, gb_free=21.1, wall=18462 epoch 012: 1352 / 1689 loss=3.797, nll_loss=2.272, ppl=4.83, wps=543009, ups=1.1, wpb=495122, bsz=16927.9, num_updates=19900, lr=0.000448336, gnorm=0.196, clip=0, loss_scale=4, train_wall=90, gb_free=21.1, wall=18462 epoch 012: 1352 / 1689 loss=3.797, nll_loss=2.272, ppl=4.83, wps=543009, ups=1.1, wpb=495122, bsz=16927.9, num_updates=19900, lr=0.000448336, gnorm=0.196, clip=0, loss_scale=4, train_wall=90, gb_free=21.1, wall=18462 epoch 012: 1352 / 1689 loss=3.797, nll_loss=2.272, ppl=4.83, wps=543009, ups=1.1, wpb=495122, bsz=16927.9, num_updates=19900, lr=0.000448336, gnorm=0.196, clip=0, loss_scale=4, train_wall=90, gb_free=21.1, wall=18462 epoch 012: 1352 / 1689 loss=3.797, nll_loss=2.272, ppl=4.83, wps=543009, ups=1.1, wpb=495122, bsz=16927.9, num_updates=19900, lr=0.000448336, gnorm=0.196, clip=0, loss_scale=4, train_wall=90, gb_free=21.1, wall=18462 epoch 012: 1352 / 1689 loss=3.797, nll_loss=2.272, ppl=4.83, wps=543009, ups=1.1, wpb=495122, bsz=16927.9, num_updates=19900, lr=0.000448336, gnorm=0.196, clip=0, loss_scale=4, train_wall=90, gb_free=21.1, wall=18462 epoch 012: 1352 / 1689 loss=3.797, nll_loss=2.272, ppl=4.83, wps=543009, ups=1.1, wpb=495122, bsz=16927.9, num_updates=19900, lr=0.000448336, gnorm=0.196, clip=0, loss_scale=4, train_wall=90, gb_free=21.1, wall=18462 epoch 012: 1352 / 1689 loss=3.797, nll_loss=2.272, ppl=4.83, wps=543009, ups=1.1, wpb=495122, bsz=16927.9, num_updates=19900, lr=0.000448336, gnorm=0.196, clip=0, loss_scale=4, train_wall=90, gb_free=21.1, wall=18462 epoch 012: 1452 / 1689 loss=3.799, nll_loss=2.274, ppl=4.84, wps=550861, ups=1.11, wpb=494593, bsz=16294.1, num_updates=20000, lr=0.000447214, gnorm=0.197, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=18552 epoch 012: 1452 / 1689 loss=3.799, nll_loss=2.274, ppl=4.84, wps=550861, ups=1.11, wpb=494593, bsz=16294.1, num_updates=20000, lr=0.000447214, gnorm=0.197, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=18552 epoch 012: 1452 / 1689 loss=3.799, nll_loss=2.274, ppl=4.84, wps=550861, ups=1.11, wpb=494593, bsz=16294.1, num_updates=20000, lr=0.000447214, gnorm=0.197, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=18552 epoch 012: 1452 / 1689 loss=3.799, nll_loss=2.274, ppl=4.84, wps=550861, ups=1.11, wpb=494593, bsz=16294.1, num_updates=20000, lr=0.000447214, gnorm=0.197, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=18552 epoch 012: 1452 / 1689 loss=3.799, nll_loss=2.274, ppl=4.84, wps=550861, ups=1.11, wpb=494593, bsz=16294.1, num_updates=20000, lr=0.000447214, gnorm=0.197, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=18552 epoch 012: 1452 / 1689 loss=3.799, nll_loss=2.274, ppl=4.84, wps=550861, ups=1.11, wpb=494593, bsz=16294.1, num_updates=20000, lr=0.000447214, gnorm=0.197, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=18552 epoch 012: 1452 / 1689 loss=3.799, nll_loss=2.274, ppl=4.84, wps=550861, ups=1.11, wpb=494593, bsz=16294.1, num_updates=20000, lr=0.000447214, gnorm=0.197, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=18552 epoch 012: 1452 / 1689 loss=3.799, nll_loss=2.274, ppl=4.84, wps=550861, ups=1.11, wpb=494593, bsz=16294.1, num_updates=20000, lr=0.000447214, gnorm=0.197, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=18552 epoch 012: 1452 / 1689 loss=3.799, nll_loss=2.274, ppl=4.84, wps=550861, ups=1.11, wpb=494593, bsz=16294.1, num_updates=20000, lr=0.000447214, gnorm=0.197, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=18552 epoch 012: 1452 / 1689 loss=3.799, nll_loss=2.274, ppl=4.84, wps=550861, ups=1.11, wpb=494593, bsz=16294.1, num_updates=20000, lr=0.000447214, gnorm=0.197, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=18552 epoch 012: 1452 / 1689 loss=3.799, nll_loss=2.274, ppl=4.84, wps=550861, ups=1.11, wpb=494593, bsz=16294.1, num_updates=20000, lr=0.000447214, gnorm=0.197, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=18552 epoch 012: 1452 / 1689 loss=3.799, nll_loss=2.274, ppl=4.84, wps=550861, ups=1.11, wpb=494593, bsz=16294.1, num_updates=20000, lr=0.000447214, gnorm=0.197, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=18552 begin validation on "valid" subset epoch 012 | valid on 'valid' subset | loss 3.778 | nll_loss 2.211 | ppl 4.63 | wps 0 | wpb 44526 | bsz 2008 | num_updates 20000 | best_loss 3.778 epoch 012 | valid on 'valid' subset | loss 3.778 | nll_loss 2.211 | ppl 4.63 | wps 0 | wpb 44526 | bsz 2008 | num_updates 20000 | best_loss 3.778 epoch 012 | valid on 'valid' subset | loss 3.778 | nll_loss 2.211 | ppl 4.63 | wps 0 | wpb 44526 | bsz 2008 | num_updates 20000 | best_loss 3.778 epoch 012 | valid on 'valid' subset | loss 3.778 | nll_loss 2.211 | ppl 4.63 | wps 0 | wpb 44526 | bsz 2008 | num_updates 20000 | best_loss 3.778 epoch 012 | valid on 'valid' subset | loss 3.778 | nll_loss 2.211 | ppl 4.63 | wps 0 | wpb 44526 | bsz 2008 | num_updates 20000 | best_loss 3.778 epoch 012 | valid on 'valid' subset | loss 3.778 | nll_loss 2.211 | ppl 4.63 | wps 0 | wpb 44526 | bsz 2008 | num_updates 20000 | best_loss 3.778 epoch 012 | valid on 'valid' subset | loss 3.778 | nll_loss 2.211 | ppl 4.63 | wps 0 | wpb 44526 | bsz 2008 | num_updates 20000 | best_loss 3.778 epoch 012 | valid on 'valid' subset | loss 3.778 | nll_loss 2.211 | ppl 4.63 | wps 0 | wpb 44526 | bsz 2008 | num_updates 20000 | best_loss 3.778 epoch 012 | valid on 'valid' subset | loss 3.778 | nll_loss 2.211 | ppl 4.63 | wps 0 | wpb 44526 | bsz 2008 | num_updates 20000 | best_loss 3.778 epoch 012 | valid on 'valid' subset | loss 3.778 | nll_loss 2.211 | ppl 4.63 | wps 0 | wpb 44526 | bsz 2008 | num_updates 20000 | best_loss 3.778 epoch 012 | valid on 'valid' subset | loss 3.778 | nll_loss 2.211 | ppl 4.63 | wps 0 | wpb 44526 | bsz 2008 | num_updates 20000 | best_loss 3.778 epoch 012 | valid on 'valid' subset | loss 3.778 | nll_loss 2.211 | ppl 4.63 | wps 0 | wpb 44526 | bsz 2008 | num_updates 20000 | best_loss 3.778 epoch 012: 1553 / 1689 loss=3.793, nll_loss=2.268, ppl=4.82, wps=414044, ups=0.84, wpb=495389, bsz=16546.2, num_updates=20100, lr=0.0004461, gnorm=0.189, clip=0, loss_scale=4, train_wall=94, gb_free=21.5, wall=18671 epoch 012: 1553 / 1689 loss=3.793, nll_loss=2.268, ppl=4.82, wps=414044, ups=0.84, wpb=495389, bsz=16546.2, num_updates=20100, lr=0.0004461, gnorm=0.189, clip=0, loss_scale=4, train_wall=94, gb_free=21.5, wall=18671 epoch 012: 1553 / 1689 loss=3.793, nll_loss=2.268, ppl=4.82, wps=414044, ups=0.84, wpb=495389, bsz=16546.2, num_updates=20100, lr=0.0004461, gnorm=0.189, clip=0, loss_scale=4, train_wall=94, gb_free=21.5, wall=18671 epoch 012: 1553 / 1689 loss=3.793, nll_loss=2.268, ppl=4.82, wps=414044, ups=0.84, wpb=495389, bsz=16546.2, num_updates=20100, lr=0.0004461, gnorm=0.189, clip=0, loss_scale=4, train_wall=94, gb_free=21.5, wall=18671 epoch 012: 1553 / 1689 loss=3.793, nll_loss=2.268, ppl=4.82, wps=414044, ups=0.84, wpb=495389, bsz=16546.2, num_updates=20100, lr=0.0004461, gnorm=0.189, clip=0, loss_scale=4, train_wall=94, gb_free=21.5, wall=18671 epoch 012: 1553 / 1689 loss=3.793, nll_loss=2.268, ppl=4.82, wps=414044, ups=0.84, wpb=495389, bsz=16546.2, num_updates=20100, lr=0.0004461, gnorm=0.189, clip=0, loss_scale=4, train_wall=94, gb_free=21.5, wall=18671 epoch 012: 1553 / 1689 loss=3.793, nll_loss=2.268, ppl=4.82, wps=414044, ups=0.84, wpb=495389, bsz=16546.2, num_updates=20100, lr=0.0004461, gnorm=0.189, clip=0, loss_scale=4, train_wall=94, gb_free=21.5, wall=18671 epoch 012: 1553 / 1689 loss=3.793, nll_loss=2.268, ppl=4.82, wps=414044, ups=0.84, wpb=495389, bsz=16546.2, num_updates=20100, lr=0.0004461, gnorm=0.189, clip=0, loss_scale=4, train_wall=94, gb_free=21.5, wall=18671 epoch 012: 1553 / 1689 loss=3.793, nll_loss=2.268, ppl=4.82, wps=414044, ups=0.84, wpb=495389, bsz=16546.2, num_updates=20100, lr=0.0004461, gnorm=0.189, clip=0, loss_scale=4, train_wall=94, gb_free=21.5, wall=18671 epoch 012: 1553 / 1689 loss=3.793, nll_loss=2.268, ppl=4.82, wps=414044, ups=0.84, wpb=495389, bsz=16546.2, num_updates=20100, lr=0.0004461, gnorm=0.189, clip=0, loss_scale=4, train_wall=94, gb_free=21.5, wall=18671 epoch 012: 1553 / 1689 loss=3.793, nll_loss=2.268, ppl=4.82, wps=414044, ups=0.84, wpb=495389, bsz=16546.2, num_updates=20100, lr=0.0004461, gnorm=0.189, clip=0, loss_scale=4, train_wall=94, gb_free=21.5, wall=18671 epoch 012: 1553 / 1689 loss=3.793, nll_loss=2.268, ppl=4.82, wps=414044, ups=0.84, wpb=495389, bsz=16546.2, num_updates=20100, lr=0.0004461, gnorm=0.189, clip=0, loss_scale=4, train_wall=94, gb_free=21.5, wall=18671 epoch 012: 1653 / 1689 loss=3.796, nll_loss=2.271, ppl=4.83, wps=547832, ups=1.1, wpb=496016, bsz=16542, num_updates=20200, lr=0.000444994, gnorm=0.203, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=18762 epoch 012: 1653 / 1689 loss=3.796, nll_loss=2.271, ppl=4.83, wps=547832, ups=1.1, wpb=496016, bsz=16542, num_updates=20200, lr=0.000444994, gnorm=0.203, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=18762 epoch 012: 1653 / 1689 loss=3.796, nll_loss=2.271, ppl=4.83, wps=547832, ups=1.1, wpb=496016, bsz=16542, num_updates=20200, lr=0.000444994, gnorm=0.203, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=18762 epoch 012: 1653 / 1689 loss=3.796, nll_loss=2.271, ppl=4.83, wps=547832, ups=1.1, wpb=496016, bsz=16542, num_updates=20200, lr=0.000444994, gnorm=0.203, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=18762 epoch 012: 1653 / 1689 loss=3.796, nll_loss=2.271, ppl=4.83, wps=547832, ups=1.1, wpb=496016, bsz=16542, num_updates=20200, lr=0.000444994, gnorm=0.203, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=18762 epoch 012: 1653 / 1689 loss=3.796, nll_loss=2.271, ppl=4.83, wps=547832, ups=1.1, wpb=496016, bsz=16542, num_updates=20200, lr=0.000444994, gnorm=0.203, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=18762 epoch 012: 1653 / 1689 loss=3.796, nll_loss=2.271, ppl=4.83, wps=547832, ups=1.1, wpb=496016, bsz=16542, num_updates=20200, lr=0.000444994, gnorm=0.203, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=18762 epoch 012: 1653 / 1689 loss=3.796, nll_loss=2.271, ppl=4.83, wps=547832, ups=1.1, wpb=496016, bsz=16542, num_updates=20200, lr=0.000444994, gnorm=0.203, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=18762 epoch 012: 1653 / 1689 loss=3.796, nll_loss=2.271, ppl=4.83, wps=547832, ups=1.1, wpb=496016, bsz=16542, num_updates=20200, lr=0.000444994, gnorm=0.203, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=18762 epoch 012: 1653 / 1689 loss=3.796, nll_loss=2.271, ppl=4.83, wps=547832, ups=1.1, wpb=496016, bsz=16542, num_updates=20200, lr=0.000444994, gnorm=0.203, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=18762 epoch 012: 1653 / 1689 loss=3.796, nll_loss=2.271, ppl=4.83, wps=547832, ups=1.1, wpb=496016, bsz=16542, num_updates=20200, lr=0.000444994, gnorm=0.203, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=18762 epoch 012: 1653 / 1689 loss=3.796, nll_loss=2.271, ppl=4.83, wps=547832, ups=1.1, wpb=496016, bsz=16542, num_updates=20200, lr=0.000444994, gnorm=0.203, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=18762 end of epoch 12 (average epoch stats below) epoch 012 | loss 3.793 | nll_loss 2.268 | ppl 4.81 | wps 525864 | ups 1.06 | wpb 495108 | bsz 16506.5 | num_updates 20236 | lr 0.000444598 | gnorm 0.198 | clip 0 | loss_scale 4 | train_wall 1503 | gb_free 23 | wall 18794 epoch 012 | loss 3.793 | nll_loss 2.268 | ppl 4.81 | wps 525864 | ups 1.06 | wpb 495108 | bsz 16506.5 | num_updates 20236 | lr 0.000444598 | gnorm 0.198 | clip 0 | loss_scale 4 | train_wall 1503 | gb_free 23 | wall 18794 epoch 012 | loss 3.793 | nll_loss 2.268 | ppl 4.81 | wps 525864 | ups 1.06 | wpb 495108 | bsz 16506.5 | num_updates 20236 | lr 0.000444598 | gnorm 0.198 | clip 0 | loss_scale 4 | train_wall 1503 | gb_free 23 | wall 18794 epoch 012 | loss 3.793 | nll_loss 2.268 | ppl 4.81 | wps 525864 | ups 1.06 | wpb 495108 | bsz 16506.5 | num_updates 20236 | lr 0.000444598 | gnorm 0.198 | clip 0 | loss_scale 4 | train_wall 1503 | gb_free 23 | wall 18794 epoch 012 | loss 3.793 | nll_loss 2.268 | ppl 4.81 | wps 525864 | ups 1.06 | wpb 495108 | bsz 16506.5 | num_updates 20236 | lr 0.000444598 | gnorm 0.198 | clip 0 | loss_scale 4 | train_wall 1503 | gb_free 23 | wall 18794 epoch 012 | loss 3.793 | nll_loss 2.268 | ppl 4.81 | wps 525864 | ups 1.06 | wpb 495108 | bsz 16506.5 | num_updates 20236 | lr 0.000444598 | gnorm 0.198 | clip 0 | loss_scale 4 | train_wall 1503 | gb_free 23 | wall 18794 epoch 012 | loss 3.793 | nll_loss 2.268 | ppl 4.81 | wps 525864 | ups 1.06 | wpb 495108 | bsz 16506.5 | num_updates 20236 | lr 0.000444598 | gnorm 0.198 | clip 0 | loss_scale 4 | train_wall 1503 | gb_free 23 | wall 18794 epoch 012 | loss 3.793 | nll_loss 2.268 | ppl 4.81 | wps 525864 | ups 1.06 | wpb 495108 | bsz 16506.5 | num_updates 20236 | lr 0.000444598 | gnorm 0.198 | clip 0 | loss_scale 4 | train_wall 1503 | gb_free 23 | wall 18794 epoch 012 | loss 3.793 | nll_loss 2.268 | ppl 4.81 | wps 525864 | ups 1.06 | wpb 495108 | bsz 16506.5 | num_updates 20236 | lr 0.000444598 | gnorm 0.198 | clip 0 | loss_scale 4 | train_wall 1503 | gb_free 23 | wall 18794 epoch 012 | loss 3.793 | nll_loss 2.268 | ppl 4.81 | wps 525864 | ups 1.06 | wpb 495108 | bsz 16506.5 | num_updates 20236 | lr 0.000444598 | gnorm 0.198 | clip 0 | loss_scale 4 | train_wall 1503 | gb_free 23 | wall 18794 epoch 012 | loss 3.793 | nll_loss 2.268 | ppl 4.81 | wps 525864 | ups 1.06 | wpb 495108 | bsz 16506.5 | num_updates 20236 | lr 0.000444598 | gnorm 0.198 | clip 0 | loss_scale 4 | train_wall 1503 | gb_free 23 | wall 18794 epoch 012 | loss 3.793 | nll_loss 2.268 | ppl 4.81 | wps 525864 | ups 1.06 | wpb 495108 | bsz 16506.5 | num_updates 20236 | lr 0.000444598 | gnorm 0.198 | clip 0 | loss_scale 4 | train_wall 1503 | gb_free 23 | wall 18794 Start iterating over samples epoch 013: 64 / 1689 loss=3.779, nll_loss=2.251, ppl=4.76, wps=550985, ups=1.12, wpb=491894, bsz=16094.7, num_updates=20300, lr=0.000443897, gnorm=0.207, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=18851 epoch 013: 64 / 1689 loss=3.779, nll_loss=2.251, ppl=4.76, wps=550985, ups=1.12, wpb=491894, bsz=16094.7, num_updates=20300, lr=0.000443897, gnorm=0.207, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=18851 epoch 013: 64 / 1689 loss=3.779, nll_loss=2.251, ppl=4.76, wps=550985, ups=1.12, wpb=491894, bsz=16094.7, num_updates=20300, lr=0.000443897, gnorm=0.207, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=18851 epoch 013: 64 / 1689 loss=3.779, nll_loss=2.251, ppl=4.76, wps=550985, ups=1.12, wpb=491894, bsz=16094.7, num_updates=20300, lr=0.000443897, gnorm=0.207, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=18851 epoch 013: 64 / 1689 loss=3.779, nll_loss=2.251, ppl=4.76, wps=550985, ups=1.12, wpb=491894, bsz=16094.7, num_updates=20300, lr=0.000443897, gnorm=0.207, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=18851 epoch 013: 64 / 1689 loss=3.779, nll_loss=2.251, ppl=4.76, wps=550985, ups=1.12, wpb=491894, bsz=16094.7, num_updates=20300, lr=0.000443897, gnorm=0.207, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=18851 epoch 013: 64 / 1689 loss=3.779, nll_loss=2.251, ppl=4.76, wps=550985, ups=1.12, wpb=491894, bsz=16094.7, num_updates=20300, lr=0.000443897, gnorm=0.207, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=18851 epoch 013: 64 / 1689 loss=3.779, nll_loss=2.251, ppl=4.76, wps=550985, ups=1.12, wpb=491894, bsz=16094.7, num_updates=20300, lr=0.000443897, gnorm=0.207, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=18851 epoch 013: 64 / 1689 loss=3.779, nll_loss=2.251, ppl=4.76, wps=550985, ups=1.12, wpb=491894, bsz=16094.7, num_updates=20300, lr=0.000443897, gnorm=0.207, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=18851 epoch 013: 64 / 1689 loss=3.779, nll_loss=2.251, ppl=4.76, wps=550985, ups=1.12, wpb=491894, bsz=16094.7, num_updates=20300, lr=0.000443897, gnorm=0.207, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=18851 epoch 013: 64 / 1689 loss=3.779, nll_loss=2.251, ppl=4.76, wps=550985, ups=1.12, wpb=491894, bsz=16094.7, num_updates=20300, lr=0.000443897, gnorm=0.207, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=18851 epoch 013: 64 / 1689 loss=3.779, nll_loss=2.251, ppl=4.76, wps=550985, ups=1.12, wpb=491894, bsz=16094.7, num_updates=20300, lr=0.000443897, gnorm=0.207, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=18851 epoch 013: 64 / 1689 loss=3.779, nll_loss=2.251, ppl=4.76, wps=550985, ups=1.12, wpb=491894, bsz=16094.7, num_updates=20300, lr=0.000443897, gnorm=0.207, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=18851 epoch 013: 165 / 1689 loss=3.773, nll_loss=2.244, ppl=4.74, wps=547414, ups=1.11, wpb=495025, bsz=16315.4, num_updates=20400, lr=0.000442807, gnorm=0.206, clip=0, loss_scale=2, train_wall=90, gb_free=21.3, wall=18942 epoch 013: 165 / 1689 loss=3.773, nll_loss=2.244, ppl=4.74, wps=547414, ups=1.11, wpb=495025, bsz=16315.4, num_updates=20400, lr=0.000442807, gnorm=0.206, clip=0, loss_scale=2, train_wall=90, gb_free=21.3, wall=18942 epoch 013: 165 / 1689 loss=3.773, nll_loss=2.244, ppl=4.74, wps=547414, ups=1.11, wpb=495025, bsz=16315.4, num_updates=20400, lr=0.000442807, gnorm=0.206, clip=0, loss_scale=2, train_wall=90, gb_free=21.3, wall=18942 epoch 013: 165 / 1689 loss=3.773, nll_loss=2.244, ppl=4.74, wps=547414, ups=1.11, wpb=495025, bsz=16315.4, num_updates=20400, lr=0.000442807, gnorm=0.206, clip=0, loss_scale=2, train_wall=90, gb_free=21.3, wall=18942 epoch 013: 165 / 1689 loss=3.773, nll_loss=2.244, ppl=4.74, wps=547414, ups=1.11, wpb=495025, bsz=16315.4, num_updates=20400, lr=0.000442807, gnorm=0.206, clip=0, loss_scale=2, train_wall=90, gb_free=21.3, wall=18942 epoch 013: 165 / 1689 loss=3.773, nll_loss=2.244, ppl=4.74, wps=547414, ups=1.11, wpb=495025, bsz=16315.4, num_updates=20400, lr=0.000442807, gnorm=0.206, clip=0, loss_scale=2, train_wall=90, gb_free=21.3, wall=18942 epoch 013: 165 / 1689 loss=3.773, nll_loss=2.244, ppl=4.74, wps=547414, ups=1.11, wpb=495025, bsz=16315.4, num_updates=20400, lr=0.000442807, gnorm=0.206, clip=0, loss_scale=2, train_wall=90, gb_free=21.3, wall=18942 epoch 013: 165 / 1689 loss=3.773, nll_loss=2.244, ppl=4.74, wps=547414, ups=1.11, wpb=495025, bsz=16315.4, num_updates=20400, lr=0.000442807, gnorm=0.206, clip=0, loss_scale=2, train_wall=90, gb_free=21.3, wall=18942 epoch 013: 165 / 1689 loss=3.773, nll_loss=2.244, ppl=4.74, wps=547414, ups=1.11, wpb=495025, bsz=16315.4, num_updates=20400, lr=0.000442807, gnorm=0.206, clip=0, loss_scale=2, train_wall=90, gb_free=21.3, wall=18942 epoch 013: 165 / 1689 loss=3.773, nll_loss=2.244, ppl=4.74, wps=547414, ups=1.11, wpb=495025, bsz=16315.4, num_updates=20400, lr=0.000442807, gnorm=0.206, clip=0, loss_scale=2, train_wall=90, gb_free=21.3, wall=18942 epoch 013: 165 / 1689 loss=3.773, nll_loss=2.244, ppl=4.74, wps=547414, ups=1.11, wpb=495025, bsz=16315.4, num_updates=20400, lr=0.000442807, gnorm=0.206, clip=0, loss_scale=2, train_wall=90, gb_free=21.3, wall=18942 epoch 013: 165 / 1689 loss=3.773, nll_loss=2.244, ppl=4.74, wps=547414, ups=1.11, wpb=495025, bsz=16315.4, num_updates=20400, lr=0.000442807, gnorm=0.206, clip=0, loss_scale=2, train_wall=90, gb_free=21.3, wall=18942 epoch 013: 165 / 1689 loss=3.773, nll_loss=2.244, ppl=4.74, wps=547414, ups=1.11, wpb=495025, bsz=16315.4, num_updates=20400, lr=0.000442807, gnorm=0.206, clip=0, loss_scale=2, train_wall=90, gb_free=21.3, wall=18942 epoch 013: 265 / 1689 loss=3.773, nll_loss=2.245, ppl=4.74, wps=554876, ups=1.12, wpb=494941, bsz=16508.9, num_updates=20500, lr=0.000441726, gnorm=0.189, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=19031 epoch 013: 265 / 1689 loss=3.773, nll_loss=2.245, ppl=4.74, wps=554876, ups=1.12, wpb=494941, bsz=16508.9, num_updates=20500, lr=0.000441726, gnorm=0.189, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=19031 epoch 013: 265 / 1689 loss=3.773, nll_loss=2.245, ppl=4.74, wps=554876, ups=1.12, wpb=494941, bsz=16508.9, num_updates=20500, lr=0.000441726, gnorm=0.189, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=19031 epoch 013: 265 / 1689 loss=3.773, nll_loss=2.245, ppl=4.74, wps=554876, ups=1.12, wpb=494941, bsz=16508.9, num_updates=20500, lr=0.000441726, gnorm=0.189, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=19031 epoch 013: 265 / 1689 loss=3.773, nll_loss=2.245, ppl=4.74, wps=554876, ups=1.12, wpb=494941, bsz=16508.9, num_updates=20500, lr=0.000441726, gnorm=0.189, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=19031 epoch 013: 265 / 1689 loss=3.773, nll_loss=2.245, ppl=4.74, wps=554876, ups=1.12, wpb=494941, bsz=16508.9, num_updates=20500, lr=0.000441726, gnorm=0.189, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=19031 epoch 013: 265 / 1689 loss=3.773, nll_loss=2.245, ppl=4.74, wps=554876, ups=1.12, wpb=494941, bsz=16508.9, num_updates=20500, lr=0.000441726, gnorm=0.189, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=19031 epoch 013: 265 / 1689 loss=3.773, nll_loss=2.245, ppl=4.74, wps=554876, ups=1.12, wpb=494941, bsz=16508.9, num_updates=20500, lr=0.000441726, gnorm=0.189, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=19031 epoch 013: 265 / 1689 loss=3.773, nll_loss=2.245, ppl=4.74, wps=554876, ups=1.12, wpb=494941, bsz=16508.9, num_updates=20500, lr=0.000441726, gnorm=0.189, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=19031 epoch 013: 265 / 1689 loss=3.773, nll_loss=2.245, ppl=4.74, wps=554876, ups=1.12, wpb=494941, bsz=16508.9, num_updates=20500, lr=0.000441726, gnorm=0.189, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=19031 epoch 013: 265 / 1689 loss=3.773, nll_loss=2.245, ppl=4.74, wps=554876, ups=1.12, wpb=494941, bsz=16508.9, num_updates=20500, lr=0.000441726, gnorm=0.189, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=19031 epoch 013: 265 / 1689 loss=3.773, nll_loss=2.245, ppl=4.74, wps=554876, ups=1.12, wpb=494941, bsz=16508.9, num_updates=20500, lr=0.000441726, gnorm=0.189, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=19031 epoch 013: 265 / 1689 loss=3.773, nll_loss=2.245, ppl=4.74, wps=554876, ups=1.12, wpb=494941, bsz=16508.9, num_updates=20500, lr=0.000441726, gnorm=0.189, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=19031 epoch 013: 365 / 1689 loss=3.777, nll_loss=2.249, ppl=4.75, wps=553056, ups=1.12, wpb=494408, bsz=16821.3, num_updates=20600, lr=0.000440653, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=19120 epoch 013: 365 / 1689 loss=3.777, nll_loss=2.249, ppl=4.75, wps=553056, ups=1.12, wpb=494408, bsz=16821.3, num_updates=20600, lr=0.000440653, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=19120 epoch 013: 365 / 1689 loss=3.777, nll_loss=2.249, ppl=4.75, wps=553056, ups=1.12, wpb=494408, bsz=16821.3, num_updates=20600, lr=0.000440653, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=19120 epoch 013: 365 / 1689 loss=3.777, nll_loss=2.249, ppl=4.75, wps=553056, ups=1.12, wpb=494408, bsz=16821.3, num_updates=20600, lr=0.000440653, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=19120 epoch 013: 365 / 1689 loss=3.777, nll_loss=2.249, ppl=4.75, wps=553056, ups=1.12, wpb=494408, bsz=16821.3, num_updates=20600, lr=0.000440653, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=19120 epoch 013: 365 / 1689 loss=3.777, nll_loss=2.249, ppl=4.75, wps=553056, ups=1.12, wpb=494408, bsz=16821.3, num_updates=20600, lr=0.000440653, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=19120 epoch 013: 365 / 1689 loss=3.777, nll_loss=2.249, ppl=4.75, wps=553056, ups=1.12, wpb=494408, bsz=16821.3, num_updates=20600, lr=0.000440653, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=19120 epoch 013: 365 / 1689 loss=3.777, nll_loss=2.249, ppl=4.75, wps=553056, ups=1.12, wpb=494408, bsz=16821.3, num_updates=20600, lr=0.000440653, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=19120 epoch 013: 365 / 1689 loss=3.777, nll_loss=2.249, ppl=4.75, wps=553056, ups=1.12, wpb=494408, bsz=16821.3, num_updates=20600, lr=0.000440653, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=19120 epoch 013: 365 / 1689 loss=3.777, nll_loss=2.249, ppl=4.75, wps=553056, ups=1.12, wpb=494408, bsz=16821.3, num_updates=20600, lr=0.000440653, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=19120 epoch 013: 365 / 1689 loss=3.777, nll_loss=2.249, ppl=4.75, wps=553056, ups=1.12, wpb=494408, bsz=16821.3, num_updates=20600, lr=0.000440653, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=19120 epoch 013: 365 / 1689 loss=3.777, nll_loss=2.249, ppl=4.75, wps=553056, ups=1.12, wpb=494408, bsz=16821.3, num_updates=20600, lr=0.000440653, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=19120 epoch 013: 365 / 1689 loss=3.777, nll_loss=2.249, ppl=4.75, wps=553056, ups=1.12, wpb=494408, bsz=16821.3, num_updates=20600, lr=0.000440653, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=19120 epoch 013: 465 / 1689 loss=3.786, nll_loss=2.26, ppl=4.79, wps=553136, ups=1.12, wpb=494060, bsz=16524, num_updates=20700, lr=0.000439587, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=19209 epoch 013: 465 / 1689 loss=3.786, nll_loss=2.26, ppl=4.79, wps=553136, ups=1.12, wpb=494060, bsz=16524, num_updates=20700, lr=0.000439587, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=19209 epoch 013: 465 / 1689 loss=3.786, nll_loss=2.26, ppl=4.79, wps=553136, ups=1.12, wpb=494060, bsz=16524, num_updates=20700, lr=0.000439587, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=19209 epoch 013: 465 / 1689 loss=3.786, nll_loss=2.26, ppl=4.79, wps=553136, ups=1.12, wpb=494060, bsz=16524, num_updates=20700, lr=0.000439587, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=19209 epoch 013: 465 / 1689 loss=3.786, nll_loss=2.26, ppl=4.79, wps=553136, ups=1.12, wpb=494060, bsz=16524, num_updates=20700, lr=0.000439587, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=19209 epoch 013: 465 / 1689 loss=3.786, nll_loss=2.26, ppl=4.79, wps=553136, ups=1.12, wpb=494060, bsz=16524, num_updates=20700, lr=0.000439587, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=19209 epoch 013: 465 / 1689 loss=3.786, nll_loss=2.26, ppl=4.79, wps=553136, ups=1.12, wpb=494060, bsz=16524, num_updates=20700, lr=0.000439587, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=19209 epoch 013: 465 / 1689 loss=3.786, nll_loss=2.26, ppl=4.79, wps=553136, ups=1.12, wpb=494060, bsz=16524, num_updates=20700, lr=0.000439587, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=19209 epoch 013: 465 / 1689 loss=3.786, nll_loss=2.26, ppl=4.79, wps=553136, ups=1.12, wpb=494060, bsz=16524, num_updates=20700, lr=0.000439587, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=19209 epoch 013: 465 / 1689 loss=3.786, nll_loss=2.26, ppl=4.79, wps=553136, ups=1.12, wpb=494060, bsz=16524, num_updates=20700, lr=0.000439587, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=19209 epoch 013: 465 / 1689 loss=3.786, nll_loss=2.26, ppl=4.79, wps=553136, ups=1.12, wpb=494060, bsz=16524, num_updates=20700, lr=0.000439587, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=19209 epoch 013: 465 / 1689 loss=3.786, nll_loss=2.26, ppl=4.79, wps=553136, ups=1.12, wpb=494060, bsz=16524, num_updates=20700, lr=0.000439587, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=19209 epoch 013: 465 / 1689 loss=3.786, nll_loss=2.26, ppl=4.79, wps=553136, ups=1.12, wpb=494060, bsz=16524, num_updates=20700, lr=0.000439587, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=22.3, wall=19209 epoch 013: 565 / 1689 loss=3.782, nll_loss=2.255, ppl=4.77, wps=558655, ups=1.13, wpb=495554, bsz=16086.1, num_updates=20800, lr=0.000438529, gnorm=0.185, clip=0, loss_scale=2, train_wall=88, gb_free=21, wall=19298 epoch 013: 565 / 1689 loss=3.782, nll_loss=2.255, ppl=4.77, wps=558655, ups=1.13, wpb=495554, bsz=16086.1, num_updates=20800, lr=0.000438529, gnorm=0.185, clip=0, loss_scale=2, train_wall=88, gb_free=21, wall=19298 epoch 013: 565 / 1689 loss=3.782, nll_loss=2.255, ppl=4.77, wps=558655, ups=1.13, wpb=495554, bsz=16086.1, num_updates=20800, lr=0.000438529, gnorm=0.185, clip=0, loss_scale=2, train_wall=88, gb_free=21, wall=19298 epoch 013: 565 / 1689 loss=3.782, nll_loss=2.255, ppl=4.77, wps=558655, ups=1.13, wpb=495554, bsz=16086.1, num_updates=20800, lr=0.000438529, gnorm=0.185, clip=0, loss_scale=2, train_wall=88, gb_free=21, wall=19298 epoch 013: 565 / 1689 loss=3.782, nll_loss=2.255, ppl=4.77, wps=558655, ups=1.13, wpb=495554, bsz=16086.1, num_updates=20800, lr=0.000438529, gnorm=0.185, clip=0, loss_scale=2, train_wall=88, gb_free=21, wall=19298 epoch 013: 565 / 1689 loss=3.782, nll_loss=2.255, ppl=4.77, wps=558655, ups=1.13, wpb=495554, bsz=16086.1, num_updates=20800, lr=0.000438529, gnorm=0.185, clip=0, loss_scale=2, train_wall=88, gb_free=21, wall=19298 epoch 013: 565 / 1689 loss=3.782, nll_loss=2.255, ppl=4.77, wps=558655, ups=1.13, wpb=495554, bsz=16086.1, num_updates=20800, lr=0.000438529, gnorm=0.185, clip=0, loss_scale=2, train_wall=88, gb_free=21, wall=19298 epoch 013: 565 / 1689 loss=3.782, nll_loss=2.255, ppl=4.77, wps=558655, ups=1.13, wpb=495554, bsz=16086.1, num_updates=20800, lr=0.000438529, gnorm=0.185, clip=0, loss_scale=2, train_wall=88, gb_free=21, wall=19298 epoch 013: 565 / 1689 loss=3.782, nll_loss=2.255, ppl=4.77, wps=558655, ups=1.13, wpb=495554, bsz=16086.1, num_updates=20800, lr=0.000438529, gnorm=0.185, clip=0, loss_scale=2, train_wall=88, gb_free=21, wall=19298 epoch 013: 565 / 1689 loss=3.782, nll_loss=2.255, ppl=4.77, wps=558655, ups=1.13, wpb=495554, bsz=16086.1, num_updates=20800, lr=0.000438529, gnorm=0.185, clip=0, loss_scale=2, train_wall=88, gb_free=21, wall=19298 epoch 013: 565 / 1689 loss=3.782, nll_loss=2.255, ppl=4.77, wps=558655, ups=1.13, wpb=495554, bsz=16086.1, num_updates=20800, lr=0.000438529, gnorm=0.185, clip=0, loss_scale=2, train_wall=88, gb_free=21, wall=19298 epoch 013: 565 / 1689 loss=3.782, nll_loss=2.255, ppl=4.77, wps=558655, ups=1.13, wpb=495554, bsz=16086.1, num_updates=20800, lr=0.000438529, gnorm=0.185, clip=0, loss_scale=2, train_wall=88, gb_free=21, wall=19298 epoch 013: 565 / 1689 loss=3.782, nll_loss=2.255, ppl=4.77, wps=558655, ups=1.13, wpb=495554, bsz=16086.1, num_updates=20800, lr=0.000438529, gnorm=0.185, clip=0, loss_scale=2, train_wall=88, gb_free=21, wall=19298 epoch 013: 665 / 1689 loss=3.782, nll_loss=2.255, ppl=4.77, wps=550006, ups=1.11, wpb=496172, bsz=17010.2, num_updates=20900, lr=0.000437479, gnorm=0.193, clip=0, loss_scale=4, train_wall=89, gb_free=22.3, wall=19388 epoch 013: 665 / 1689 loss=3.782, nll_loss=2.255, ppl=4.77, wps=550006, ups=1.11, wpb=496172, bsz=17010.2, num_updates=20900, lr=0.000437479, gnorm=0.193, clip=0, loss_scale=4, train_wall=89, gb_free=22.3, wall=19388 epoch 013: 665 / 1689 loss=3.782, nll_loss=2.255, ppl=4.77, wps=550006, ups=1.11, wpb=496172, bsz=17010.2, num_updates=20900, lr=0.000437479, gnorm=0.193, clip=0, loss_scale=4, train_wall=89, gb_free=22.3, wall=19388 epoch 013: 665 / 1689 loss=3.782, nll_loss=2.255, ppl=4.77, wps=550006, ups=1.11, wpb=496172, bsz=17010.2, num_updates=20900, lr=0.000437479, gnorm=0.193, clip=0, loss_scale=4, train_wall=89, gb_free=22.3, wall=19388 epoch 013: 665 / 1689 loss=3.782, nll_loss=2.255, ppl=4.77, wps=550006, ups=1.11, wpb=496172, bsz=17010.2, num_updates=20900, lr=0.000437479, gnorm=0.193, clip=0, loss_scale=4, train_wall=89, gb_free=22.3, wall=19388 epoch 013: 665 / 1689 loss=3.782, nll_loss=2.255, ppl=4.77, wps=550006, ups=1.11, wpb=496172, bsz=17010.2, num_updates=20900, lr=0.000437479, gnorm=0.193, clip=0, loss_scale=4, train_wall=89, gb_free=22.3, wall=19388 epoch 013: 665 / 1689 loss=3.782, nll_loss=2.255, ppl=4.77, wps=550006, ups=1.11, wpb=496172, bsz=17010.2, num_updates=20900, lr=0.000437479, gnorm=0.193, clip=0, loss_scale=4, train_wall=89, gb_free=22.3, wall=19388 epoch 013: 665 / 1689 loss=3.782, nll_loss=2.255, ppl=4.77, wps=550006, ups=1.11, wpb=496172, bsz=17010.2, num_updates=20900, lr=0.000437479, gnorm=0.193, clip=0, loss_scale=4, train_wall=89, gb_free=22.3, wall=19388 epoch 013: 665 / 1689 loss=3.782, nll_loss=2.255, ppl=4.77, wps=550006, ups=1.11, wpb=496172, bsz=17010.2, num_updates=20900, lr=0.000437479, gnorm=0.193, clip=0, loss_scale=4, train_wall=89, gb_free=22.3, wall=19388 epoch 013: 665 / 1689 loss=3.782, nll_loss=2.255, ppl=4.77, wps=550006, ups=1.11, wpb=496172, bsz=17010.2, num_updates=20900, lr=0.000437479, gnorm=0.193, clip=0, loss_scale=4, train_wall=89, gb_free=22.3, wall=19388 epoch 013: 665 / 1689 loss=3.782, nll_loss=2.255, ppl=4.77, wps=550006, ups=1.11, wpb=496172, bsz=17010.2, num_updates=20900, lr=0.000437479, gnorm=0.193, clip=0, loss_scale=4, train_wall=89, gb_free=22.3, wall=19388 epoch 013: 665 / 1689 loss=3.782, nll_loss=2.255, ppl=4.77, wps=550006, ups=1.11, wpb=496172, bsz=17010.2, num_updates=20900, lr=0.000437479, gnorm=0.193, clip=0, loss_scale=4, train_wall=89, gb_free=22.3, wall=19388 epoch 013: 665 / 1689 loss=3.782, nll_loss=2.255, ppl=4.77, wps=550006, ups=1.11, wpb=496172, bsz=17010.2, num_updates=20900, lr=0.000437479, gnorm=0.193, clip=0, loss_scale=4, train_wall=89, gb_free=22.3, wall=19388 epoch 013: 765 / 1689 loss=3.785, nll_loss=2.259, ppl=4.79, wps=551284, ups=1.11, wpb=495172, bsz=16523.5, num_updates=21000, lr=0.000436436, gnorm=0.198, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=19478 epoch 013: 765 / 1689 loss=3.785, nll_loss=2.259, ppl=4.79, wps=551284, ups=1.11, wpb=495172, bsz=16523.5, num_updates=21000, lr=0.000436436, gnorm=0.198, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=19478 epoch 013: 765 / 1689 loss=3.785, nll_loss=2.259, ppl=4.79, wps=551284, ups=1.11, wpb=495172, bsz=16523.5, num_updates=21000, lr=0.000436436, gnorm=0.198, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=19478 epoch 013: 765 / 1689 loss=3.785, nll_loss=2.259, ppl=4.79, wps=551284, ups=1.11, wpb=495172, bsz=16523.5, num_updates=21000, lr=0.000436436, gnorm=0.198, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=19478 epoch 013: 765 / 1689 loss=3.785, nll_loss=2.259, ppl=4.79, wps=551284, ups=1.11, wpb=495172, bsz=16523.5, num_updates=21000, lr=0.000436436, gnorm=0.198, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=19478 epoch 013: 765 / 1689 loss=3.785, nll_loss=2.259, ppl=4.79, wps=551284, ups=1.11, wpb=495172, bsz=16523.5, num_updates=21000, lr=0.000436436, gnorm=0.198, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=19478 epoch 013: 765 / 1689 loss=3.785, nll_loss=2.259, ppl=4.79, wps=551284, ups=1.11, wpb=495172, bsz=16523.5, num_updates=21000, lr=0.000436436, gnorm=0.198, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=19478 epoch 013: 765 / 1689 loss=3.785, nll_loss=2.259, ppl=4.79, wps=551284, ups=1.11, wpb=495172, bsz=16523.5, num_updates=21000, lr=0.000436436, gnorm=0.198, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=19478 epoch 013: 765 / 1689 loss=3.785, nll_loss=2.259, ppl=4.79, wps=551284, ups=1.11, wpb=495172, bsz=16523.5, num_updates=21000, lr=0.000436436, gnorm=0.198, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=19478 epoch 013: 765 / 1689 loss=3.785, nll_loss=2.259, ppl=4.79, wps=551284, ups=1.11, wpb=495172, bsz=16523.5, num_updates=21000, lr=0.000436436, gnorm=0.198, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=19478 epoch 013: 765 / 1689 loss=3.785, nll_loss=2.259, ppl=4.79, wps=551284, ups=1.11, wpb=495172, bsz=16523.5, num_updates=21000, lr=0.000436436, gnorm=0.198, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=19478 epoch 013: 765 / 1689 loss=3.785, nll_loss=2.259, ppl=4.79, wps=551284, ups=1.11, wpb=495172, bsz=16523.5, num_updates=21000, lr=0.000436436, gnorm=0.198, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=19478 epoch 013: 765 / 1689 loss=3.785, nll_loss=2.259, ppl=4.79, wps=551284, ups=1.11, wpb=495172, bsz=16523.5, num_updates=21000, lr=0.000436436, gnorm=0.198, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=19478 begin validation on "valid" subset epoch 013 | valid on 'valid' subset | loss 3.783 | nll_loss 2.215 | ppl 4.64 | wps 0 | wpb 44526 | bsz 2008 | num_updates 21000 | best_loss 3.778 epoch 013 | valid on 'valid' subset | loss 3.783 | nll_loss 2.215 | ppl 4.64 | wps 0 | wpb 44526 | bsz 2008 | num_updates 21000 | best_loss 3.778 epoch 013 | valid on 'valid' subset | loss 3.783 | nll_loss 2.215 | ppl 4.64 | wps 0 | wpb 44526 | bsz 2008 | num_updates 21000 | best_loss 3.778 epoch 013 | valid on 'valid' subset | loss 3.783 | nll_loss 2.215 | ppl 4.64 | wps 0 | wpb 44526 | bsz 2008 | num_updates 21000 | best_loss 3.778 epoch 013 | valid on 'valid' subset | loss 3.783 | nll_loss 2.215 | ppl 4.64 | wps 0 | wpb 44526 | bsz 2008 | num_updates 21000 | best_loss 3.778 epoch 013 | valid on 'valid' subset | loss 3.783 | nll_loss 2.215 | ppl 4.64 | wps 0 | wpb 44526 | bsz 2008 | num_updates 21000 | best_loss 3.778 epoch 013 | valid on 'valid' subset | loss 3.783 | nll_loss 2.215 | ppl 4.64 | wps 0 | wpb 44526 | bsz 2008 | num_updates 21000 | best_loss 3.778 epoch 013 | valid on 'valid' subset | loss 3.783 | nll_loss 2.215 | ppl 4.64 | wps 0 | wpb 44526 | bsz 2008 | num_updates 21000 | best_loss 3.778 epoch 013 | valid on 'valid' subset | loss 3.783 | nll_loss 2.215 | ppl 4.64 | wps 0 | wpb 44526 | bsz 2008 | num_updates 21000 | best_loss 3.778 epoch 013 | valid on 'valid' subset | loss 3.783 | nll_loss 2.215 | ppl 4.64 | wps 0 | wpb 44526 | bsz 2008 | num_updates 21000 | best_loss 3.778 epoch 013 | valid on 'valid' subset | loss 3.783 | nll_loss 2.215 | ppl 4.64 | wps 0 | wpb 44526 | bsz 2008 | num_updates 21000 | best_loss 3.778 epoch 013 | valid on 'valid' subset | loss 3.783 | nll_loss 2.215 | ppl 4.64 | wps 0 | wpb 44526 | bsz 2008 | num_updates 21000 | best_loss 3.778 epoch 013 | valid on 'valid' subset | loss 3.783 | nll_loss 2.215 | ppl 4.64 | wps 0 | wpb 44526 | bsz 2008 | num_updates 21000 | best_loss 3.778 epoch 013: 867 / 1689 loss=3.781, nll_loss=2.254, ppl=4.77, wps=477243, ups=0.96, wpb=495480, bsz=16752.5, num_updates=21100, lr=0.0004354, gnorm=0.203, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=19582 epoch 013: 867 / 1689 loss=3.781, nll_loss=2.254, ppl=4.77, wps=477243, ups=0.96, wpb=495480, bsz=16752.5, num_updates=21100, lr=0.0004354, gnorm=0.203, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=19582 epoch 013: 867 / 1689 loss=3.781, nll_loss=2.254, ppl=4.77, wps=477243, ups=0.96, wpb=495480, bsz=16752.5, num_updates=21100, lr=0.0004354, gnorm=0.203, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=19582 epoch 013: 867 / 1689 loss=3.781, nll_loss=2.254, ppl=4.77, wps=477243, ups=0.96, wpb=495480, bsz=16752.5, num_updates=21100, lr=0.0004354, gnorm=0.203, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=19582 epoch 013: 867 / 1689 loss=3.781, nll_loss=2.254, ppl=4.77, wps=477243, ups=0.96, wpb=495480, bsz=16752.5, num_updates=21100, lr=0.0004354, gnorm=0.203, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=19582 epoch 013: 867 / 1689 loss=3.781, nll_loss=2.254, ppl=4.77, wps=477243, ups=0.96, wpb=495480, bsz=16752.5, num_updates=21100, lr=0.0004354, gnorm=0.203, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=19582 epoch 013: 867 / 1689 loss=3.781, nll_loss=2.254, ppl=4.77, wps=477243, ups=0.96, wpb=495480, bsz=16752.5, num_updates=21100, lr=0.0004354, gnorm=0.203, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=19582 epoch 013: 867 / 1689 loss=3.781, nll_loss=2.254, ppl=4.77, wps=477243, ups=0.96, wpb=495480, bsz=16752.5, num_updates=21100, lr=0.0004354, gnorm=0.203, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=19582 epoch 013: 867 / 1689 loss=3.781, nll_loss=2.254, ppl=4.77, wps=477243, ups=0.96, wpb=495480, bsz=16752.5, num_updates=21100, lr=0.0004354, gnorm=0.203, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=19582 epoch 013: 867 / 1689 loss=3.781, nll_loss=2.254, ppl=4.77, wps=477243, ups=0.96, wpb=495480, bsz=16752.5, num_updates=21100, lr=0.0004354, gnorm=0.203, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=19582 epoch 013: 867 / 1689 loss=3.781, nll_loss=2.254, ppl=4.77, wps=477243, ups=0.96, wpb=495480, bsz=16752.5, num_updates=21100, lr=0.0004354, gnorm=0.203, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=19582 epoch 013: 867 / 1689 loss=3.781, nll_loss=2.254, ppl=4.77, wps=477243, ups=0.96, wpb=495480, bsz=16752.5, num_updates=21100, lr=0.0004354, gnorm=0.203, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=19582 epoch 013: 867 / 1689 loss=3.781, nll_loss=2.254, ppl=4.77, wps=477243, ups=0.96, wpb=495480, bsz=16752.5, num_updates=21100, lr=0.0004354, gnorm=0.203, clip=0, loss_scale=1, train_wall=90, gb_free=21.7, wall=19582 epoch 013: 967 / 1689 loss=3.775, nll_loss=2.247, ppl=4.75, wps=554196, ups=1.12, wpb=496493, bsz=16500, num_updates=21200, lr=0.000434372, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=19672 epoch 013: 967 / 1689 loss=3.775, nll_loss=2.247, ppl=4.75, wps=554196, ups=1.12, wpb=496493, bsz=16500, num_updates=21200, lr=0.000434372, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=19672 epoch 013: 967 / 1689 loss=3.775, nll_loss=2.247, ppl=4.75, wps=554196, ups=1.12, wpb=496493, bsz=16500, num_updates=21200, lr=0.000434372, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=19672 epoch 013: 967 / 1689 loss=3.775, nll_loss=2.247, ppl=4.75, wps=554196, ups=1.12, wpb=496493, bsz=16500, num_updates=21200, lr=0.000434372, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=19672 epoch 013: 967 / 1689 loss=3.775, nll_loss=2.247, ppl=4.75, wps=554196, ups=1.12, wpb=496493, bsz=16500, num_updates=21200, lr=0.000434372, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=19672 epoch 013: 967 / 1689 loss=3.775, nll_loss=2.247, ppl=4.75, wps=554196, ups=1.12, wpb=496493, bsz=16500, num_updates=21200, lr=0.000434372, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=19672 epoch 013: 967 / 1689 loss=3.775, nll_loss=2.247, ppl=4.75, wps=554196, ups=1.12, wpb=496493, bsz=16500, num_updates=21200, lr=0.000434372, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=19672 epoch 013: 967 / 1689 loss=3.775, nll_loss=2.247, ppl=4.75, wps=554196, ups=1.12, wpb=496493, bsz=16500, num_updates=21200, lr=0.000434372, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=19672 epoch 013: 967 / 1689 loss=3.775, nll_loss=2.247, ppl=4.75, wps=554196, ups=1.12, wpb=496493, bsz=16500, num_updates=21200, lr=0.000434372, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=19672 epoch 013: 967 / 1689 loss=3.775, nll_loss=2.247, ppl=4.75, wps=554196, ups=1.12, wpb=496493, bsz=16500, num_updates=21200, lr=0.000434372, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=19672 epoch 013: 967 / 1689 loss=3.775, nll_loss=2.247, ppl=4.75, wps=554196, ups=1.12, wpb=496493, bsz=16500, num_updates=21200, lr=0.000434372, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=19672 epoch 013: 967 / 1689 loss=3.775, nll_loss=2.247, ppl=4.75, wps=554196, ups=1.12, wpb=496493, bsz=16500, num_updates=21200, lr=0.000434372, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=19672 epoch 013: 967 / 1689 loss=3.775, nll_loss=2.247, ppl=4.75, wps=554196, ups=1.12, wpb=496493, bsz=16500, num_updates=21200, lr=0.000434372, gnorm=0.19, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=19672 epoch 013: 1067 / 1689 loss=3.782, nll_loss=2.256, ppl=4.78, wps=555038, ups=1.12, wpb=495359, bsz=16461.4, num_updates=21300, lr=0.000433351, gnorm=0.205, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=19761 epoch 013: 1067 / 1689 loss=3.782, nll_loss=2.256, ppl=4.78, wps=555038, ups=1.12, wpb=495359, bsz=16461.4, num_updates=21300, lr=0.000433351, gnorm=0.205, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=19761 epoch 013: 1067 / 1689 loss=3.782, nll_loss=2.256, ppl=4.78, wps=555038, ups=1.12, wpb=495359, bsz=16461.4, num_updates=21300, lr=0.000433351, gnorm=0.205, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=19761 epoch 013: 1067 / 1689 loss=3.782, nll_loss=2.256, ppl=4.78, wps=555038, ups=1.12, wpb=495359, bsz=16461.4, num_updates=21300, lr=0.000433351, gnorm=0.205, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=19761 epoch 013: 1067 / 1689 loss=3.782, nll_loss=2.256, ppl=4.78, wps=555038, ups=1.12, wpb=495359, bsz=16461.4, num_updates=21300, lr=0.000433351, gnorm=0.205, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=19761 epoch 013: 1067 / 1689 loss=3.782, nll_loss=2.256, ppl=4.78, wps=555038, ups=1.12, wpb=495359, bsz=16461.4, num_updates=21300, lr=0.000433351, gnorm=0.205, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=19761 epoch 013: 1067 / 1689 loss=3.782, nll_loss=2.256, ppl=4.78, wps=555038, ups=1.12, wpb=495359, bsz=16461.4, num_updates=21300, lr=0.000433351, gnorm=0.205, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=19761 epoch 013: 1067 / 1689 loss=3.782, nll_loss=2.256, ppl=4.78, wps=555038, ups=1.12, wpb=495359, bsz=16461.4, num_updates=21300, lr=0.000433351, gnorm=0.205, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=19761 epoch 013: 1067 / 1689 loss=3.782, nll_loss=2.256, ppl=4.78, wps=555038, ups=1.12, wpb=495359, bsz=16461.4, num_updates=21300, lr=0.000433351, gnorm=0.205, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=19761 epoch 013: 1067 / 1689 loss=3.782, nll_loss=2.256, ppl=4.78, wps=555038, ups=1.12, wpb=495359, bsz=16461.4, num_updates=21300, lr=0.000433351, gnorm=0.205, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=19761 epoch 013: 1067 / 1689 loss=3.782, nll_loss=2.256, ppl=4.78, wps=555038, ups=1.12, wpb=495359, bsz=16461.4, num_updates=21300, lr=0.000433351, gnorm=0.205, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=19761 epoch 013: 1067 / 1689 loss=3.782, nll_loss=2.256, ppl=4.78, wps=555038, ups=1.12, wpb=495359, bsz=16461.4, num_updates=21300, lr=0.000433351, gnorm=0.205, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=19761 epoch 013: 1067 / 1689 loss=3.782, nll_loss=2.256, ppl=4.78, wps=555038, ups=1.12, wpb=495359, bsz=16461.4, num_updates=21300, lr=0.000433351, gnorm=0.205, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=19761 epoch 013: 1167 / 1689 loss=3.789, nll_loss=2.263, ppl=4.8, wps=554177, ups=1.12, wpb=494066, bsz=16459.8, num_updates=21400, lr=0.000432338, gnorm=0.202, clip=0, loss_scale=1, train_wall=87, gb_free=21.3, wall=19850 epoch 013: 1167 / 1689 loss=3.789, nll_loss=2.263, ppl=4.8, wps=554177, ups=1.12, wpb=494066, bsz=16459.8, num_updates=21400, lr=0.000432338, gnorm=0.202, clip=0, loss_scale=1, train_wall=87, gb_free=21.3, wall=19850 epoch 013: 1167 / 1689 loss=3.789, nll_loss=2.263, ppl=4.8, wps=554177, ups=1.12, wpb=494066, bsz=16459.8, num_updates=21400, lr=0.000432338, gnorm=0.202, clip=0, loss_scale=1, train_wall=87, gb_free=21.3, wall=19850 epoch 013: 1167 / 1689 loss=3.789, nll_loss=2.263, ppl=4.8, wps=554177, ups=1.12, wpb=494066, bsz=16459.8, num_updates=21400, lr=0.000432338, gnorm=0.202, clip=0, loss_scale=1, train_wall=87, gb_free=21.3, wall=19850 epoch 013: 1167 / 1689 loss=3.789, nll_loss=2.263, ppl=4.8, wps=554177, ups=1.12, wpb=494066, bsz=16459.8, num_updates=21400, lr=0.000432338, gnorm=0.202, clip=0, loss_scale=1, train_wall=87, gb_free=21.3, wall=19850 epoch 013: 1167 / 1689 loss=3.789, nll_loss=2.263, ppl=4.8, wps=554177, ups=1.12, wpb=494066, bsz=16459.8, num_updates=21400, lr=0.000432338, gnorm=0.202, clip=0, loss_scale=1, train_wall=87, gb_free=21.3, wall=19850 epoch 013: 1167 / 1689 loss=3.789, nll_loss=2.263, ppl=4.8, wps=554177, ups=1.12, wpb=494066, bsz=16459.8, num_updates=21400, lr=0.000432338, gnorm=0.202, clip=0, loss_scale=1, train_wall=87, gb_free=21.3, wall=19850 epoch 013: 1167 / 1689 loss=3.789, nll_loss=2.263, ppl=4.8, wps=554177, ups=1.12, wpb=494066, bsz=16459.8, num_updates=21400, lr=0.000432338, gnorm=0.202, clip=0, loss_scale=1, train_wall=87, gb_free=21.3, wall=19850 epoch 013: 1167 / 1689 loss=3.789, nll_loss=2.263, ppl=4.8, wps=554177, ups=1.12, wpb=494066, bsz=16459.8, num_updates=21400, lr=0.000432338, gnorm=0.202, clip=0, loss_scale=1, train_wall=87, gb_free=21.3, wall=19850 epoch 013: 1167 / 1689 loss=3.789, nll_loss=2.263, ppl=4.8, wps=554177, ups=1.12, wpb=494066, bsz=16459.8, num_updates=21400, lr=0.000432338, gnorm=0.202, clip=0, loss_scale=1, train_wall=87, gb_free=21.3, wall=19850 epoch 013: 1167 / 1689 loss=3.789, nll_loss=2.263, ppl=4.8, wps=554177, ups=1.12, wpb=494066, bsz=16459.8, num_updates=21400, lr=0.000432338, gnorm=0.202, clip=0, loss_scale=1, train_wall=87, gb_free=21.3, wall=19850 epoch 013: 1167 / 1689 loss=3.789, nll_loss=2.263, ppl=4.8, wps=554177, ups=1.12, wpb=494066, bsz=16459.8, num_updates=21400, lr=0.000432338, gnorm=0.202, clip=0, loss_scale=1, train_wall=87, gb_free=21.3, wall=19850 epoch 013: 1167 / 1689 loss=3.789, nll_loss=2.263, ppl=4.8, wps=554177, ups=1.12, wpb=494066, bsz=16459.8, num_updates=21400, lr=0.000432338, gnorm=0.202, clip=0, loss_scale=1, train_wall=87, gb_free=21.3, wall=19850 epoch 013: 1267 / 1689 loss=3.786, nll_loss=2.26, ppl=4.79, wps=556349, ups=1.12, wpb=495548, bsz=16229.5, num_updates=21500, lr=0.000431331, gnorm=0.199, clip=0, loss_scale=1, train_wall=87, gb_free=20.9, wall=19939 epoch 013: 1267 / 1689 loss=3.786, nll_loss=2.26, ppl=4.79, wps=556349, ups=1.12, wpb=495548, bsz=16229.5, num_updates=21500, lr=0.000431331, gnorm=0.199, clip=0, loss_scale=1, train_wall=87, gb_free=20.9, wall=19939 epoch 013: 1267 / 1689 loss=3.786, nll_loss=2.26, ppl=4.79, wps=556349, ups=1.12, wpb=495548, bsz=16229.5, num_updates=21500, lr=0.000431331, gnorm=0.199, clip=0, loss_scale=1, train_wall=87, gb_free=20.9, wall=19939 epoch 013: 1267 / 1689 loss=3.786, nll_loss=2.26, ppl=4.79, wps=556349, ups=1.12, wpb=495548, bsz=16229.5, num_updates=21500, lr=0.000431331, gnorm=0.199, clip=0, loss_scale=1, train_wall=87, gb_free=20.9, wall=19939 epoch 013: 1267 / 1689 loss=3.786, nll_loss=2.26, ppl=4.79, wps=556349, ups=1.12, wpb=495548, bsz=16229.5, num_updates=21500, lr=0.000431331, gnorm=0.199, clip=0, loss_scale=1, train_wall=87, gb_free=20.9, wall=19939 epoch 013: 1267 / 1689 loss=3.786, nll_loss=2.26, ppl=4.79, wps=556349, ups=1.12, wpb=495548, bsz=16229.5, num_updates=21500, lr=0.000431331, gnorm=0.199, clip=0, loss_scale=1, train_wall=87, gb_free=20.9, wall=19939 epoch 013: 1267 / 1689 loss=3.786, nll_loss=2.26, ppl=4.79, wps=556349, ups=1.12, wpb=495548, bsz=16229.5, num_updates=21500, lr=0.000431331, gnorm=0.199, clip=0, loss_scale=1, train_wall=87, gb_free=20.9, wall=19939 epoch 013: 1267 / 1689 loss=3.786, nll_loss=2.26, ppl=4.79, wps=556349, ups=1.12, wpb=495548, bsz=16229.5, num_updates=21500, lr=0.000431331, gnorm=0.199, clip=0, loss_scale=1, train_wall=87, gb_free=20.9, wall=19939 epoch 013: 1267 / 1689 loss=3.786, nll_loss=2.26, ppl=4.79, wps=556349, ups=1.12, wpb=495548, bsz=16229.5, num_updates=21500, lr=0.000431331, gnorm=0.199, clip=0, loss_scale=1, train_wall=87, gb_free=20.9, wall=19939 epoch 013: 1267 / 1689 loss=3.786, nll_loss=2.26, ppl=4.79, wps=556349, ups=1.12, wpb=495548, bsz=16229.5, num_updates=21500, lr=0.000431331, gnorm=0.199, clip=0, loss_scale=1, train_wall=87, gb_free=20.9, wall=19939 epoch 013: 1267 / 1689 loss=3.786, nll_loss=2.26, ppl=4.79, wps=556349, ups=1.12, wpb=495548, bsz=16229.5, num_updates=21500, lr=0.000431331, gnorm=0.199, clip=0, loss_scale=1, train_wall=87, gb_free=20.9, wall=19939 epoch 013: 1267 / 1689 loss=3.786, nll_loss=2.26, ppl=4.79, wps=556349, ups=1.12, wpb=495548, bsz=16229.5, num_updates=21500, lr=0.000431331, gnorm=0.199, clip=0, loss_scale=1, train_wall=87, gb_free=20.9, wall=19939 epoch 013: 1267 / 1689 loss=3.786, nll_loss=2.26, ppl=4.79, wps=556349, ups=1.12, wpb=495548, bsz=16229.5, num_updates=21500, lr=0.000431331, gnorm=0.199, clip=0, loss_scale=1, train_wall=87, gb_free=20.9, wall=19939 epoch 013: 1367 / 1689 loss=3.781, nll_loss=2.254, ppl=4.77, wps=551078, ups=1.11, wpb=495047, bsz=16766.9, num_updates=21600, lr=0.000430331, gnorm=0.193, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=20029 epoch 013: 1367 / 1689 loss=3.781, nll_loss=2.254, ppl=4.77, wps=551078, ups=1.11, wpb=495047, bsz=16766.9, num_updates=21600, lr=0.000430331, gnorm=0.193, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=20029 epoch 013: 1367 / 1689 loss=3.781, nll_loss=2.254, ppl=4.77, wps=551078, ups=1.11, wpb=495047, bsz=16766.9, num_updates=21600, lr=0.000430331, gnorm=0.193, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=20029 epoch 013: 1367 / 1689 loss=3.781, nll_loss=2.254, ppl=4.77, wps=551078, ups=1.11, wpb=495047, bsz=16766.9, num_updates=21600, lr=0.000430331, gnorm=0.193, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=20029 epoch 013: 1367 / 1689 loss=3.781, nll_loss=2.254, ppl=4.77, wps=551078, ups=1.11, wpb=495047, bsz=16766.9, num_updates=21600, lr=0.000430331, gnorm=0.193, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=20029 epoch 013: 1367 / 1689 loss=3.781, nll_loss=2.254, ppl=4.77, wps=551078, ups=1.11, wpb=495047, bsz=16766.9, num_updates=21600, lr=0.000430331, gnorm=0.193, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=20029 epoch 013: 1367 / 1689 loss=3.781, nll_loss=2.254, ppl=4.77, wps=551078, ups=1.11, wpb=495047, bsz=16766.9, num_updates=21600, lr=0.000430331, gnorm=0.193, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=20029 epoch 013: 1367 / 1689 loss=3.781, nll_loss=2.254, ppl=4.77, wps=551078, ups=1.11, wpb=495047, bsz=16766.9, num_updates=21600, lr=0.000430331, gnorm=0.193, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=20029 epoch 013: 1367 / 1689 loss=3.781, nll_loss=2.254, ppl=4.77, wps=551078, ups=1.11, wpb=495047, bsz=16766.9, num_updates=21600, lr=0.000430331, gnorm=0.193, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=20029 epoch 013: 1367 / 1689 loss=3.781, nll_loss=2.254, ppl=4.77, wps=551078, ups=1.11, wpb=495047, bsz=16766.9, num_updates=21600, lr=0.000430331, gnorm=0.193, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=20029 epoch 013: 1367 / 1689 loss=3.781, nll_loss=2.254, ppl=4.77, wps=551078, ups=1.11, wpb=495047, bsz=16766.9, num_updates=21600, lr=0.000430331, gnorm=0.193, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=20029 epoch 013: 1367 / 1689 loss=3.781, nll_loss=2.254, ppl=4.77, wps=551078, ups=1.11, wpb=495047, bsz=16766.9, num_updates=21600, lr=0.000430331, gnorm=0.193, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=20029 epoch 013: 1367 / 1689 loss=3.781, nll_loss=2.254, ppl=4.77, wps=551078, ups=1.11, wpb=495047, bsz=16766.9, num_updates=21600, lr=0.000430331, gnorm=0.193, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=20029 epoch 013: 1467 / 1689 loss=3.774, nll_loss=2.246, ppl=4.74, wps=553319, ups=1.12, wpb=495798, bsz=16772.7, num_updates=21700, lr=0.000429339, gnorm=0.191, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=20119 epoch 013: 1467 / 1689 loss=3.774, nll_loss=2.246, ppl=4.74, wps=553319, ups=1.12, wpb=495798, bsz=16772.7, num_updates=21700, lr=0.000429339, gnorm=0.191, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=20119 epoch 013: 1467 / 1689 loss=3.774, nll_loss=2.246, ppl=4.74, wps=553319, ups=1.12, wpb=495798, bsz=16772.7, num_updates=21700, lr=0.000429339, gnorm=0.191, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=20119 epoch 013: 1467 / 1689 loss=3.774, nll_loss=2.246, ppl=4.74, wps=553319, ups=1.12, wpb=495798, bsz=16772.7, num_updates=21700, lr=0.000429339, gnorm=0.191, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=20119 epoch 013: 1467 / 1689 loss=3.774, nll_loss=2.246, ppl=4.74, wps=553319, ups=1.12, wpb=495798, bsz=16772.7, num_updates=21700, lr=0.000429339, gnorm=0.191, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=20119 epoch 013: 1467 / 1689 loss=3.774, nll_loss=2.246, ppl=4.74, wps=553319, ups=1.12, wpb=495798, bsz=16772.7, num_updates=21700, lr=0.000429339, gnorm=0.191, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=20119 epoch 013: 1467 / 1689 loss=3.774, nll_loss=2.246, ppl=4.74, wps=553319, ups=1.12, wpb=495798, bsz=16772.7, num_updates=21700, lr=0.000429339, gnorm=0.191, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=20119 epoch 013: 1467 / 1689 loss=3.774, nll_loss=2.246, ppl=4.74, wps=553319, ups=1.12, wpb=495798, bsz=16772.7, num_updates=21700, lr=0.000429339, gnorm=0.191, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=20119 epoch 013: 1467 / 1689 loss=3.774, nll_loss=2.246, ppl=4.74, wps=553319, ups=1.12, wpb=495798, bsz=16772.7, num_updates=21700, lr=0.000429339, gnorm=0.191, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=20119 epoch 013: 1467 / 1689 loss=3.774, nll_loss=2.246, ppl=4.74, wps=553319, ups=1.12, wpb=495798, bsz=16772.7, num_updates=21700, lr=0.000429339, gnorm=0.191, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=20119 epoch 013: 1467 / 1689 loss=3.774, nll_loss=2.246, ppl=4.74, wps=553319, ups=1.12, wpb=495798, bsz=16772.7, num_updates=21700, lr=0.000429339, gnorm=0.191, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=20119 epoch 013: 1467 / 1689 loss=3.774, nll_loss=2.246, ppl=4.74, wps=553319, ups=1.12, wpb=495798, bsz=16772.7, num_updates=21700, lr=0.000429339, gnorm=0.191, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=20119 epoch 013: 1467 / 1689 loss=3.774, nll_loss=2.246, ppl=4.74, wps=553319, ups=1.12, wpb=495798, bsz=16772.7, num_updates=21700, lr=0.000429339, gnorm=0.191, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=20119 epoch 013: 1567 / 1689 loss=3.789, nll_loss=2.263, ppl=4.8, wps=549844, ups=1.11, wpb=495020, bsz=16463.6, num_updates=21800, lr=0.000428353, gnorm=0.202, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=20209 epoch 013: 1567 / 1689 loss=3.789, nll_loss=2.263, ppl=4.8, wps=549844, ups=1.11, wpb=495020, bsz=16463.6, num_updates=21800, lr=0.000428353, gnorm=0.202, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=20209 epoch 013: 1567 / 1689 loss=3.789, nll_loss=2.263, ppl=4.8, wps=549844, ups=1.11, wpb=495020, bsz=16463.6, num_updates=21800, lr=0.000428353, gnorm=0.202, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=20209 epoch 013: 1567 / 1689 loss=3.789, nll_loss=2.263, ppl=4.8, wps=549844, ups=1.11, wpb=495020, bsz=16463.6, num_updates=21800, lr=0.000428353, gnorm=0.202, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=20209 epoch 013: 1567 / 1689 loss=3.789, nll_loss=2.263, ppl=4.8, wps=549844, ups=1.11, wpb=495020, bsz=16463.6, num_updates=21800, lr=0.000428353, gnorm=0.202, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=20209 epoch 013: 1567 / 1689 loss=3.789, nll_loss=2.263, ppl=4.8, wps=549844, ups=1.11, wpb=495020, bsz=16463.6, num_updates=21800, lr=0.000428353, gnorm=0.202, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=20209 epoch 013: 1567 / 1689 loss=3.789, nll_loss=2.263, ppl=4.8, wps=549844, ups=1.11, wpb=495020, bsz=16463.6, num_updates=21800, lr=0.000428353, gnorm=0.202, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=20209 epoch 013: 1567 / 1689 loss=3.789, nll_loss=2.263, ppl=4.8, wps=549844, ups=1.11, wpb=495020, bsz=16463.6, num_updates=21800, lr=0.000428353, gnorm=0.202, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=20209 epoch 013: 1567 / 1689 loss=3.789, nll_loss=2.263, ppl=4.8, wps=549844, ups=1.11, wpb=495020, bsz=16463.6, num_updates=21800, lr=0.000428353, gnorm=0.202, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=20209 epoch 013: 1567 / 1689 loss=3.789, nll_loss=2.263, ppl=4.8, wps=549844, ups=1.11, wpb=495020, bsz=16463.6, num_updates=21800, lr=0.000428353, gnorm=0.202, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=20209 epoch 013: 1567 / 1689 loss=3.789, nll_loss=2.263, ppl=4.8, wps=549844, ups=1.11, wpb=495020, bsz=16463.6, num_updates=21800, lr=0.000428353, gnorm=0.202, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=20209 epoch 013: 1567 / 1689 loss=3.789, nll_loss=2.263, ppl=4.8, wps=549844, ups=1.11, wpb=495020, bsz=16463.6, num_updates=21800, lr=0.000428353, gnorm=0.202, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=20209 epoch 013: 1567 / 1689 loss=3.789, nll_loss=2.263, ppl=4.8, wps=549844, ups=1.11, wpb=495020, bsz=16463.6, num_updates=21800, lr=0.000428353, gnorm=0.202, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=20209 epoch 013: 1667 / 1689 loss=3.781, nll_loss=2.255, ppl=4.77, wps=552518, ups=1.11, wpb=495612, bsz=16361.4, num_updates=21900, lr=0.000427374, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=20298 epoch 013: 1667 / 1689 loss=3.781, nll_loss=2.255, ppl=4.77, wps=552518, ups=1.11, wpb=495612, bsz=16361.4, num_updates=21900, lr=0.000427374, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=20298 epoch 013: 1667 / 1689 loss=3.781, nll_loss=2.255, ppl=4.77, wps=552518, ups=1.11, wpb=495612, bsz=16361.4, num_updates=21900, lr=0.000427374, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=20298 epoch 013: 1667 / 1689 loss=3.781, nll_loss=2.255, ppl=4.77, wps=552518, ups=1.11, wpb=495612, bsz=16361.4, num_updates=21900, lr=0.000427374, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=20298 epoch 013: 1667 / 1689 loss=3.781, nll_loss=2.255, ppl=4.77, wps=552518, ups=1.11, wpb=495612, bsz=16361.4, num_updates=21900, lr=0.000427374, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=20298 epoch 013: 1667 / 1689 loss=3.781, nll_loss=2.255, ppl=4.77, wps=552518, ups=1.11, wpb=495612, bsz=16361.4, num_updates=21900, lr=0.000427374, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=20298 epoch 013: 1667 / 1689 loss=3.781, nll_loss=2.255, ppl=4.77, wps=552518, ups=1.11, wpb=495612, bsz=16361.4, num_updates=21900, lr=0.000427374, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=20298 epoch 013: 1667 / 1689 loss=3.781, nll_loss=2.255, ppl=4.77, wps=552518, ups=1.11, wpb=495612, bsz=16361.4, num_updates=21900, lr=0.000427374, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=20298 epoch 013: 1667 / 1689 loss=3.781, nll_loss=2.255, ppl=4.77, wps=552518, ups=1.11, wpb=495612, bsz=16361.4, num_updates=21900, lr=0.000427374, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=20298 epoch 013: 1667 / 1689 loss=3.781, nll_loss=2.255, ppl=4.77, wps=552518, ups=1.11, wpb=495612, bsz=16361.4, num_updates=21900, lr=0.000427374, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=20298 epoch 013: 1667 / 1689 loss=3.781, nll_loss=2.255, ppl=4.77, wps=552518, ups=1.11, wpb=495612, bsz=16361.4, num_updates=21900, lr=0.000427374, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=20298 epoch 013: 1667 / 1689 loss=3.781, nll_loss=2.255, ppl=4.77, wps=552518, ups=1.11, wpb=495612, bsz=16361.4, num_updates=21900, lr=0.000427374, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=20298 epoch 013: 1667 / 1689 loss=3.781, nll_loss=2.255, ppl=4.77, wps=552518, ups=1.11, wpb=495612, bsz=16361.4, num_updates=21900, lr=0.000427374, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=20298 end of epoch 13 (average epoch stats below) epoch 013 | loss 3.78 | nll_loss 2.253 | ppl 4.77 | wps 547883 | ups 1.11 | wpb 495104 | bsz 16507.7 | num_updates 21922 | lr 0.000427159 | gnorm 0.195 | clip 0 | loss_scale 2 | train_wall 1487 | gb_free 22.9 | wall 20317 epoch 013 | loss 3.78 | nll_loss 2.253 | ppl 4.77 | wps 547883 | ups 1.11 | wpb 495104 | bsz 16507.7 | num_updates 21922 | lr 0.000427159 | gnorm 0.195 | clip 0 | loss_scale 2 | train_wall 1487 | gb_free 22.9 | wall 20317 epoch 013 | loss 3.78 | nll_loss 2.253 | ppl 4.77 | wps 547883 | ups 1.11 | wpb 495104 | bsz 16507.7 | num_updates 21922 | lr 0.000427159 | gnorm 0.195 | clip 0 | loss_scale 2 | train_wall 1487 | gb_free 22.9 | wall 20317 epoch 013 | loss 3.78 | nll_loss 2.253 | ppl 4.77 | wps 547883 | ups 1.11 | wpb 495104 | bsz 16507.7 | num_updates 21922 | lr 0.000427159 | gnorm 0.195 | clip 0 | loss_scale 2 | train_wall 1487 | gb_free 22.9 | wall 20317 epoch 013 | loss 3.78 | nll_loss 2.253 | ppl 4.77 | wps 547883 | ups 1.11 | wpb 495104 | bsz 16507.7 | num_updates 21922 | lr 0.000427159 | gnorm 0.195 | clip 0 | loss_scale 2 | train_wall 1487 | gb_free 22.9 | wall 20317 epoch 013 | loss 3.78 | nll_loss 2.253 | ppl 4.77 | wps 547883 | ups 1.11 | wpb 495104 | bsz 16507.7 | num_updates 21922 | lr 0.000427159 | gnorm 0.195 | clip 0 | loss_scale 2 | train_wall 1487 | gb_free 22.9 | wall 20317 epoch 013 | loss 3.78 | nll_loss 2.253 | ppl 4.77 | wps 547883 | ups 1.11 | wpb 495104 | bsz 16507.7 | num_updates 21922 | lr 0.000427159 | gnorm 0.195 | clip 0 | loss_scale 2 | train_wall 1487 | gb_free 22.9 | wall 20317 epoch 013 | loss 3.78 | nll_loss 2.253 | ppl 4.77 | wps 547883 | ups 1.11 | wpb 495104 | bsz 16507.7 | num_updates 21922 | lr 0.000427159 | gnorm 0.195 | clip 0 | loss_scale 2 | train_wall 1487 | gb_free 22.9 | wall 20317 epoch 013 | loss 3.78 | nll_loss 2.253 | ppl 4.77 | wps 547883 | ups 1.11 | wpb 495104 | bsz 16507.7 | num_updates 21922 | lr 0.000427159 | gnorm 0.195 | clip 0 | loss_scale 2 | train_wall 1487 | gb_free 22.9 | wall 20317 epoch 013 | loss 3.78 | nll_loss 2.253 | ppl 4.77 | wps 547883 | ups 1.11 | wpb 495104 | bsz 16507.7 | num_updates 21922 | lr 0.000427159 | gnorm 0.195 | clip 0 | loss_scale 2 | train_wall 1487 | gb_free 22.9 | wall 20317 epoch 013 | loss 3.78 | nll_loss 2.253 | ppl 4.77 | wps 547883 | ups 1.11 | wpb 495104 | bsz 16507.7 | num_updates 21922 | lr 0.000427159 | gnorm 0.195 | clip 0 | loss_scale 2 | train_wall 1487 | gb_free 22.9 | wall 20317 epoch 013 | loss 3.78 | nll_loss 2.253 | ppl 4.77 | wps 547883 | ups 1.11 | wpb 495104 | bsz 16507.7 | num_updates 21922 | lr 0.000427159 | gnorm 0.195 | clip 0 | loss_scale 2 | train_wall 1487 | gb_free 22.9 | wall 20317 epoch 013 | loss 3.78 | nll_loss 2.253 | ppl 4.77 | wps 547883 | ups 1.11 | wpb 495104 | bsz 16507.7 | num_updates 21922 | lr 0.000427159 | gnorm 0.195 | clip 0 | loss_scale 2 | train_wall 1487 | gb_free 22.9 | wall 20317 Start iterating over samples epoch 014: 78 / 1689 loss=3.767, nll_loss=2.238, ppl=4.72, wps=542614, ups=1.1, wpb=492370, bsz=16259.4, num_updates=22000, lr=0.000426401, gnorm=0.193, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=20389 epoch 014: 78 / 1689 loss=3.767, nll_loss=2.238, ppl=4.72, wps=542614, ups=1.1, wpb=492370, bsz=16259.4, num_updates=22000, lr=0.000426401, gnorm=0.193, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=20389 epoch 014: 78 / 1689 loss=3.767, nll_loss=2.238, ppl=4.72, wps=542614, ups=1.1, wpb=492370, bsz=16259.4, num_updates=22000, lr=0.000426401, gnorm=0.193, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=20389 epoch 014: 78 / 1689 loss=3.767, nll_loss=2.238, ppl=4.72, wps=542614, ups=1.1, wpb=492370, bsz=16259.4, num_updates=22000, lr=0.000426401, gnorm=0.193, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=20389 epoch 014: 78 / 1689 loss=3.767, nll_loss=2.238, ppl=4.72, wps=542614, ups=1.1, wpb=492370, bsz=16259.4, num_updates=22000, lr=0.000426401, gnorm=0.193, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=20389 epoch 014: 78 / 1689 loss=3.767, nll_loss=2.238, ppl=4.72, wps=542614, ups=1.1, wpb=492370, bsz=16259.4, num_updates=22000, lr=0.000426401, gnorm=0.193, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=20389 epoch 014: 78 / 1689 loss=3.767, nll_loss=2.238, ppl=4.72, wps=542614, ups=1.1, wpb=492370, bsz=16259.4, num_updates=22000, lr=0.000426401, gnorm=0.193, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=20389 epoch 014: 78 / 1689 loss=3.767, nll_loss=2.238, ppl=4.72, wps=542614, ups=1.1, wpb=492370, bsz=16259.4, num_updates=22000, lr=0.000426401, gnorm=0.193, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=20389 epoch 014: 78 / 1689 loss=3.767, nll_loss=2.238, ppl=4.72, wps=542614, ups=1.1, wpb=492370, bsz=16259.4, num_updates=22000, lr=0.000426401, gnorm=0.193, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=20389 epoch 014: 78 / 1689 loss=3.767, nll_loss=2.238, ppl=4.72, wps=542614, ups=1.1, wpb=492370, bsz=16259.4, num_updates=22000, lr=0.000426401, gnorm=0.193, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=20389 epoch 014: 78 / 1689 loss=3.767, nll_loss=2.238, ppl=4.72, wps=542614, ups=1.1, wpb=492370, bsz=16259.4, num_updates=22000, lr=0.000426401, gnorm=0.193, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=20389 epoch 014: 78 / 1689 loss=3.767, nll_loss=2.238, ppl=4.72, wps=542614, ups=1.1, wpb=492370, bsz=16259.4, num_updates=22000, lr=0.000426401, gnorm=0.193, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=20389 epoch 014: 78 / 1689 loss=3.767, nll_loss=2.238, ppl=4.72, wps=542614, ups=1.1, wpb=492370, bsz=16259.4, num_updates=22000, lr=0.000426401, gnorm=0.193, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=20389 epoch 014: 78 / 1689 loss=3.767, nll_loss=2.238, ppl=4.72, wps=542614, ups=1.1, wpb=492370, bsz=16259.4, num_updates=22000, lr=0.000426401, gnorm=0.193, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=20389 begin validation on "valid" subset epoch 014 | valid on 'valid' subset | loss 3.768 | nll_loss 2.199 | ppl 4.59 | wps 0 | wpb 44526 | bsz 2008 | num_updates 22000 | best_loss 3.768 epoch 014 | valid on 'valid' subset | loss 3.768 | nll_loss 2.199 | ppl 4.59 | wps 0 | wpb 44526 | bsz 2008 | num_updates 22000 | best_loss 3.768 epoch 014 | valid on 'valid' subset | loss 3.768 | nll_loss 2.199 | ppl 4.59 | wps 0 | wpb 44526 | bsz 2008 | num_updates 22000 | best_loss 3.768 epoch 014 | valid on 'valid' subset | loss 3.768 | nll_loss 2.199 | ppl 4.59 | wps 0 | wpb 44526 | bsz 2008 | num_updates 22000 | best_loss 3.768 epoch 014 | valid on 'valid' subset | loss 3.768 | nll_loss 2.199 | ppl 4.59 | wps 0 | wpb 44526 | bsz 2008 | num_updates 22000 | best_loss 3.768 epoch 014 | valid on 'valid' subset | loss 3.768 | nll_loss 2.199 | ppl 4.59 | wps 0 | wpb 44526 | bsz 2008 | num_updates 22000 | best_loss 3.768 epoch 014 | valid on 'valid' subset | loss 3.768 | nll_loss 2.199 | ppl 4.59 | wps 0 | wpb 44526 | bsz 2008 | num_updates 22000 | best_loss 3.768 epoch 014 | valid on 'valid' subset | loss 3.768 | nll_loss 2.199 | ppl 4.59 | wps 0 | wpb 44526 | bsz 2008 | num_updates 22000 | best_loss 3.768 epoch 014 | valid on 'valid' subset | loss 3.768 | nll_loss 2.199 | ppl 4.59 | wps 0 | wpb 44526 | bsz 2008 | num_updates 22000 | best_loss 3.768 epoch 014 | valid on 'valid' subset | loss 3.768 | nll_loss 2.199 | ppl 4.59 | wps 0 | wpb 44526 | bsz 2008 | num_updates 22000 | best_loss 3.768 epoch 014 | valid on 'valid' subset | loss 3.768 | nll_loss 2.199 | ppl 4.59 | wps 0 | wpb 44526 | bsz 2008 | num_updates 22000 | best_loss 3.768 epoch 014 | valid on 'valid' subset | loss 3.768 | nll_loss 2.199 | ppl 4.59 | wps 0 | wpb 44526 | bsz 2008 | num_updates 22000 | best_loss 3.768 epoch 014 | valid on 'valid' subset | loss 3.768 | nll_loss 2.199 | ppl 4.59 | wps 0 | wpb 44526 | bsz 2008 | num_updates 22000 | best_loss 3.768 epoch 014 | valid on 'valid' subset | loss 3.768 | nll_loss 2.199 | ppl 4.59 | wps 0 | wpb 44526 | bsz 2008 | num_updates 22000 | best_loss 3.768 epoch 014: 178 / 1689 loss=3.757, nll_loss=2.227, ppl=4.68, wps=461644, ups=0.93, wpb=497504, bsz=16527.6, num_updates=22100, lr=0.000425436, gnorm=0.185, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=20497 epoch 014: 178 / 1689 loss=3.757, nll_loss=2.227, ppl=4.68, wps=461644, ups=0.93, wpb=497504, bsz=16527.6, num_updates=22100, lr=0.000425436, gnorm=0.185, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=20497 epoch 014: 178 / 1689 loss=3.757, nll_loss=2.227, ppl=4.68, wps=461644, ups=0.93, wpb=497504, bsz=16527.6, num_updates=22100, lr=0.000425436, gnorm=0.185, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=20497 epoch 014: 178 / 1689 loss=3.757, nll_loss=2.227, ppl=4.68, wps=461644, ups=0.93, wpb=497504, bsz=16527.6, num_updates=22100, lr=0.000425436, gnorm=0.185, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=20497 epoch 014: 178 / 1689 loss=3.757, nll_loss=2.227, ppl=4.68, wps=461644, ups=0.93, wpb=497504, bsz=16527.6, num_updates=22100, lr=0.000425436, gnorm=0.185, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=20497 epoch 014: 178 / 1689 loss=3.757, nll_loss=2.227, ppl=4.68, wps=461644, ups=0.93, wpb=497504, bsz=16527.6, num_updates=22100, lr=0.000425436, gnorm=0.185, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=20497 epoch 014: 178 / 1689 loss=3.757, nll_loss=2.227, ppl=4.68, wps=461644, ups=0.93, wpb=497504, bsz=16527.6, num_updates=22100, lr=0.000425436, gnorm=0.185, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=20497 epoch 014: 178 / 1689 loss=3.757, nll_loss=2.227, ppl=4.68, wps=461644, ups=0.93, wpb=497504, bsz=16527.6, num_updates=22100, lr=0.000425436, gnorm=0.185, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=20497 epoch 014: 178 / 1689 loss=3.757, nll_loss=2.227, ppl=4.68, wps=461644, ups=0.93, wpb=497504, bsz=16527.6, num_updates=22100, lr=0.000425436, gnorm=0.185, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=20497 epoch 014: 178 / 1689 loss=3.757, nll_loss=2.227, ppl=4.68, wps=461644, ups=0.93, wpb=497504, bsz=16527.6, num_updates=22100, lr=0.000425436, gnorm=0.185, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=20497 epoch 014: 178 / 1689 loss=3.757, nll_loss=2.227, ppl=4.68, wps=461644, ups=0.93, wpb=497504, bsz=16527.6, num_updates=22100, lr=0.000425436, gnorm=0.185, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=20497 epoch 014: 178 / 1689 loss=3.757, nll_loss=2.227, ppl=4.68, wps=461644, ups=0.93, wpb=497504, bsz=16527.6, num_updates=22100, lr=0.000425436, gnorm=0.185, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=20497 epoch 014: 178 / 1689 loss=3.757, nll_loss=2.227, ppl=4.68, wps=461644, ups=0.93, wpb=497504, bsz=16527.6, num_updates=22100, lr=0.000425436, gnorm=0.185, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=20497 epoch 014: 178 / 1689 loss=3.757, nll_loss=2.227, ppl=4.68, wps=461644, ups=0.93, wpb=497504, bsz=16527.6, num_updates=22100, lr=0.000425436, gnorm=0.185, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=20497 epoch 014: 278 / 1689 loss=3.764, nll_loss=2.234, ppl=4.71, wps=553363, ups=1.11, wpb=496719, bsz=16347, num_updates=22200, lr=0.000424476, gnorm=0.184, clip=0, loss_scale=4, train_wall=88, gb_free=20.5, wall=20587 epoch 014: 278 / 1689 loss=3.764, nll_loss=2.234, ppl=4.71, wps=553363, ups=1.11, wpb=496719, bsz=16347, num_updates=22200, lr=0.000424476, gnorm=0.184, clip=0, loss_scale=4, train_wall=88, gb_free=20.5, wall=20587 epoch 014: 278 / 1689 loss=3.764, nll_loss=2.234, ppl=4.71, wps=553363, ups=1.11, wpb=496719, bsz=16347, num_updates=22200, lr=0.000424476, gnorm=0.184, clip=0, loss_scale=4, train_wall=88, gb_free=20.5, wall=20587 epoch 014: 278 / 1689 loss=3.764, nll_loss=2.234, ppl=4.71, wps=553363, ups=1.11, wpb=496719, bsz=16347, num_updates=22200, lr=0.000424476, gnorm=0.184, clip=0, loss_scale=4, train_wall=88, gb_free=20.5, wall=20587 epoch 014: 278 / 1689 loss=3.764, nll_loss=2.234, ppl=4.71, wps=553363, ups=1.11, wpb=496719, bsz=16347, num_updates=22200, lr=0.000424476, gnorm=0.184, clip=0, loss_scale=4, train_wall=88, gb_free=20.5, wall=20587 epoch 014: 278 / 1689 loss=3.764, nll_loss=2.234, ppl=4.71, wps=553363, ups=1.11, wpb=496719, bsz=16347, num_updates=22200, lr=0.000424476, gnorm=0.184, clip=0, loss_scale=4, train_wall=88, gb_free=20.5, wall=20587 epoch 014: 278 / 1689 loss=3.764, nll_loss=2.234, ppl=4.71, wps=553363, ups=1.11, wpb=496719, bsz=16347, num_updates=22200, lr=0.000424476, gnorm=0.184, clip=0, loss_scale=4, train_wall=88, gb_free=20.5, wall=20587 epoch 014: 278 / 1689 loss=3.764, nll_loss=2.234, ppl=4.71, wps=553363, ups=1.11, wpb=496719, bsz=16347, num_updates=22200, lr=0.000424476, gnorm=0.184, clip=0, loss_scale=4, train_wall=88, gb_free=20.5, wall=20587 epoch 014: 278 / 1689 loss=3.764, nll_loss=2.234, ppl=4.71, wps=553363, ups=1.11, wpb=496719, bsz=16347, num_updates=22200, lr=0.000424476, gnorm=0.184, clip=0, loss_scale=4, train_wall=88, gb_free=20.5, wall=20587 epoch 014: 278 / 1689 loss=3.764, nll_loss=2.234, ppl=4.71, wps=553363, ups=1.11, wpb=496719, bsz=16347, num_updates=22200, lr=0.000424476, gnorm=0.184, clip=0, loss_scale=4, train_wall=88, gb_free=20.5, wall=20587 epoch 014: 278 / 1689 loss=3.764, nll_loss=2.234, ppl=4.71, wps=553363, ups=1.11, wpb=496719, bsz=16347, num_updates=22200, lr=0.000424476, gnorm=0.184, clip=0, loss_scale=4, train_wall=88, gb_free=20.5, wall=20587 epoch 014: 278 / 1689 loss=3.764, nll_loss=2.234, ppl=4.71, wps=553363, ups=1.11, wpb=496719, bsz=16347, num_updates=22200, lr=0.000424476, gnorm=0.184, clip=0, loss_scale=4, train_wall=88, gb_free=20.5, wall=20587 epoch 014: 278 / 1689 loss=3.764, nll_loss=2.234, ppl=4.71, wps=553363, ups=1.11, wpb=496719, bsz=16347, num_updates=22200, lr=0.000424476, gnorm=0.184, clip=0, loss_scale=4, train_wall=88, gb_free=20.5, wall=20587 epoch 014: 278 / 1689 loss=3.764, nll_loss=2.234, ppl=4.71, wps=553363, ups=1.11, wpb=496719, bsz=16347, num_updates=22200, lr=0.000424476, gnorm=0.184, clip=0, loss_scale=4, train_wall=88, gb_free=20.5, wall=20587 epoch 014: 378 / 1689 loss=3.765, nll_loss=2.237, ppl=4.71, wps=550992, ups=1.11, wpb=495921, bsz=16753, num_updates=22300, lr=0.000423524, gnorm=0.193, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=20677 epoch 014: 378 / 1689 loss=3.765, nll_loss=2.237, ppl=4.71, wps=550992, ups=1.11, wpb=495921, bsz=16753, num_updates=22300, lr=0.000423524, gnorm=0.193, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=20677 epoch 014: 378 / 1689 loss=3.765, nll_loss=2.237, ppl=4.71, wps=550992, ups=1.11, wpb=495921, bsz=16753, num_updates=22300, lr=0.000423524, gnorm=0.193, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=20677 epoch 014: 378 / 1689 loss=3.765, nll_loss=2.237, ppl=4.71, wps=550992, ups=1.11, wpb=495921, bsz=16753, num_updates=22300, lr=0.000423524, gnorm=0.193, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=20677 epoch 014: 378 / 1689 loss=3.765, nll_loss=2.237, ppl=4.71, wps=550992, ups=1.11, wpb=495921, bsz=16753, num_updates=22300, lr=0.000423524, gnorm=0.193, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=20677 epoch 014: 378 / 1689 loss=3.765, nll_loss=2.237, ppl=4.71, wps=550992, ups=1.11, wpb=495921, bsz=16753, num_updates=22300, lr=0.000423524, gnorm=0.193, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=20677 epoch 014: 378 / 1689 loss=3.765, nll_loss=2.237, ppl=4.71, wps=550992, ups=1.11, wpb=495921, bsz=16753, num_updates=22300, lr=0.000423524, gnorm=0.193, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=20677 epoch 014: 378 / 1689 loss=3.765, nll_loss=2.237, ppl=4.71, wps=550992, ups=1.11, wpb=495921, bsz=16753, num_updates=22300, lr=0.000423524, gnorm=0.193, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=20677 epoch 014: 378 / 1689 loss=3.765, nll_loss=2.237, ppl=4.71, wps=550992, ups=1.11, wpb=495921, bsz=16753, num_updates=22300, lr=0.000423524, gnorm=0.193, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=20677 epoch 014: 378 / 1689 loss=3.765, nll_loss=2.237, ppl=4.71, wps=550992, ups=1.11, wpb=495921, bsz=16753, num_updates=22300, lr=0.000423524, gnorm=0.193, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=20677 epoch 014: 378 / 1689 loss=3.765, nll_loss=2.237, ppl=4.71, wps=550992, ups=1.11, wpb=495921, bsz=16753, num_updates=22300, lr=0.000423524, gnorm=0.193, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=20677 epoch 014: 378 / 1689 loss=3.765, nll_loss=2.237, ppl=4.71, wps=550992, ups=1.11, wpb=495921, bsz=16753, num_updates=22300, lr=0.000423524, gnorm=0.193, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=20677 epoch 014: 378 / 1689 loss=3.765, nll_loss=2.237, ppl=4.71, wps=550992, ups=1.11, wpb=495921, bsz=16753, num_updates=22300, lr=0.000423524, gnorm=0.193, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=20677 epoch 014: 378 / 1689 loss=3.765, nll_loss=2.237, ppl=4.71, wps=550992, ups=1.11, wpb=495921, bsz=16753, num_updates=22300, lr=0.000423524, gnorm=0.193, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=20677 epoch 014: 478 / 1689 loss=3.77, nll_loss=2.242, ppl=4.73, wps=549359, ups=1.11, wpb=495147, bsz=16538.2, num_updates=22400, lr=0.000422577, gnorm=0.192, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=20767 epoch 014: 478 / 1689 loss=3.77, nll_loss=2.242, ppl=4.73, wps=549359, ups=1.11, wpb=495147, bsz=16538.2, num_updates=22400, lr=0.000422577, gnorm=0.192, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=20767 epoch 014: 478 / 1689 loss=3.77, nll_loss=2.242, ppl=4.73, wps=549359, ups=1.11, wpb=495147, bsz=16538.2, num_updates=22400, lr=0.000422577, gnorm=0.192, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=20767 epoch 014: 478 / 1689 loss=3.77, nll_loss=2.242, ppl=4.73, wps=549359, ups=1.11, wpb=495147, bsz=16538.2, num_updates=22400, lr=0.000422577, gnorm=0.192, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=20767 epoch 014: 478 / 1689 loss=3.77, nll_loss=2.242, ppl=4.73, wps=549359, ups=1.11, wpb=495147, bsz=16538.2, num_updates=22400, lr=0.000422577, gnorm=0.192, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=20767 epoch 014: 478 / 1689 loss=3.77, nll_loss=2.242, ppl=4.73, wps=549359, ups=1.11, wpb=495147, bsz=16538.2, num_updates=22400, lr=0.000422577, gnorm=0.192, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=20767 epoch 014: 478 / 1689 loss=3.77, nll_loss=2.242, ppl=4.73, wps=549359, ups=1.11, wpb=495147, bsz=16538.2, num_updates=22400, lr=0.000422577, gnorm=0.192, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=20767 epoch 014: 478 / 1689 loss=3.77, nll_loss=2.242, ppl=4.73, wps=549359, ups=1.11, wpb=495147, bsz=16538.2, num_updates=22400, lr=0.000422577, gnorm=0.192, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=20767 epoch 014: 478 / 1689 loss=3.77, nll_loss=2.242, ppl=4.73, wps=549359, ups=1.11, wpb=495147, bsz=16538.2, num_updates=22400, lr=0.000422577, gnorm=0.192, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=20767 epoch 014: 478 / 1689 loss=3.77, nll_loss=2.242, ppl=4.73, wps=549359, ups=1.11, wpb=495147, bsz=16538.2, num_updates=22400, lr=0.000422577, gnorm=0.192, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=20767 epoch 014: 478 / 1689 loss=3.77, nll_loss=2.242, ppl=4.73, wps=549359, ups=1.11, wpb=495147, bsz=16538.2, num_updates=22400, lr=0.000422577, gnorm=0.192, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=20767 epoch 014: 478 / 1689 loss=3.77, nll_loss=2.242, ppl=4.73, wps=549359, ups=1.11, wpb=495147, bsz=16538.2, num_updates=22400, lr=0.000422577, gnorm=0.192, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=20767 epoch 014: 478 / 1689 loss=3.77, nll_loss=2.242, ppl=4.73, wps=549359, ups=1.11, wpb=495147, bsz=16538.2, num_updates=22400, lr=0.000422577, gnorm=0.192, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=20767 epoch 014: 478 / 1689 loss=3.77, nll_loss=2.242, ppl=4.73, wps=549359, ups=1.11, wpb=495147, bsz=16538.2, num_updates=22400, lr=0.000422577, gnorm=0.192, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=20767 epoch 014: 578 / 1689 loss=3.769, nll_loss=2.241, ppl=4.73, wps=556365, ups=1.12, wpb=495228, bsz=16475.4, num_updates=22500, lr=0.000421637, gnorm=0.197, clip=0, loss_scale=4, train_wall=87, gb_free=22, wall=20856 epoch 014: 578 / 1689 loss=3.769, nll_loss=2.241, ppl=4.73, wps=556365, ups=1.12, wpb=495228, bsz=16475.4, num_updates=22500, lr=0.000421637, gnorm=0.197, clip=0, loss_scale=4, train_wall=87, gb_free=22, wall=20856 epoch 014: 578 / 1689 loss=3.769, nll_loss=2.241, ppl=4.73, wps=556365, ups=1.12, wpb=495228, bsz=16475.4, num_updates=22500, lr=0.000421637, gnorm=0.197, clip=0, loss_scale=4, train_wall=87, gb_free=22, wall=20856 epoch 014: 578 / 1689 loss=3.769, nll_loss=2.241, ppl=4.73, wps=556365, ups=1.12, wpb=495228, bsz=16475.4, num_updates=22500, lr=0.000421637, gnorm=0.197, clip=0, loss_scale=4, train_wall=87, gb_free=22, wall=20856 epoch 014: 578 / 1689 loss=3.769, nll_loss=2.241, ppl=4.73, wps=556365, ups=1.12, wpb=495228, bsz=16475.4, num_updates=22500, lr=0.000421637, gnorm=0.197, clip=0, loss_scale=4, train_wall=87, gb_free=22, wall=20856 epoch 014: 578 / 1689 loss=3.769, nll_loss=2.241, ppl=4.73, wps=556365, ups=1.12, wpb=495228, bsz=16475.4, num_updates=22500, lr=0.000421637, gnorm=0.197, clip=0, loss_scale=4, train_wall=87, gb_free=22, wall=20856 epoch 014: 578 / 1689 loss=3.769, nll_loss=2.241, ppl=4.73, wps=556365, ups=1.12, wpb=495228, bsz=16475.4, num_updates=22500, lr=0.000421637, gnorm=0.197, clip=0, loss_scale=4, train_wall=87, gb_free=22, wall=20856 epoch 014: 578 / 1689 loss=3.769, nll_loss=2.241, ppl=4.73, wps=556365, ups=1.12, wpb=495228, bsz=16475.4, num_updates=22500, lr=0.000421637, gnorm=0.197, clip=0, loss_scale=4, train_wall=87, gb_free=22, wall=20856 epoch 014: 578 / 1689 loss=3.769, nll_loss=2.241, ppl=4.73, wps=556365, ups=1.12, wpb=495228, bsz=16475.4, num_updates=22500, lr=0.000421637, gnorm=0.197, clip=0, loss_scale=4, train_wall=87, gb_free=22, wall=20856 epoch 014: 578 / 1689 loss=3.769, nll_loss=2.241, ppl=4.73, wps=556365, ups=1.12, wpb=495228, bsz=16475.4, num_updates=22500, lr=0.000421637, gnorm=0.197, clip=0, loss_scale=4, train_wall=87, gb_free=22, wall=20856 epoch 014: 578 / 1689 loss=3.769, nll_loss=2.241, ppl=4.73, wps=556365, ups=1.12, wpb=495228, bsz=16475.4, num_updates=22500, lr=0.000421637, gnorm=0.197, clip=0, loss_scale=4, train_wall=87, gb_free=22, wall=20856 epoch 014: 578 / 1689 loss=3.769, nll_loss=2.241, ppl=4.73, wps=556365, ups=1.12, wpb=495228, bsz=16475.4, num_updates=22500, lr=0.000421637, gnorm=0.197, clip=0, loss_scale=4, train_wall=87, gb_free=22, wall=20856 epoch 014: 578 / 1689 loss=3.769, nll_loss=2.241, ppl=4.73, wps=556365, ups=1.12, wpb=495228, bsz=16475.4, num_updates=22500, lr=0.000421637, gnorm=0.197, clip=0, loss_scale=4, train_wall=87, gb_free=22, wall=20856 epoch 014: 578 / 1689 loss=3.769, nll_loss=2.241, ppl=4.73, wps=556365, ups=1.12, wpb=495228, bsz=16475.4, num_updates=22500, lr=0.000421637, gnorm=0.197, clip=0, loss_scale=4, train_wall=87, gb_free=22, wall=20856 epoch 014: 679 / 1689 loss=3.77, nll_loss=2.242, ppl=4.73, wps=547096, ups=1.11, wpb=493957, bsz=16260.9, num_updates=22600, lr=0.000420703, gnorm=0.184, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=20946 epoch 014: 679 / 1689 loss=3.77, nll_loss=2.242, ppl=4.73, wps=547096, ups=1.11, wpb=493957, bsz=16260.9, num_updates=22600, lr=0.000420703, gnorm=0.184, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=20946 epoch 014: 679 / 1689 loss=3.77, nll_loss=2.242, ppl=4.73, wps=547096, ups=1.11, wpb=493957, bsz=16260.9, num_updates=22600, lr=0.000420703, gnorm=0.184, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=20946 epoch 014: 679 / 1689 loss=3.77, nll_loss=2.242, ppl=4.73, wps=547096, ups=1.11, wpb=493957, bsz=16260.9, num_updates=22600, lr=0.000420703, gnorm=0.184, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=20946 epoch 014: 679 / 1689 loss=3.77, nll_loss=2.242, ppl=4.73, wps=547096, ups=1.11, wpb=493957, bsz=16260.9, num_updates=22600, lr=0.000420703, gnorm=0.184, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=20946 epoch 014: 679 / 1689 loss=3.77, nll_loss=2.242, ppl=4.73, wps=547096, ups=1.11, wpb=493957, bsz=16260.9, num_updates=22600, lr=0.000420703, gnorm=0.184, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=20946 epoch 014: 679 / 1689 loss=3.77, nll_loss=2.242, ppl=4.73, wps=547096, ups=1.11, wpb=493957, bsz=16260.9, num_updates=22600, lr=0.000420703, gnorm=0.184, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=20946 epoch 014: 679 / 1689 loss=3.77, nll_loss=2.242, ppl=4.73, wps=547096, ups=1.11, wpb=493957, bsz=16260.9, num_updates=22600, lr=0.000420703, gnorm=0.184, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=20946 epoch 014: 679 / 1689 loss=3.77, nll_loss=2.242, ppl=4.73, wps=547096, ups=1.11, wpb=493957, bsz=16260.9, num_updates=22600, lr=0.000420703, gnorm=0.184, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=20946 epoch 014: 679 / 1689 loss=3.77, nll_loss=2.242, ppl=4.73, wps=547096, ups=1.11, wpb=493957, bsz=16260.9, num_updates=22600, lr=0.000420703, gnorm=0.184, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=20946 epoch 014: 679 / 1689 loss=3.77, nll_loss=2.242, ppl=4.73, wps=547096, ups=1.11, wpb=493957, bsz=16260.9, num_updates=22600, lr=0.000420703, gnorm=0.184, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=20946 epoch 014: 679 / 1689 loss=3.77, nll_loss=2.242, ppl=4.73, wps=547096, ups=1.11, wpb=493957, bsz=16260.9, num_updates=22600, lr=0.000420703, gnorm=0.184, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=20946 epoch 014: 679 / 1689 loss=3.77, nll_loss=2.242, ppl=4.73, wps=547096, ups=1.11, wpb=493957, bsz=16260.9, num_updates=22600, lr=0.000420703, gnorm=0.184, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=20946 epoch 014: 679 / 1689 loss=3.77, nll_loss=2.242, ppl=4.73, wps=547096, ups=1.11, wpb=493957, bsz=16260.9, num_updates=22600, lr=0.000420703, gnorm=0.184, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=20946 epoch 014: 779 / 1689 loss=3.766, nll_loss=2.237, ppl=4.72, wps=553397, ups=1.12, wpb=494910, bsz=16442.7, num_updates=22700, lr=0.000419775, gnorm=0.193, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=21035 epoch 014: 779 / 1689 loss=3.766, nll_loss=2.237, ppl=4.72, wps=553397, ups=1.12, wpb=494910, bsz=16442.7, num_updates=22700, lr=0.000419775, gnorm=0.193, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=21035 epoch 014: 779 / 1689 loss=3.766, nll_loss=2.237, ppl=4.72, wps=553397, ups=1.12, wpb=494910, bsz=16442.7, num_updates=22700, lr=0.000419775, gnorm=0.193, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=21035 epoch 014: 779 / 1689 loss=3.766, nll_loss=2.237, ppl=4.72, wps=553397, ups=1.12, wpb=494910, bsz=16442.7, num_updates=22700, lr=0.000419775, gnorm=0.193, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=21035 epoch 014: 779 / 1689 loss=3.766, nll_loss=2.237, ppl=4.72, wps=553397, ups=1.12, wpb=494910, bsz=16442.7, num_updates=22700, lr=0.000419775, gnorm=0.193, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=21035 epoch 014: 779 / 1689 loss=3.766, nll_loss=2.237, ppl=4.72, wps=553397, ups=1.12, wpb=494910, bsz=16442.7, num_updates=22700, lr=0.000419775, gnorm=0.193, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=21035 epoch 014: 779 / 1689 loss=3.766, nll_loss=2.237, ppl=4.72, wps=553397, ups=1.12, wpb=494910, bsz=16442.7, num_updates=22700, lr=0.000419775, gnorm=0.193, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=21035 epoch 014: 779 / 1689 loss=3.766, nll_loss=2.237, ppl=4.72, wps=553397, ups=1.12, wpb=494910, bsz=16442.7, num_updates=22700, lr=0.000419775, gnorm=0.193, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=21035 epoch 014: 779 / 1689 loss=3.766, nll_loss=2.237, ppl=4.72, wps=553397, ups=1.12, wpb=494910, bsz=16442.7, num_updates=22700, lr=0.000419775, gnorm=0.193, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=21035 epoch 014: 779 / 1689 loss=3.766, nll_loss=2.237, ppl=4.72, wps=553397, ups=1.12, wpb=494910, bsz=16442.7, num_updates=22700, lr=0.000419775, gnorm=0.193, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=21035 epoch 014: 779 / 1689 loss=3.766, nll_loss=2.237, ppl=4.72, wps=553397, ups=1.12, wpb=494910, bsz=16442.7, num_updates=22700, lr=0.000419775, gnorm=0.193, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=21035 epoch 014: 779 / 1689 loss=3.766, nll_loss=2.237, ppl=4.72, wps=553397, ups=1.12, wpb=494910, bsz=16442.7, num_updates=22700, lr=0.000419775, gnorm=0.193, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=21035 epoch 014: 779 / 1689 loss=3.766, nll_loss=2.237, ppl=4.72, wps=553397, ups=1.12, wpb=494910, bsz=16442.7, num_updates=22700, lr=0.000419775, gnorm=0.193, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=21035 epoch 014: 779 / 1689 loss=3.766, nll_loss=2.237, ppl=4.72, wps=553397, ups=1.12, wpb=494910, bsz=16442.7, num_updates=22700, lr=0.000419775, gnorm=0.193, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=21035 epoch 014: 879 / 1689 loss=3.768, nll_loss=2.24, ppl=4.72, wps=552300, ups=1.11, wpb=495524, bsz=16659.3, num_updates=22800, lr=0.000418854, gnorm=0.192, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=21125 epoch 014: 879 / 1689 loss=3.768, nll_loss=2.24, ppl=4.72, wps=552300, ups=1.11, wpb=495524, bsz=16659.3, num_updates=22800, lr=0.000418854, gnorm=0.192, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=21125 epoch 014: 879 / 1689 loss=3.768, nll_loss=2.24, ppl=4.72, wps=552300, ups=1.11, wpb=495524, bsz=16659.3, num_updates=22800, lr=0.000418854, gnorm=0.192, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=21125 epoch 014: 879 / 1689 loss=3.768, nll_loss=2.24, ppl=4.72, wps=552300, ups=1.11, wpb=495524, bsz=16659.3, num_updates=22800, lr=0.000418854, gnorm=0.192, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=21125 epoch 014: 879 / 1689 loss=3.768, nll_loss=2.24, ppl=4.72, wps=552300, ups=1.11, wpb=495524, bsz=16659.3, num_updates=22800, lr=0.000418854, gnorm=0.192, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=21125 epoch 014: 879 / 1689 loss=3.768, nll_loss=2.24, ppl=4.72, wps=552300, ups=1.11, wpb=495524, bsz=16659.3, num_updates=22800, lr=0.000418854, gnorm=0.192, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=21125 epoch 014: 879 / 1689 loss=3.768, nll_loss=2.24, ppl=4.72, wps=552300, ups=1.11, wpb=495524, bsz=16659.3, num_updates=22800, lr=0.000418854, gnorm=0.192, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=21125 epoch 014: 879 / 1689 loss=3.768, nll_loss=2.24, ppl=4.72, wps=552300, ups=1.11, wpb=495524, bsz=16659.3, num_updates=22800, lr=0.000418854, gnorm=0.192, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=21125 epoch 014: 879 / 1689 loss=3.768, nll_loss=2.24, ppl=4.72, wps=552300, ups=1.11, wpb=495524, bsz=16659.3, num_updates=22800, lr=0.000418854, gnorm=0.192, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=21125 epoch 014: 879 / 1689 loss=3.768, nll_loss=2.24, ppl=4.72, wps=552300, ups=1.11, wpb=495524, bsz=16659.3, num_updates=22800, lr=0.000418854, gnorm=0.192, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=21125 epoch 014: 879 / 1689 loss=3.768, nll_loss=2.24, ppl=4.72, wps=552300, ups=1.11, wpb=495524, bsz=16659.3, num_updates=22800, lr=0.000418854, gnorm=0.192, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=21125 epoch 014: 879 / 1689 loss=3.768, nll_loss=2.24, ppl=4.72, wps=552300, ups=1.11, wpb=495524, bsz=16659.3, num_updates=22800, lr=0.000418854, gnorm=0.192, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=21125 epoch 014: 879 / 1689 loss=3.768, nll_loss=2.24, ppl=4.72, wps=552300, ups=1.11, wpb=495524, bsz=16659.3, num_updates=22800, lr=0.000418854, gnorm=0.192, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=21125 epoch 014: 879 / 1689 loss=3.768, nll_loss=2.24, ppl=4.72, wps=552300, ups=1.11, wpb=495524, bsz=16659.3, num_updates=22800, lr=0.000418854, gnorm=0.192, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=21125 epoch 014: 979 / 1689 loss=3.772, nll_loss=2.244, ppl=4.74, wps=544871, ups=1.1, wpb=494556, bsz=16733, num_updates=22900, lr=0.000417938, gnorm=0.194, clip=0, loss_scale=4, train_wall=89, gb_free=21.3, wall=21216 epoch 014: 979 / 1689 loss=3.772, nll_loss=2.244, ppl=4.74, wps=544871, ups=1.1, wpb=494556, bsz=16733, num_updates=22900, lr=0.000417938, gnorm=0.194, clip=0, loss_scale=4, train_wall=89, gb_free=21.3, wall=21216 epoch 014: 979 / 1689 loss=3.772, nll_loss=2.244, ppl=4.74, wps=544871, ups=1.1, wpb=494556, bsz=16733, num_updates=22900, lr=0.000417938, gnorm=0.194, clip=0, loss_scale=4, train_wall=89, gb_free=21.3, wall=21216 epoch 014: 979 / 1689 loss=3.772, nll_loss=2.244, ppl=4.74, wps=544871, ups=1.1, wpb=494556, bsz=16733, num_updates=22900, lr=0.000417938, gnorm=0.194, clip=0, loss_scale=4, train_wall=89, gb_free=21.3, wall=21216 epoch 014: 979 / 1689 loss=3.772, nll_loss=2.244, ppl=4.74, wps=544871, ups=1.1, wpb=494556, bsz=16733, num_updates=22900, lr=0.000417938, gnorm=0.194, clip=0, loss_scale=4, train_wall=89, gb_free=21.3, wall=21216 epoch 014: 979 / 1689 loss=3.772, nll_loss=2.244, ppl=4.74, wps=544871, ups=1.1, wpb=494556, bsz=16733, num_updates=22900, lr=0.000417938, gnorm=0.194, clip=0, loss_scale=4, train_wall=89, gb_free=21.3, wall=21216 epoch 014: 979 / 1689 loss=3.772, nll_loss=2.244, ppl=4.74, wps=544871, ups=1.1, wpb=494556, bsz=16733, num_updates=22900, lr=0.000417938, gnorm=0.194, clip=0, loss_scale=4, train_wall=89, gb_free=21.3, wall=21216 epoch 014: 979 / 1689 loss=3.772, nll_loss=2.244, ppl=4.74, wps=544871, ups=1.1, wpb=494556, bsz=16733, num_updates=22900, lr=0.000417938, gnorm=0.194, clip=0, loss_scale=4, train_wall=89, gb_free=21.3, wall=21216 epoch 014: 979 / 1689 loss=3.772, nll_loss=2.244, ppl=4.74, wps=544871, ups=1.1, wpb=494556, bsz=16733, num_updates=22900, lr=0.000417938, gnorm=0.194, clip=0, loss_scale=4, train_wall=89, gb_free=21.3, wall=21216 epoch 014: 979 / 1689 loss=3.772, nll_loss=2.244, ppl=4.74, wps=544871, ups=1.1, wpb=494556, bsz=16733, num_updates=22900, lr=0.000417938, gnorm=0.194, clip=0, loss_scale=4, train_wall=89, gb_free=21.3, wall=21216 epoch 014: 979 / 1689 loss=3.772, nll_loss=2.244, ppl=4.74, wps=544871, ups=1.1, wpb=494556, bsz=16733, num_updates=22900, lr=0.000417938, gnorm=0.194, clip=0, loss_scale=4, train_wall=89, gb_free=21.3, wall=21216 epoch 014: 979 / 1689 loss=3.772, nll_loss=2.244, ppl=4.74, wps=544871, ups=1.1, wpb=494556, bsz=16733, num_updates=22900, lr=0.000417938, gnorm=0.194, clip=0, loss_scale=4, train_wall=89, gb_free=21.3, wall=21216 epoch 014: 979 / 1689 loss=3.772, nll_loss=2.244, ppl=4.74, wps=544871, ups=1.1, wpb=494556, bsz=16733, num_updates=22900, lr=0.000417938, gnorm=0.194, clip=0, loss_scale=4, train_wall=89, gb_free=21.3, wall=21216 epoch 014: 979 / 1689 loss=3.772, nll_loss=2.244, ppl=4.74, wps=544871, ups=1.1, wpb=494556, bsz=16733, num_updates=22900, lr=0.000417938, gnorm=0.194, clip=0, loss_scale=4, train_wall=89, gb_free=21.3, wall=21216 epoch 014: 1080 / 1689 loss=3.771, nll_loss=2.244, ppl=4.74, wps=540104, ups=1.09, wpb=494960, bsz=16522.5, num_updates=23000, lr=0.000417029, gnorm=0.193, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=21308 epoch 014: 1080 / 1689 loss=3.771, nll_loss=2.244, ppl=4.74, wps=540104, ups=1.09, wpb=494960, bsz=16522.5, num_updates=23000, lr=0.000417029, gnorm=0.193, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=21308 epoch 014: 1080 / 1689 loss=3.771, nll_loss=2.244, ppl=4.74, wps=540104, ups=1.09, wpb=494960, bsz=16522.5, num_updates=23000, lr=0.000417029, gnorm=0.193, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=21308 epoch 014: 1080 / 1689 loss=3.771, nll_loss=2.244, ppl=4.74, wps=540104, ups=1.09, wpb=494960, bsz=16522.5, num_updates=23000, lr=0.000417029, gnorm=0.193, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=21308 epoch 014: 1080 / 1689 loss=3.771, nll_loss=2.244, ppl=4.74, wps=540104, ups=1.09, wpb=494960, bsz=16522.5, num_updates=23000, lr=0.000417029, gnorm=0.193, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=21308 epoch 014: 1080 / 1689 loss=3.771, nll_loss=2.244, ppl=4.74, wps=540104, ups=1.09, wpb=494960, bsz=16522.5, num_updates=23000, lr=0.000417029, gnorm=0.193, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=21308 epoch 014: 1080 / 1689 loss=3.771, nll_loss=2.244, ppl=4.74, wps=540104, ups=1.09, wpb=494960, bsz=16522.5, num_updates=23000, lr=0.000417029, gnorm=0.193, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=21308 epoch 014: 1080 / 1689 loss=3.771, nll_loss=2.244, ppl=4.74, wps=540104, ups=1.09, wpb=494960, bsz=16522.5, num_updates=23000, lr=0.000417029, gnorm=0.193, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=21308 epoch 014: 1080 / 1689 loss=3.771, nll_loss=2.244, ppl=4.74, wps=540104, ups=1.09, wpb=494960, bsz=16522.5, num_updates=23000, lr=0.000417029, gnorm=0.193, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=21308 epoch 014: 1080 / 1689 loss=3.771, nll_loss=2.244, ppl=4.74, wps=540104, ups=1.09, wpb=494960, bsz=16522.5, num_updates=23000, lr=0.000417029, gnorm=0.193, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=21308 epoch 014: 1080 / 1689 loss=3.771, nll_loss=2.244, ppl=4.74, wps=540104, ups=1.09, wpb=494960, bsz=16522.5, num_updates=23000, lr=0.000417029, gnorm=0.193, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=21308 epoch 014: 1080 / 1689 loss=3.771, nll_loss=2.244, ppl=4.74, wps=540104, ups=1.09, wpb=494960, bsz=16522.5, num_updates=23000, lr=0.000417029, gnorm=0.193, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=21308 epoch 014: 1080 / 1689 loss=3.771, nll_loss=2.244, ppl=4.74, wps=540104, ups=1.09, wpb=494960, bsz=16522.5, num_updates=23000, lr=0.000417029, gnorm=0.193, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=21308 epoch 014: 1080 / 1689 loss=3.771, nll_loss=2.244, ppl=4.74, wps=540104, ups=1.09, wpb=494960, bsz=16522.5, num_updates=23000, lr=0.000417029, gnorm=0.193, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=21308 begin validation on "valid" subset epoch 014 | valid on 'valid' subset | loss 3.757 | nll_loss 2.187 | ppl 4.55 | wps 0 | wpb 44526 | bsz 2008 | num_updates 23000 | best_loss 3.757 epoch 014 | valid on 'valid' subset | loss 3.757 | nll_loss 2.187 | ppl 4.55 | wps 0 | wpb 44526 | bsz 2008 | num_updates 23000 | best_loss 3.757 epoch 014 | valid on 'valid' subset | loss 3.757 | nll_loss 2.187 | ppl 4.55 | wps 0 | wpb 44526 | bsz 2008 | num_updates 23000 | best_loss 3.757 epoch 014 | valid on 'valid' subset | loss 3.757 | nll_loss 2.187 | ppl 4.55 | wps 0 | wpb 44526 | bsz 2008 | num_updates 23000 | best_loss 3.757 epoch 014 | valid on 'valid' subset | loss 3.757 | nll_loss 2.187 | ppl 4.55 | wps 0 | wpb 44526 | bsz 2008 | num_updates 23000 | best_loss 3.757 epoch 014 | valid on 'valid' subset | loss 3.757 | nll_loss 2.187 | ppl 4.55 | wps 0 | wpb 44526 | bsz 2008 | num_updates 23000 | best_loss 3.757 epoch 014 | valid on 'valid' subset | loss 3.757 | nll_loss 2.187 | ppl 4.55 | wps 0 | wpb 44526 | bsz 2008 | num_updates 23000 | best_loss 3.757 epoch 014 | valid on 'valid' subset | loss 3.757 | nll_loss 2.187 | ppl 4.55 | wps 0 | wpb 44526 | bsz 2008 | num_updates 23000 | best_loss 3.757 epoch 014 | valid on 'valid' subset | loss 3.757 | nll_loss 2.187 | ppl 4.55 | wps 0 | wpb 44526 | bsz 2008 | num_updates 23000 | best_loss 3.757 epoch 014 | valid on 'valid' subset | loss 3.757 | nll_loss 2.187 | ppl 4.55 | wps 0 | wpb 44526 | bsz 2008 | num_updates 23000 | best_loss 3.757 epoch 014 | valid on 'valid' subset | loss 3.757 | nll_loss 2.187 | ppl 4.55 | wps 0 | wpb 44526 | bsz 2008 | num_updates 23000 | best_loss 3.757 epoch 014 | valid on 'valid' subset | loss 3.757 | nll_loss 2.187 | ppl 4.55 | wps 0 | wpb 44526 | bsz 2008 | num_updates 23000 | best_loss 3.757 epoch 014 | valid on 'valid' subset | loss 3.757 | nll_loss 2.187 | ppl 4.55 | wps 0 | wpb 44526 | bsz 2008 | num_updates 23000 | best_loss 3.757 epoch 014 | valid on 'valid' subset | loss 3.757 | nll_loss 2.187 | ppl 4.55 | wps 0 | wpb 44526 | bsz 2008 | num_updates 23000 | best_loss 3.757 epoch 014: 1180 / 1689 loss=3.772, nll_loss=2.245, ppl=4.74, wps=445585, ups=0.9, wpb=493877, bsz=16457.9, num_updates=23100, lr=0.000416125, gnorm=0.201, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=21418 epoch 014: 1180 / 1689 loss=3.772, nll_loss=2.245, ppl=4.74, wps=445585, ups=0.9, wpb=493877, bsz=16457.9, num_updates=23100, lr=0.000416125, gnorm=0.201, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=21418 epoch 014: 1180 / 1689 loss=3.772, nll_loss=2.245, ppl=4.74, wps=445585, ups=0.9, wpb=493877, bsz=16457.9, num_updates=23100, lr=0.000416125, gnorm=0.201, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=21418 epoch 014: 1180 / 1689 loss=3.772, nll_loss=2.245, ppl=4.74, wps=445585, ups=0.9, wpb=493877, bsz=16457.9, num_updates=23100, lr=0.000416125, gnorm=0.201, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=21418 epoch 014: 1180 / 1689 loss=3.772, nll_loss=2.245, ppl=4.74, wps=445585, ups=0.9, wpb=493877, bsz=16457.9, num_updates=23100, lr=0.000416125, gnorm=0.201, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=21418 epoch 014: 1180 / 1689 loss=3.772, nll_loss=2.245, ppl=4.74, wps=445585, ups=0.9, wpb=493877, bsz=16457.9, num_updates=23100, lr=0.000416125, gnorm=0.201, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=21418 epoch 014: 1180 / 1689 loss=3.772, nll_loss=2.245, ppl=4.74, wps=445585, ups=0.9, wpb=493877, bsz=16457.9, num_updates=23100, lr=0.000416125, gnorm=0.201, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=21418 epoch 014: 1180 / 1689 loss=3.772, nll_loss=2.245, ppl=4.74, wps=445585, ups=0.9, wpb=493877, bsz=16457.9, num_updates=23100, lr=0.000416125, gnorm=0.201, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=21418 epoch 014: 1180 / 1689 loss=3.772, nll_loss=2.245, ppl=4.74, wps=445585, ups=0.9, wpb=493877, bsz=16457.9, num_updates=23100, lr=0.000416125, gnorm=0.201, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=21418 epoch 014: 1180 / 1689 loss=3.772, nll_loss=2.245, ppl=4.74, wps=445585, ups=0.9, wpb=493877, bsz=16457.9, num_updates=23100, lr=0.000416125, gnorm=0.201, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=21418 epoch 014: 1180 / 1689 loss=3.772, nll_loss=2.245, ppl=4.74, wps=445585, ups=0.9, wpb=493877, bsz=16457.9, num_updates=23100, lr=0.000416125, gnorm=0.201, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=21418 epoch 014: 1180 / 1689 loss=3.772, nll_loss=2.245, ppl=4.74, wps=445585, ups=0.9, wpb=493877, bsz=16457.9, num_updates=23100, lr=0.000416125, gnorm=0.201, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=21418 epoch 014: 1180 / 1689 loss=3.772, nll_loss=2.245, ppl=4.74, wps=445585, ups=0.9, wpb=493877, bsz=16457.9, num_updates=23100, lr=0.000416125, gnorm=0.201, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=21418 epoch 014: 1180 / 1689 loss=3.772, nll_loss=2.245, ppl=4.74, wps=445585, ups=0.9, wpb=493877, bsz=16457.9, num_updates=23100, lr=0.000416125, gnorm=0.201, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=21418 epoch 014: 1280 / 1689 loss=3.773, nll_loss=2.246, ppl=4.74, wps=553044, ups=1.11, wpb=496123, bsz=16284.1, num_updates=23200, lr=0.000415227, gnorm=0.186, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=21508 epoch 014: 1280 / 1689 loss=3.773, nll_loss=2.246, ppl=4.74, wps=553044, ups=1.11, wpb=496123, bsz=16284.1, num_updates=23200, lr=0.000415227, gnorm=0.186, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=21508 epoch 014: 1280 / 1689 loss=3.773, nll_loss=2.246, ppl=4.74, wps=553044, ups=1.11, wpb=496123, bsz=16284.1, num_updates=23200, lr=0.000415227, gnorm=0.186, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=21508 epoch 014: 1280 / 1689 loss=3.773, nll_loss=2.246, ppl=4.74, wps=553044, ups=1.11, wpb=496123, bsz=16284.1, num_updates=23200, lr=0.000415227, gnorm=0.186, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=21508 epoch 014: 1280 / 1689 loss=3.773, nll_loss=2.246, ppl=4.74, wps=553044, ups=1.11, wpb=496123, bsz=16284.1, num_updates=23200, lr=0.000415227, gnorm=0.186, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=21508 epoch 014: 1280 / 1689 loss=3.773, nll_loss=2.246, ppl=4.74, wps=553044, ups=1.11, wpb=496123, bsz=16284.1, num_updates=23200, lr=0.000415227, gnorm=0.186, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=21508 epoch 014: 1280 / 1689 loss=3.773, nll_loss=2.246, ppl=4.74, wps=553044, ups=1.11, wpb=496123, bsz=16284.1, num_updates=23200, lr=0.000415227, gnorm=0.186, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=21508 epoch 014: 1280 / 1689 loss=3.773, nll_loss=2.246, ppl=4.74, wps=553044, ups=1.11, wpb=496123, bsz=16284.1, num_updates=23200, lr=0.000415227, gnorm=0.186, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=21508 epoch 014: 1280 / 1689 loss=3.773, nll_loss=2.246, ppl=4.74, wps=553044, ups=1.11, wpb=496123, bsz=16284.1, num_updates=23200, lr=0.000415227, gnorm=0.186, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=21508 epoch 014: 1280 / 1689 loss=3.773, nll_loss=2.246, ppl=4.74, wps=553044, ups=1.11, wpb=496123, bsz=16284.1, num_updates=23200, lr=0.000415227, gnorm=0.186, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=21508 epoch 014: 1280 / 1689 loss=3.773, nll_loss=2.246, ppl=4.74, wps=553044, ups=1.11, wpb=496123, bsz=16284.1, num_updates=23200, lr=0.000415227, gnorm=0.186, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=21508 epoch 014: 1280 / 1689 loss=3.773, nll_loss=2.246, ppl=4.74, wps=553044, ups=1.11, wpb=496123, bsz=16284.1, num_updates=23200, lr=0.000415227, gnorm=0.186, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=21508 epoch 014: 1280 / 1689 loss=3.773, nll_loss=2.246, ppl=4.74, wps=553044, ups=1.11, wpb=496123, bsz=16284.1, num_updates=23200, lr=0.000415227, gnorm=0.186, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=21508 epoch 014: 1280 / 1689 loss=3.773, nll_loss=2.246, ppl=4.74, wps=553044, ups=1.11, wpb=496123, bsz=16284.1, num_updates=23200, lr=0.000415227, gnorm=0.186, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=21508 epoch 014: 1380 / 1689 loss=3.774, nll_loss=2.247, ppl=4.75, wps=553677, ups=1.12, wpb=495380, bsz=16312.1, num_updates=23300, lr=0.000414335, gnorm=0.182, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=21598 epoch 014: 1380 / 1689 loss=3.774, nll_loss=2.247, ppl=4.75, wps=553677, ups=1.12, wpb=495380, bsz=16312.1, num_updates=23300, lr=0.000414335, gnorm=0.182, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=21598 epoch 014: 1380 / 1689 loss=3.774, nll_loss=2.247, ppl=4.75, wps=553677, ups=1.12, wpb=495380, bsz=16312.1, num_updates=23300, lr=0.000414335, gnorm=0.182, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=21598 epoch 014: 1380 / 1689 loss=3.774, nll_loss=2.247, ppl=4.75, wps=553677, ups=1.12, wpb=495380, bsz=16312.1, num_updates=23300, lr=0.000414335, gnorm=0.182, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=21598 epoch 014: 1380 / 1689 loss=3.774, nll_loss=2.247, ppl=4.75, wps=553677, ups=1.12, wpb=495380, bsz=16312.1, num_updates=23300, lr=0.000414335, gnorm=0.182, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=21598 epoch 014: 1380 / 1689 loss=3.774, nll_loss=2.247, ppl=4.75, wps=553677, ups=1.12, wpb=495380, bsz=16312.1, num_updates=23300, lr=0.000414335, gnorm=0.182, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=21598 epoch 014: 1380 / 1689 loss=3.774, nll_loss=2.247, ppl=4.75, wps=553677, ups=1.12, wpb=495380, bsz=16312.1, num_updates=23300, lr=0.000414335, gnorm=0.182, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=21598 epoch 014: 1380 / 1689 loss=3.774, nll_loss=2.247, ppl=4.75, wps=553677, ups=1.12, wpb=495380, bsz=16312.1, num_updates=23300, lr=0.000414335, gnorm=0.182, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=21598 epoch 014: 1380 / 1689 loss=3.774, nll_loss=2.247, ppl=4.75, wps=553677, ups=1.12, wpb=495380, bsz=16312.1, num_updates=23300, lr=0.000414335, gnorm=0.182, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=21598 epoch 014: 1380 / 1689 loss=3.774, nll_loss=2.247, ppl=4.75, wps=553677, ups=1.12, wpb=495380, bsz=16312.1, num_updates=23300, lr=0.000414335, gnorm=0.182, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=21598 epoch 014: 1380 / 1689 loss=3.774, nll_loss=2.247, ppl=4.75, wps=553677, ups=1.12, wpb=495380, bsz=16312.1, num_updates=23300, lr=0.000414335, gnorm=0.182, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=21598 epoch 014: 1380 / 1689 loss=3.774, nll_loss=2.247, ppl=4.75, wps=553677, ups=1.12, wpb=495380, bsz=16312.1, num_updates=23300, lr=0.000414335, gnorm=0.182, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=21598 epoch 014: 1380 / 1689 loss=3.774, nll_loss=2.247, ppl=4.75, wps=553677, ups=1.12, wpb=495380, bsz=16312.1, num_updates=23300, lr=0.000414335, gnorm=0.182, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=21598 epoch 014: 1380 / 1689 loss=3.774, nll_loss=2.247, ppl=4.75, wps=553677, ups=1.12, wpb=495380, bsz=16312.1, num_updates=23300, lr=0.000414335, gnorm=0.182, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=21598 epoch 014: 1480 / 1689 loss=3.773, nll_loss=2.246, ppl=4.74, wps=552856, ups=1.12, wpb=495087, bsz=16655, num_updates=23400, lr=0.000413449, gnorm=0.18, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=21687 epoch 014: 1480 / 1689 loss=3.773, nll_loss=2.246, ppl=4.74, wps=552856, ups=1.12, wpb=495087, bsz=16655, num_updates=23400, lr=0.000413449, gnorm=0.18, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=21687 epoch 014: 1480 / 1689 loss=3.773, nll_loss=2.246, ppl=4.74, wps=552856, ups=1.12, wpb=495087, bsz=16655, num_updates=23400, lr=0.000413449, gnorm=0.18, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=21687 epoch 014: 1480 / 1689 loss=3.773, nll_loss=2.246, ppl=4.74, wps=552856, ups=1.12, wpb=495087, bsz=16655, num_updates=23400, lr=0.000413449, gnorm=0.18, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=21687 epoch 014: 1480 / 1689 loss=3.773, nll_loss=2.246, ppl=4.74, wps=552856, ups=1.12, wpb=495087, bsz=16655, num_updates=23400, lr=0.000413449, gnorm=0.18, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=21687 epoch 014: 1480 / 1689 loss=3.773, nll_loss=2.246, ppl=4.74, wps=552856, ups=1.12, wpb=495087, bsz=16655, num_updates=23400, lr=0.000413449, gnorm=0.18, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=21687 epoch 014: 1480 / 1689 loss=3.773, nll_loss=2.246, ppl=4.74, wps=552856, ups=1.12, wpb=495087, bsz=16655, num_updates=23400, lr=0.000413449, gnorm=0.18, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=21687 epoch 014: 1480 / 1689 loss=3.773, nll_loss=2.246, ppl=4.74, wps=552856, ups=1.12, wpb=495087, bsz=16655, num_updates=23400, lr=0.000413449, gnorm=0.18, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=21687 epoch 014: 1480 / 1689 loss=3.773, nll_loss=2.246, ppl=4.74, wps=552856, ups=1.12, wpb=495087, bsz=16655, num_updates=23400, lr=0.000413449, gnorm=0.18, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=21687 epoch 014: 1480 / 1689 loss=3.773, nll_loss=2.246, ppl=4.74, wps=552856, ups=1.12, wpb=495087, bsz=16655, num_updates=23400, lr=0.000413449, gnorm=0.18, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=21687 epoch 014: 1480 / 1689 loss=3.773, nll_loss=2.246, ppl=4.74, wps=552856, ups=1.12, wpb=495087, bsz=16655, num_updates=23400, lr=0.000413449, gnorm=0.18, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=21687 epoch 014: 1480 / 1689 loss=3.773, nll_loss=2.246, ppl=4.74, wps=552856, ups=1.12, wpb=495087, bsz=16655, num_updates=23400, lr=0.000413449, gnorm=0.18, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=21687 epoch 014: 1480 / 1689 loss=3.773, nll_loss=2.246, ppl=4.74, wps=552856, ups=1.12, wpb=495087, bsz=16655, num_updates=23400, lr=0.000413449, gnorm=0.18, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=21687 epoch 014: 1480 / 1689 loss=3.773, nll_loss=2.246, ppl=4.74, wps=552856, ups=1.12, wpb=495087, bsz=16655, num_updates=23400, lr=0.000413449, gnorm=0.18, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=21687 epoch 014: 1580 / 1689 loss=3.762, nll_loss=2.234, ppl=4.7, wps=550799, ups=1.11, wpb=496824, bsz=16741, num_updates=23500, lr=0.000412568, gnorm=0.194, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=21777 epoch 014: 1580 / 1689 loss=3.762, nll_loss=2.234, ppl=4.7, wps=550799, ups=1.11, wpb=496824, bsz=16741, num_updates=23500, lr=0.000412568, gnorm=0.194, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=21777 epoch 014: 1580 / 1689 loss=3.762, nll_loss=2.234, ppl=4.7, wps=550799, ups=1.11, wpb=496824, bsz=16741, num_updates=23500, lr=0.000412568, gnorm=0.194, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=21777 epoch 014: 1580 / 1689 loss=3.762, nll_loss=2.234, ppl=4.7, wps=550799, ups=1.11, wpb=496824, bsz=16741, num_updates=23500, lr=0.000412568, gnorm=0.194, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=21777 epoch 014: 1580 / 1689 loss=3.762, nll_loss=2.234, ppl=4.7, wps=550799, ups=1.11, wpb=496824, bsz=16741, num_updates=23500, lr=0.000412568, gnorm=0.194, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=21777 epoch 014: 1580 / 1689 loss=3.762, nll_loss=2.234, ppl=4.7, wps=550799, ups=1.11, wpb=496824, bsz=16741, num_updates=23500, lr=0.000412568, gnorm=0.194, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=21777 epoch 014: 1580 / 1689 loss=3.762, nll_loss=2.234, ppl=4.7, wps=550799, ups=1.11, wpb=496824, bsz=16741, num_updates=23500, lr=0.000412568, gnorm=0.194, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=21777 epoch 014: 1580 / 1689 loss=3.762, nll_loss=2.234, ppl=4.7, wps=550799, ups=1.11, wpb=496824, bsz=16741, num_updates=23500, lr=0.000412568, gnorm=0.194, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=21777 epoch 014: 1580 / 1689 loss=3.762, nll_loss=2.234, ppl=4.7, wps=550799, ups=1.11, wpb=496824, bsz=16741, num_updates=23500, lr=0.000412568, gnorm=0.194, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=21777 epoch 014: 1580 / 1689 loss=3.762, nll_loss=2.234, ppl=4.7, wps=550799, ups=1.11, wpb=496824, bsz=16741, num_updates=23500, lr=0.000412568, gnorm=0.194, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=21777 epoch 014: 1580 / 1689 loss=3.762, nll_loss=2.234, ppl=4.7, wps=550799, ups=1.11, wpb=496824, bsz=16741, num_updates=23500, lr=0.000412568, gnorm=0.194, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=21777 epoch 014: 1580 / 1689 loss=3.762, nll_loss=2.234, ppl=4.7, wps=550799, ups=1.11, wpb=496824, bsz=16741, num_updates=23500, lr=0.000412568, gnorm=0.194, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=21777 epoch 014: 1580 / 1689 loss=3.762, nll_loss=2.234, ppl=4.7, wps=550799, ups=1.11, wpb=496824, bsz=16741, num_updates=23500, lr=0.000412568, gnorm=0.194, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=21777 epoch 014: 1580 / 1689 loss=3.762, nll_loss=2.234, ppl=4.7, wps=550799, ups=1.11, wpb=496824, bsz=16741, num_updates=23500, lr=0.000412568, gnorm=0.194, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=21777 epoch 014: 1680 / 1689 loss=3.778, nll_loss=2.252, ppl=4.76, wps=551637, ups=1.12, wpb=493634, bsz=16480.2, num_updates=23600, lr=0.000411693, gnorm=0.189, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=21867 epoch 014: 1680 / 1689 loss=3.778, nll_loss=2.252, ppl=4.76, wps=551637, ups=1.12, wpb=493634, bsz=16480.2, num_updates=23600, lr=0.000411693, gnorm=0.189, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=21867 epoch 014: 1680 / 1689 loss=3.778, nll_loss=2.252, ppl=4.76, wps=551637, ups=1.12, wpb=493634, bsz=16480.2, num_updates=23600, lr=0.000411693, gnorm=0.189, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=21867 epoch 014: 1680 / 1689 loss=3.778, nll_loss=2.252, ppl=4.76, wps=551637, ups=1.12, wpb=493634, bsz=16480.2, num_updates=23600, lr=0.000411693, gnorm=0.189, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=21867 epoch 014: 1680 / 1689 loss=3.778, nll_loss=2.252, ppl=4.76, wps=551637, ups=1.12, wpb=493634, bsz=16480.2, num_updates=23600, lr=0.000411693, gnorm=0.189, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=21867 epoch 014: 1680 / 1689 loss=3.778, nll_loss=2.252, ppl=4.76, wps=551637, ups=1.12, wpb=493634, bsz=16480.2, num_updates=23600, lr=0.000411693, gnorm=0.189, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=21867 epoch 014: 1680 / 1689 loss=3.778, nll_loss=2.252, ppl=4.76, wps=551637, ups=1.12, wpb=493634, bsz=16480.2, num_updates=23600, lr=0.000411693, gnorm=0.189, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=21867 epoch 014: 1680 / 1689 loss=3.778, nll_loss=2.252, ppl=4.76, wps=551637, ups=1.12, wpb=493634, bsz=16480.2, num_updates=23600, lr=0.000411693, gnorm=0.189, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=21867 epoch 014: 1680 / 1689 loss=3.778, nll_loss=2.252, ppl=4.76, wps=551637, ups=1.12, wpb=493634, bsz=16480.2, num_updates=23600, lr=0.000411693, gnorm=0.189, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=21867 epoch 014: 1680 / 1689 loss=3.778, nll_loss=2.252, ppl=4.76, wps=551637, ups=1.12, wpb=493634, bsz=16480.2, num_updates=23600, lr=0.000411693, gnorm=0.189, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=21867 epoch 014: 1680 / 1689 loss=3.778, nll_loss=2.252, ppl=4.76, wps=551637, ups=1.12, wpb=493634, bsz=16480.2, num_updates=23600, lr=0.000411693, gnorm=0.189, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=21867 epoch 014: 1680 / 1689 loss=3.778, nll_loss=2.252, ppl=4.76, wps=551637, ups=1.12, wpb=493634, bsz=16480.2, num_updates=23600, lr=0.000411693, gnorm=0.189, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=21867 epoch 014: 1680 / 1689 loss=3.778, nll_loss=2.252, ppl=4.76, wps=551637, ups=1.12, wpb=493634, bsz=16480.2, num_updates=23600, lr=0.000411693, gnorm=0.189, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=21867 epoch 014: 1680 / 1689 loss=3.778, nll_loss=2.252, ppl=4.76, wps=551637, ups=1.12, wpb=493634, bsz=16480.2, num_updates=23600, lr=0.000411693, gnorm=0.189, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=21867 end of epoch 14 (average epoch stats below) epoch 014 | loss 3.769 | nll_loss 2.241 | ppl 4.73 | wps 536557 | ups 1.08 | wpb 495123 | bsz 16500.8 | num_updates 23609 | lr 0.000411615 | gnorm 0.19 | clip 0 | loss_scale 4 | train_wall 1493 | gb_free 23.7 | wall 21874 epoch 014 | loss 3.769 | nll_loss 2.241 | ppl 4.73 | wps 536557 | ups 1.08 | wpb 495123 | bsz 16500.8 | num_updates 23609 | lr 0.000411615 | gnorm 0.19 | clip 0 | loss_scale 4 | train_wall 1493 | gb_free 23.7 | wall 21874 epoch 014 | loss 3.769 | nll_loss 2.241 | ppl 4.73 | wps 536557 | ups 1.08 | wpb 495123 | bsz 16500.8 | num_updates 23609 | lr 0.000411615 | gnorm 0.19 | clip 0 | loss_scale 4 | train_wall 1493 | gb_free 23.7 | wall 21874 epoch 014 | loss 3.769 | nll_loss 2.241 | ppl 4.73 | wps 536557 | ups 1.08 | wpb 495123 | bsz 16500.8 | num_updates 23609 | lr 0.000411615 | gnorm 0.19 | clip 0 | loss_scale 4 | train_wall 1493 | gb_free 23.7 | wall 21874 epoch 014 | loss 3.769 | nll_loss 2.241 | ppl 4.73 | wps 536557 | ups 1.08 | wpb 495123 | bsz 16500.8 | num_updates 23609 | lr 0.000411615 | gnorm 0.19 | clip 0 | loss_scale 4 | train_wall 1493 | gb_free 23.7 | wall 21874 epoch 014 | loss 3.769 | nll_loss 2.241 | ppl 4.73 | wps 536557 | ups 1.08 | wpb 495123 | bsz 16500.8 | num_updates 23609 | lr 0.000411615 | gnorm 0.19 | clip 0 | loss_scale 4 | train_wall 1493 | gb_free 23.7 | wall 21874 epoch 014 | loss 3.769 | nll_loss 2.241 | ppl 4.73 | wps 536557 | ups 1.08 | wpb 495123 | bsz 16500.8 | num_updates 23609 | lr 0.000411615 | gnorm 0.19 | clip 0 | loss_scale 4 | train_wall 1493 | gb_free 23.7 | wall 21874 epoch 014 | loss 3.769 | nll_loss 2.241 | ppl 4.73 | wps 536557 | ups 1.08 | wpb 495123 | bsz 16500.8 | num_updates 23609 | lr 0.000411615 | gnorm 0.19 | clip 0 | loss_scale 4 | train_wall 1493 | gb_free 23.7 | wall 21874 epoch 014 | loss 3.769 | nll_loss 2.241 | ppl 4.73 | wps 536557 | ups 1.08 | wpb 495123 | bsz 16500.8 | num_updates 23609 | lr 0.000411615 | gnorm 0.19 | clip 0 | loss_scale 4 | train_wall 1493 | gb_free 23.7 | wall 21874 epoch 014 | loss 3.769 | nll_loss 2.241 | ppl 4.73 | wps 536557 | ups 1.08 | wpb 495123 | bsz 16500.8 | num_updates 23609 | lr 0.000411615 | gnorm 0.19 | clip 0 | loss_scale 4 | train_wall 1493 | gb_free 23.7 | wall 21874 epoch 014 | loss 3.769 | nll_loss 2.241 | ppl 4.73 | wps 536557 | ups 1.08 | wpb 495123 | bsz 16500.8 | num_updates 23609 | lr 0.000411615 | gnorm 0.19 | clip 0 | loss_scale 4 | train_wall 1493 | gb_free 23.7 | wall 21874 epoch 014 | loss 3.769 | nll_loss 2.241 | ppl 4.73 | wps 536557 | ups 1.08 | wpb 495123 | bsz 16500.8 | num_updates 23609 | lr 0.000411615 | gnorm 0.19 | clip 0 | loss_scale 4 | train_wall 1493 | gb_free 23.7 | wall 21874 epoch 014 | loss 3.769 | nll_loss 2.241 | ppl 4.73 | wps 536557 | ups 1.08 | wpb 495123 | bsz 16500.8 | num_updates 23609 | lr 0.000411615 | gnorm 0.19 | clip 0 | loss_scale 4 | train_wall 1493 | gb_free 23.7 | wall 21874 epoch 014 | loss 3.769 | nll_loss 2.241 | ppl 4.73 | wps 536557 | ups 1.08 | wpb 495123 | bsz 16500.8 | num_updates 23609 | lr 0.000411615 | gnorm 0.19 | clip 0 | loss_scale 4 | train_wall 1493 | gb_free 23.7 | wall 21874 Start iterating over samples epoch 015: 91 / 1689 loss=3.75, nll_loss=2.219, ppl=4.66, wps=524307, ups=1.07, wpb=490461, bsz=16233.9, num_updates=23700, lr=0.000410824, gnorm=0.204, clip=0, loss_scale=4, train_wall=88, gb_free=22.6, wall=21960 epoch 015: 91 / 1689 loss=3.75, nll_loss=2.219, ppl=4.66, wps=524307, ups=1.07, wpb=490461, bsz=16233.9, num_updates=23700, lr=0.000410824, gnorm=0.204, clip=0, loss_scale=4, train_wall=88, gb_free=22.6, wall=21960 epoch 015: 91 / 1689 loss=3.75, nll_loss=2.219, ppl=4.66, wps=524307, ups=1.07, wpb=490461, bsz=16233.9, num_updates=23700, lr=0.000410824, gnorm=0.204, clip=0, loss_scale=4, train_wall=88, gb_free=22.6, wall=21960 epoch 015: 91 / 1689 loss=3.75, nll_loss=2.219, ppl=4.66, wps=524307, ups=1.07, wpb=490461, bsz=16233.9, num_updates=23700, lr=0.000410824, gnorm=0.204, clip=0, loss_scale=4, train_wall=88, gb_free=22.6, wall=21960 epoch 015: 91 / 1689 loss=3.75, nll_loss=2.219, ppl=4.66, wps=524307, ups=1.07, wpb=490461, bsz=16233.9, num_updates=23700, lr=0.000410824, gnorm=0.204, clip=0, loss_scale=4, train_wall=88, gb_free=22.6, wall=21960 epoch 015: 91 / 1689 loss=3.75, nll_loss=2.219, ppl=4.66, wps=524307, ups=1.07, wpb=490461, bsz=16233.9, num_updates=23700, lr=0.000410824, gnorm=0.204, clip=0, loss_scale=4, train_wall=88, gb_free=22.6, wall=21960 epoch 015: 91 / 1689 loss=3.75, nll_loss=2.219, ppl=4.66, wps=524307, ups=1.07, wpb=490461, bsz=16233.9, num_updates=23700, lr=0.000410824, gnorm=0.204, clip=0, loss_scale=4, train_wall=88, gb_free=22.6, wall=21960 epoch 015: 91 / 1689 loss=3.75, nll_loss=2.219, ppl=4.66, wps=524307, ups=1.07, wpb=490461, bsz=16233.9, num_updates=23700, lr=0.000410824, gnorm=0.204, clip=0, loss_scale=4, train_wall=88, gb_free=22.6, wall=21960 epoch 015: 91 / 1689 loss=3.75, nll_loss=2.219, ppl=4.66, wps=524307, ups=1.07, wpb=490461, bsz=16233.9, num_updates=23700, lr=0.000410824, gnorm=0.204, clip=0, loss_scale=4, train_wall=88, gb_free=22.6, wall=21960 epoch 015: 91 / 1689 loss=3.75, nll_loss=2.219, ppl=4.66, wps=524307, ups=1.07, wpb=490461, bsz=16233.9, num_updates=23700, lr=0.000410824, gnorm=0.204, clip=0, loss_scale=4, train_wall=88, gb_free=22.6, wall=21960 epoch 015: 91 / 1689 loss=3.75, nll_loss=2.219, ppl=4.66, wps=524307, ups=1.07, wpb=490461, bsz=16233.9, num_updates=23700, lr=0.000410824, gnorm=0.204, clip=0, loss_scale=4, train_wall=88, gb_free=22.6, wall=21960 epoch 015: 91 / 1689 loss=3.75, nll_loss=2.219, ppl=4.66, wps=524307, ups=1.07, wpb=490461, bsz=16233.9, num_updates=23700, lr=0.000410824, gnorm=0.204, clip=0, loss_scale=4, train_wall=88, gb_free=22.6, wall=21960 epoch 015: 91 / 1689 loss=3.75, nll_loss=2.219, ppl=4.66, wps=524307, ups=1.07, wpb=490461, bsz=16233.9, num_updates=23700, lr=0.000410824, gnorm=0.204, clip=0, loss_scale=4, train_wall=88, gb_free=22.6, wall=21960 epoch 015: 91 / 1689 loss=3.75, nll_loss=2.219, ppl=4.66, wps=524307, ups=1.07, wpb=490461, bsz=16233.9, num_updates=23700, lr=0.000410824, gnorm=0.204, clip=0, loss_scale=4, train_wall=88, gb_free=22.6, wall=21960 epoch 015: 91 / 1689 loss=3.75, nll_loss=2.219, ppl=4.66, wps=524307, ups=1.07, wpb=490461, bsz=16233.9, num_updates=23700, lr=0.000410824, gnorm=0.204, clip=0, loss_scale=4, train_wall=88, gb_free=22.6, wall=21960 epoch 015: 192 / 1689 loss=3.754, nll_loss=2.224, ppl=4.67, wps=547968, ups=1.11, wpb=494316, bsz=16408.4, num_updates=23800, lr=0.00040996, gnorm=0.185, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=22051 epoch 015: 192 / 1689 loss=3.754, nll_loss=2.224, ppl=4.67, wps=547968, ups=1.11, wpb=494316, bsz=16408.4, num_updates=23800, lr=0.00040996, gnorm=0.185, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=22051 epoch 015: 192 / 1689 loss=3.754, nll_loss=2.224, ppl=4.67, wps=547968, ups=1.11, wpb=494316, bsz=16408.4, num_updates=23800, lr=0.00040996, gnorm=0.185, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=22051 epoch 015: 192 / 1689 loss=3.754, nll_loss=2.224, ppl=4.67, wps=547968, ups=1.11, wpb=494316, bsz=16408.4, num_updates=23800, lr=0.00040996, gnorm=0.185, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=22051 epoch 015: 192 / 1689 loss=3.754, nll_loss=2.224, ppl=4.67, wps=547968, ups=1.11, wpb=494316, bsz=16408.4, num_updates=23800, lr=0.00040996, gnorm=0.185, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=22051 epoch 015: 192 / 1689 loss=3.754, nll_loss=2.224, ppl=4.67, wps=547968, ups=1.11, wpb=494316, bsz=16408.4, num_updates=23800, lr=0.00040996, gnorm=0.185, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=22051 epoch 015: 192 / 1689 loss=3.754, nll_loss=2.224, ppl=4.67, wps=547968, ups=1.11, wpb=494316, bsz=16408.4, num_updates=23800, lr=0.00040996, gnorm=0.185, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=22051 epoch 015: 192 / 1689 loss=3.754, nll_loss=2.224, ppl=4.67, wps=547968, ups=1.11, wpb=494316, bsz=16408.4, num_updates=23800, lr=0.00040996, gnorm=0.185, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=22051 epoch 015: 192 / 1689 loss=3.754, nll_loss=2.224, ppl=4.67, wps=547968, ups=1.11, wpb=494316, bsz=16408.4, num_updates=23800, lr=0.00040996, gnorm=0.185, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=22051 epoch 015: 192 / 1689 loss=3.754, nll_loss=2.224, ppl=4.67, wps=547968, ups=1.11, wpb=494316, bsz=16408.4, num_updates=23800, lr=0.00040996, gnorm=0.185, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=22051 epoch 015: 192 / 1689 loss=3.754, nll_loss=2.224, ppl=4.67, wps=547968, ups=1.11, wpb=494316, bsz=16408.4, num_updates=23800, lr=0.00040996, gnorm=0.185, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=22051 epoch 015: 192 / 1689 loss=3.754, nll_loss=2.224, ppl=4.67, wps=547968, ups=1.11, wpb=494316, bsz=16408.4, num_updates=23800, lr=0.00040996, gnorm=0.185, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=22051 epoch 015: 192 / 1689 loss=3.754, nll_loss=2.224, ppl=4.67, wps=547968, ups=1.11, wpb=494316, bsz=16408.4, num_updates=23800, lr=0.00040996, gnorm=0.185, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=22051 epoch 015: 192 / 1689 loss=3.754, nll_loss=2.224, ppl=4.67, wps=547968, ups=1.11, wpb=494316, bsz=16408.4, num_updates=23800, lr=0.00040996, gnorm=0.185, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=22051 epoch 015: 192 / 1689 loss=3.754, nll_loss=2.224, ppl=4.67, wps=547968, ups=1.11, wpb=494316, bsz=16408.4, num_updates=23800, lr=0.00040996, gnorm=0.185, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=22051 epoch 015: 292 / 1689 loss=3.753, nll_loss=2.223, ppl=4.67, wps=549582, ups=1.11, wpb=495350, bsz=16485.5, num_updates=23900, lr=0.000409101, gnorm=0.189, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=22141 epoch 015: 292 / 1689 loss=3.753, nll_loss=2.223, ppl=4.67, wps=549582, ups=1.11, wpb=495350, bsz=16485.5, num_updates=23900, lr=0.000409101, gnorm=0.189, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=22141 epoch 015: 292 / 1689 loss=3.753, nll_loss=2.223, ppl=4.67, wps=549582, ups=1.11, wpb=495350, bsz=16485.5, num_updates=23900, lr=0.000409101, gnorm=0.189, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=22141 epoch 015: 292 / 1689 loss=3.753, nll_loss=2.223, ppl=4.67, wps=549582, ups=1.11, wpb=495350, bsz=16485.5, num_updates=23900, lr=0.000409101, gnorm=0.189, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=22141 epoch 015: 292 / 1689 loss=3.753, nll_loss=2.223, ppl=4.67, wps=549582, ups=1.11, wpb=495350, bsz=16485.5, num_updates=23900, lr=0.000409101, gnorm=0.189, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=22141 epoch 015: 292 / 1689 loss=3.753, nll_loss=2.223, ppl=4.67, wps=549582, ups=1.11, wpb=495350, bsz=16485.5, num_updates=23900, lr=0.000409101, gnorm=0.189, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=22141 epoch 015: 292 / 1689 loss=3.753, nll_loss=2.223, ppl=4.67, wps=549582, ups=1.11, wpb=495350, bsz=16485.5, num_updates=23900, lr=0.000409101, gnorm=0.189, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=22141 epoch 015: 292 / 1689 loss=3.753, nll_loss=2.223, ppl=4.67, wps=549582, ups=1.11, wpb=495350, bsz=16485.5, num_updates=23900, lr=0.000409101, gnorm=0.189, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=22141 epoch 015: 292 / 1689 loss=3.753, nll_loss=2.223, ppl=4.67, wps=549582, ups=1.11, wpb=495350, bsz=16485.5, num_updates=23900, lr=0.000409101, gnorm=0.189, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=22141 epoch 015: 292 / 1689 loss=3.753, nll_loss=2.223, ppl=4.67, wps=549582, ups=1.11, wpb=495350, bsz=16485.5, num_updates=23900, lr=0.000409101, gnorm=0.189, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=22141 epoch 015: 292 / 1689 loss=3.753, nll_loss=2.223, ppl=4.67, wps=549582, ups=1.11, wpb=495350, bsz=16485.5, num_updates=23900, lr=0.000409101, gnorm=0.189, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=22141 epoch 015: 292 / 1689 loss=3.753, nll_loss=2.223, ppl=4.67, wps=549582, ups=1.11, wpb=495350, bsz=16485.5, num_updates=23900, lr=0.000409101, gnorm=0.189, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=22141 epoch 015: 292 / 1689 loss=3.753, nll_loss=2.223, ppl=4.67, wps=549582, ups=1.11, wpb=495350, bsz=16485.5, num_updates=23900, lr=0.000409101, gnorm=0.189, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=22141 epoch 015: 292 / 1689 loss=3.753, nll_loss=2.223, ppl=4.67, wps=549582, ups=1.11, wpb=495350, bsz=16485.5, num_updates=23900, lr=0.000409101, gnorm=0.189, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=22141 epoch 015: 292 / 1689 loss=3.753, nll_loss=2.223, ppl=4.67, wps=549582, ups=1.11, wpb=495350, bsz=16485.5, num_updates=23900, lr=0.000409101, gnorm=0.189, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=22141 epoch 015: 392 / 1689 loss=3.755, nll_loss=2.226, ppl=4.68, wps=551140, ups=1.11, wpb=496535, bsz=16874.7, num_updates=24000, lr=0.000408248, gnorm=0.185, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=22231 epoch 015: 392 / 1689 loss=3.755, nll_loss=2.226, ppl=4.68, wps=551140, ups=1.11, wpb=496535, bsz=16874.7, num_updates=24000, lr=0.000408248, gnorm=0.185, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=22231 epoch 015: 392 / 1689 loss=3.755, nll_loss=2.226, ppl=4.68, wps=551140, ups=1.11, wpb=496535, bsz=16874.7, num_updates=24000, lr=0.000408248, gnorm=0.185, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=22231 epoch 015: 392 / 1689 loss=3.755, nll_loss=2.226, ppl=4.68, wps=551140, ups=1.11, wpb=496535, bsz=16874.7, num_updates=24000, lr=0.000408248, gnorm=0.185, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=22231 epoch 015: 392 / 1689 loss=3.755, nll_loss=2.226, ppl=4.68, wps=551140, ups=1.11, wpb=496535, bsz=16874.7, num_updates=24000, lr=0.000408248, gnorm=0.185, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=22231 epoch 015: 392 / 1689 loss=3.755, nll_loss=2.226, ppl=4.68, wps=551140, ups=1.11, wpb=496535, bsz=16874.7, num_updates=24000, lr=0.000408248, gnorm=0.185, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=22231 epoch 015: 392 / 1689 loss=3.755, nll_loss=2.226, ppl=4.68, wps=551140, ups=1.11, wpb=496535, bsz=16874.7, num_updates=24000, lr=0.000408248, gnorm=0.185, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=22231 epoch 015: 392 / 1689 loss=3.755, nll_loss=2.226, ppl=4.68, wps=551140, ups=1.11, wpb=496535, bsz=16874.7, num_updates=24000, lr=0.000408248, gnorm=0.185, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=22231 epoch 015: 392 / 1689 loss=3.755, nll_loss=2.226, ppl=4.68, wps=551140, ups=1.11, wpb=496535, bsz=16874.7, num_updates=24000, lr=0.000408248, gnorm=0.185, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=22231 epoch 015: 392 / 1689 loss=3.755, nll_loss=2.226, ppl=4.68, wps=551140, ups=1.11, wpb=496535, bsz=16874.7, num_updates=24000, lr=0.000408248, gnorm=0.185, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=22231 epoch 015: 392 / 1689 loss=3.755, nll_loss=2.226, ppl=4.68, wps=551140, ups=1.11, wpb=496535, bsz=16874.7, num_updates=24000, lr=0.000408248, gnorm=0.185, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=22231 epoch 015: 392 / 1689 loss=3.755, nll_loss=2.226, ppl=4.68, wps=551140, ups=1.11, wpb=496535, bsz=16874.7, num_updates=24000, lr=0.000408248, gnorm=0.185, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=22231 epoch 015: 392 / 1689 loss=3.755, nll_loss=2.226, ppl=4.68, wps=551140, ups=1.11, wpb=496535, bsz=16874.7, num_updates=24000, lr=0.000408248, gnorm=0.185, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=22231 epoch 015: 392 / 1689 loss=3.755, nll_loss=2.226, ppl=4.68, wps=551140, ups=1.11, wpb=496535, bsz=16874.7, num_updates=24000, lr=0.000408248, gnorm=0.185, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=22231 epoch 015: 392 / 1689 loss=3.755, nll_loss=2.226, ppl=4.68, wps=551140, ups=1.11, wpb=496535, bsz=16874.7, num_updates=24000, lr=0.000408248, gnorm=0.185, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=22231 begin validation on "valid" subset epoch 015 | valid on 'valid' subset | loss 3.765 | nll_loss 2.194 | ppl 4.58 | wps 0 | wpb 44526 | bsz 2008 | num_updates 24000 | best_loss 3.757 epoch 015 | valid on 'valid' subset | loss 3.765 | nll_loss 2.194 | ppl 4.58 | wps 0 | wpb 44526 | bsz 2008 | num_updates 24000 | best_loss 3.757 epoch 015 | valid on 'valid' subset | loss 3.765 | nll_loss 2.194 | ppl 4.58 | wps 0 | wpb 44526 | bsz 2008 | num_updates 24000 | best_loss 3.757 epoch 015 | valid on 'valid' subset | loss 3.765 | nll_loss 2.194 | ppl 4.58 | wps 0 | wpb 44526 | bsz 2008 | num_updates 24000 | best_loss 3.757 epoch 015 | valid on 'valid' subset | loss 3.765 | nll_loss 2.194 | ppl 4.58 | wps 0 | wpb 44526 | bsz 2008 | num_updates 24000 | best_loss 3.757 epoch 015 | valid on 'valid' subset | loss 3.765 | nll_loss 2.194 | ppl 4.58 | wps 0 | wpb 44526 | bsz 2008 | num_updates 24000 | best_loss 3.757 epoch 015 | valid on 'valid' subset | loss 3.765 | nll_loss 2.194 | ppl 4.58 | wps 0 | wpb 44526 | bsz 2008 | num_updates 24000 | best_loss 3.757 epoch 015 | valid on 'valid' subset | loss 3.765 | nll_loss 2.194 | ppl 4.58 | wps 0 | wpb 44526 | bsz 2008 | num_updates 24000 | best_loss 3.757 epoch 015 | valid on 'valid' subset | loss 3.765 | nll_loss 2.194 | ppl 4.58 | wps 0 | wpb 44526 | bsz 2008 | num_updates 24000 | best_loss 3.757 epoch 015 | valid on 'valid' subset | loss 3.765 | nll_loss 2.194 | ppl 4.58 | wps 0 | wpb 44526 | bsz 2008 | num_updates 24000 | best_loss 3.757 epoch 015 | valid on 'valid' subset | loss 3.765 | nll_loss 2.194 | ppl 4.58 | wps 0 | wpb 44526 | bsz 2008 | num_updates 24000 | best_loss 3.757 epoch 015 | valid on 'valid' subset | loss 3.765 | nll_loss 2.194 | ppl 4.58 | wps 0 | wpb 44526 | bsz 2008 | num_updates 24000 | best_loss 3.757 epoch 015 | valid on 'valid' subset | loss 3.765 | nll_loss 2.194 | ppl 4.58 | wps 0 | wpb 44526 | bsz 2008 | num_updates 24000 | best_loss 3.757 epoch 015 | valid on 'valid' subset | loss 3.765 | nll_loss 2.194 | ppl 4.58 | wps 0 | wpb 44526 | bsz 2008 | num_updates 24000 | best_loss 3.757 epoch 015 | valid on 'valid' subset | loss 3.765 | nll_loss 2.194 | ppl 4.58 | wps 0 | wpb 44526 | bsz 2008 | num_updates 24000 | best_loss 3.757 epoch 015: 492 / 1689 loss=3.754, nll_loss=2.224, ppl=4.67, wps=486164, ups=0.98, wpb=495608, bsz=16727.9, num_updates=24100, lr=0.0004074, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=22333 epoch 015: 492 / 1689 loss=3.754, nll_loss=2.224, ppl=4.67, wps=486164, ups=0.98, wpb=495608, bsz=16727.9, num_updates=24100, lr=0.0004074, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=22333 epoch 015: 492 / 1689 loss=3.754, nll_loss=2.224, ppl=4.67, wps=486164, ups=0.98, wpb=495608, bsz=16727.9, num_updates=24100, lr=0.0004074, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=22333 epoch 015: 492 / 1689 loss=3.754, nll_loss=2.224, ppl=4.67, wps=486164, ups=0.98, wpb=495608, bsz=16727.9, num_updates=24100, lr=0.0004074, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=22333 epoch 015: 492 / 1689 loss=3.754, nll_loss=2.224, ppl=4.67, wps=486164, ups=0.98, wpb=495608, bsz=16727.9, num_updates=24100, lr=0.0004074, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=22333 epoch 015: 492 / 1689 loss=3.754, nll_loss=2.224, ppl=4.67, wps=486164, ups=0.98, wpb=495608, bsz=16727.9, num_updates=24100, lr=0.0004074, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=22333 epoch 015: 492 / 1689 loss=3.754, nll_loss=2.224, ppl=4.67, wps=486164, ups=0.98, wpb=495608, bsz=16727.9, num_updates=24100, lr=0.0004074, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=22333 epoch 015: 492 / 1689 loss=3.754, nll_loss=2.224, ppl=4.67, wps=486164, ups=0.98, wpb=495608, bsz=16727.9, num_updates=24100, lr=0.0004074, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=22333 epoch 015: 492 / 1689 loss=3.754, nll_loss=2.224, ppl=4.67, wps=486164, ups=0.98, wpb=495608, bsz=16727.9, num_updates=24100, lr=0.0004074, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=22333 epoch 015: 492 / 1689 loss=3.754, nll_loss=2.224, ppl=4.67, wps=486164, ups=0.98, wpb=495608, bsz=16727.9, num_updates=24100, lr=0.0004074, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=22333 epoch 015: 492 / 1689 loss=3.754, nll_loss=2.224, ppl=4.67, wps=486164, ups=0.98, wpb=495608, bsz=16727.9, num_updates=24100, lr=0.0004074, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=22333 epoch 015: 492 / 1689 loss=3.754, nll_loss=2.224, ppl=4.67, wps=486164, ups=0.98, wpb=495608, bsz=16727.9, num_updates=24100, lr=0.0004074, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=22333 epoch 015: 492 / 1689 loss=3.754, nll_loss=2.224, ppl=4.67, wps=486164, ups=0.98, wpb=495608, bsz=16727.9, num_updates=24100, lr=0.0004074, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=22333 epoch 015: 492 / 1689 loss=3.754, nll_loss=2.224, ppl=4.67, wps=486164, ups=0.98, wpb=495608, bsz=16727.9, num_updates=24100, lr=0.0004074, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=22333 epoch 015: 492 / 1689 loss=3.754, nll_loss=2.224, ppl=4.67, wps=486164, ups=0.98, wpb=495608, bsz=16727.9, num_updates=24100, lr=0.0004074, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=22333 epoch 015: 592 / 1689 loss=3.76, nll_loss=2.23, ppl=4.69, wps=552501, ups=1.12, wpb=494286, bsz=16352.1, num_updates=24200, lr=0.000406558, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=22422 epoch 015: 592 / 1689 loss=3.76, nll_loss=2.23, ppl=4.69, wps=552501, ups=1.12, wpb=494286, bsz=16352.1, num_updates=24200, lr=0.000406558, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=22422 epoch 015: 592 / 1689 loss=3.76, nll_loss=2.23, ppl=4.69, wps=552501, ups=1.12, wpb=494286, bsz=16352.1, num_updates=24200, lr=0.000406558, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=22422 epoch 015: 592 / 1689 loss=3.76, nll_loss=2.23, ppl=4.69, wps=552501, ups=1.12, wpb=494286, bsz=16352.1, num_updates=24200, lr=0.000406558, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=22422 epoch 015: 592 / 1689 loss=3.76, nll_loss=2.23, ppl=4.69, wps=552501, ups=1.12, wpb=494286, bsz=16352.1, num_updates=24200, lr=0.000406558, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=22422 epoch 015: 592 / 1689 loss=3.76, nll_loss=2.23, ppl=4.69, wps=552501, ups=1.12, wpb=494286, bsz=16352.1, num_updates=24200, lr=0.000406558, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=22422 epoch 015: 592 / 1689 loss=3.76, nll_loss=2.23, ppl=4.69, wps=552501, ups=1.12, wpb=494286, bsz=16352.1, num_updates=24200, lr=0.000406558, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=22422 epoch 015: 592 / 1689 loss=3.76, nll_loss=2.23, ppl=4.69, wps=552501, ups=1.12, wpb=494286, bsz=16352.1, num_updates=24200, lr=0.000406558, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=22422 epoch 015: 592 / 1689 loss=3.76, nll_loss=2.23, ppl=4.69, wps=552501, ups=1.12, wpb=494286, bsz=16352.1, num_updates=24200, lr=0.000406558, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=22422 epoch 015: 592 / 1689 loss=3.76, nll_loss=2.23, ppl=4.69, wps=552501, ups=1.12, wpb=494286, bsz=16352.1, num_updates=24200, lr=0.000406558, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=22422 epoch 015: 592 / 1689 loss=3.76, nll_loss=2.23, ppl=4.69, wps=552501, ups=1.12, wpb=494286, bsz=16352.1, num_updates=24200, lr=0.000406558, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=22422 epoch 015: 592 / 1689 loss=3.76, nll_loss=2.23, ppl=4.69, wps=552501, ups=1.12, wpb=494286, bsz=16352.1, num_updates=24200, lr=0.000406558, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=22422 epoch 015: 592 / 1689 loss=3.76, nll_loss=2.23, ppl=4.69, wps=552501, ups=1.12, wpb=494286, bsz=16352.1, num_updates=24200, lr=0.000406558, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=22422 epoch 015: 592 / 1689 loss=3.76, nll_loss=2.23, ppl=4.69, wps=552501, ups=1.12, wpb=494286, bsz=16352.1, num_updates=24200, lr=0.000406558, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=22422 epoch 015: 592 / 1689 loss=3.76, nll_loss=2.23, ppl=4.69, wps=552501, ups=1.12, wpb=494286, bsz=16352.1, num_updates=24200, lr=0.000406558, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=22422 epoch 015: 693 / 1689 loss=3.756, nll_loss=2.227, ppl=4.68, wps=540878, ups=1.09, wpb=495143, bsz=16649.4, num_updates=24300, lr=0.00040572, gnorm=0.19, clip=0, loss_scale=1, train_wall=90, gb_free=20.4, wall=22514 epoch 015: 693 / 1689 loss=3.756, nll_loss=2.227, ppl=4.68, wps=540878, ups=1.09, wpb=495143, bsz=16649.4, num_updates=24300, lr=0.00040572, gnorm=0.19, clip=0, loss_scale=1, train_wall=90, gb_free=20.4, wall=22514 epoch 015: 693 / 1689 loss=3.756, nll_loss=2.227, ppl=4.68, wps=540878, ups=1.09, wpb=495143, bsz=16649.4, num_updates=24300, lr=0.00040572, gnorm=0.19, clip=0, loss_scale=1, train_wall=90, gb_free=20.4, wall=22514 epoch 015: 693 / 1689 loss=3.756, nll_loss=2.227, ppl=4.68, wps=540878, ups=1.09, wpb=495143, bsz=16649.4, num_updates=24300, lr=0.00040572, gnorm=0.19, clip=0, loss_scale=1, train_wall=90, gb_free=20.4, wall=22514 epoch 015: 693 / 1689 loss=3.756, nll_loss=2.227, ppl=4.68, wps=540878, ups=1.09, wpb=495143, bsz=16649.4, num_updates=24300, lr=0.00040572, gnorm=0.19, clip=0, loss_scale=1, train_wall=90, gb_free=20.4, wall=22514 epoch 015: 693 / 1689 loss=3.756, nll_loss=2.227, ppl=4.68, wps=540878, ups=1.09, wpb=495143, bsz=16649.4, num_updates=24300, lr=0.00040572, gnorm=0.19, clip=0, loss_scale=1, train_wall=90, gb_free=20.4, wall=22514 epoch 015: 693 / 1689 loss=3.756, nll_loss=2.227, ppl=4.68, wps=540878, ups=1.09, wpb=495143, bsz=16649.4, num_updates=24300, lr=0.00040572, gnorm=0.19, clip=0, loss_scale=1, train_wall=90, gb_free=20.4, wall=22514 epoch 015: 693 / 1689 loss=3.756, nll_loss=2.227, ppl=4.68, wps=540878, ups=1.09, wpb=495143, bsz=16649.4, num_updates=24300, lr=0.00040572, gnorm=0.19, clip=0, loss_scale=1, train_wall=90, gb_free=20.4, wall=22514 epoch 015: 693 / 1689 loss=3.756, nll_loss=2.227, ppl=4.68, wps=540878, ups=1.09, wpb=495143, bsz=16649.4, num_updates=24300, lr=0.00040572, gnorm=0.19, clip=0, loss_scale=1, train_wall=90, gb_free=20.4, wall=22514 epoch 015: 693 / 1689 loss=3.756, nll_loss=2.227, ppl=4.68, wps=540878, ups=1.09, wpb=495143, bsz=16649.4, num_updates=24300, lr=0.00040572, gnorm=0.19, clip=0, loss_scale=1, train_wall=90, gb_free=20.4, wall=22514 epoch 015: 693 / 1689 loss=3.756, nll_loss=2.227, ppl=4.68, wps=540878, ups=1.09, wpb=495143, bsz=16649.4, num_updates=24300, lr=0.00040572, gnorm=0.19, clip=0, loss_scale=1, train_wall=90, gb_free=20.4, wall=22514 epoch 015: 693 / 1689 loss=3.756, nll_loss=2.227, ppl=4.68, wps=540878, ups=1.09, wpb=495143, bsz=16649.4, num_updates=24300, lr=0.00040572, gnorm=0.19, clip=0, loss_scale=1, train_wall=90, gb_free=20.4, wall=22514 epoch 015: 693 / 1689 loss=3.756, nll_loss=2.227, ppl=4.68, wps=540878, ups=1.09, wpb=495143, bsz=16649.4, num_updates=24300, lr=0.00040572, gnorm=0.19, clip=0, loss_scale=1, train_wall=90, gb_free=20.4, wall=22514 epoch 015: 693 / 1689 loss=3.756, nll_loss=2.227, ppl=4.68, wps=540878, ups=1.09, wpb=495143, bsz=16649.4, num_updates=24300, lr=0.00040572, gnorm=0.19, clip=0, loss_scale=1, train_wall=90, gb_free=20.4, wall=22514 epoch 015: 693 / 1689 loss=3.756, nll_loss=2.227, ppl=4.68, wps=540878, ups=1.09, wpb=495143, bsz=16649.4, num_updates=24300, lr=0.00040572, gnorm=0.19, clip=0, loss_scale=1, train_wall=90, gb_free=20.4, wall=22514 epoch 015: 793 / 1689 loss=3.76, nll_loss=2.231, ppl=4.69, wps=551278, ups=1.11, wpb=495120, bsz=16311.6, num_updates=24400, lr=0.000404888, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=22604 epoch 015: 793 / 1689 loss=3.76, nll_loss=2.231, ppl=4.69, wps=551278, ups=1.11, wpb=495120, bsz=16311.6, num_updates=24400, lr=0.000404888, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=22604 epoch 015: 793 / 1689 loss=3.76, nll_loss=2.231, ppl=4.69, wps=551278, ups=1.11, wpb=495120, bsz=16311.6, num_updates=24400, lr=0.000404888, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=22604 epoch 015: 793 / 1689 loss=3.76, nll_loss=2.231, ppl=4.69, wps=551278, ups=1.11, wpb=495120, bsz=16311.6, num_updates=24400, lr=0.000404888, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=22604 epoch 015: 793 / 1689 loss=3.76, nll_loss=2.231, ppl=4.69, wps=551278, ups=1.11, wpb=495120, bsz=16311.6, num_updates=24400, lr=0.000404888, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=22604 epoch 015: 793 / 1689 loss=3.76, nll_loss=2.231, ppl=4.69, wps=551278, ups=1.11, wpb=495120, bsz=16311.6, num_updates=24400, lr=0.000404888, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=22604 epoch 015: 793 / 1689 loss=3.76, nll_loss=2.231, ppl=4.69, wps=551278, ups=1.11, wpb=495120, bsz=16311.6, num_updates=24400, lr=0.000404888, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=22604 epoch 015: 793 / 1689 loss=3.76, nll_loss=2.231, ppl=4.69, wps=551278, ups=1.11, wpb=495120, bsz=16311.6, num_updates=24400, lr=0.000404888, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=22604 epoch 015: 793 / 1689 loss=3.76, nll_loss=2.231, ppl=4.69, wps=551278, ups=1.11, wpb=495120, bsz=16311.6, num_updates=24400, lr=0.000404888, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=22604 epoch 015: 793 / 1689 loss=3.76, nll_loss=2.231, ppl=4.69, wps=551278, ups=1.11, wpb=495120, bsz=16311.6, num_updates=24400, lr=0.000404888, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=22604 epoch 015: 793 / 1689 loss=3.76, nll_loss=2.231, ppl=4.69, wps=551278, ups=1.11, wpb=495120, bsz=16311.6, num_updates=24400, lr=0.000404888, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=22604 epoch 015: 793 / 1689 loss=3.76, nll_loss=2.231, ppl=4.69, wps=551278, ups=1.11, wpb=495120, bsz=16311.6, num_updates=24400, lr=0.000404888, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=22604 epoch 015: 793 / 1689 loss=3.76, nll_loss=2.231, ppl=4.69, wps=551278, ups=1.11, wpb=495120, bsz=16311.6, num_updates=24400, lr=0.000404888, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=22604 epoch 015: 793 / 1689 loss=3.76, nll_loss=2.231, ppl=4.69, wps=551278, ups=1.11, wpb=495120, bsz=16311.6, num_updates=24400, lr=0.000404888, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=22604 epoch 015: 793 / 1689 loss=3.76, nll_loss=2.231, ppl=4.69, wps=551278, ups=1.11, wpb=495120, bsz=16311.6, num_updates=24400, lr=0.000404888, gnorm=0.187, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=22604 epoch 015: 893 / 1689 loss=3.762, nll_loss=2.234, ppl=4.7, wps=551316, ups=1.12, wpb=494452, bsz=16202.6, num_updates=24500, lr=0.000404061, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=22693 epoch 015: 893 / 1689 loss=3.762, nll_loss=2.234, ppl=4.7, wps=551316, ups=1.12, wpb=494452, bsz=16202.6, num_updates=24500, lr=0.000404061, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=22693 epoch 015: 893 / 1689 loss=3.762, nll_loss=2.234, ppl=4.7, wps=551316, ups=1.12, wpb=494452, bsz=16202.6, num_updates=24500, lr=0.000404061, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=22693 epoch 015: 893 / 1689 loss=3.762, nll_loss=2.234, ppl=4.7, wps=551316, ups=1.12, wpb=494452, bsz=16202.6, num_updates=24500, lr=0.000404061, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=22693 epoch 015: 893 / 1689 loss=3.762, nll_loss=2.234, ppl=4.7, wps=551316, ups=1.12, wpb=494452, bsz=16202.6, num_updates=24500, lr=0.000404061, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=22693 epoch 015: 893 / 1689 loss=3.762, nll_loss=2.234, ppl=4.7, wps=551316, ups=1.12, wpb=494452, bsz=16202.6, num_updates=24500, lr=0.000404061, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=22693 epoch 015: 893 / 1689 loss=3.762, nll_loss=2.234, ppl=4.7, wps=551316, ups=1.12, wpb=494452, bsz=16202.6, num_updates=24500, lr=0.000404061, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=22693 epoch 015: 893 / 1689 loss=3.762, nll_loss=2.234, ppl=4.7, wps=551316, ups=1.12, wpb=494452, bsz=16202.6, num_updates=24500, lr=0.000404061, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=22693 epoch 015: 893 / 1689 loss=3.762, nll_loss=2.234, ppl=4.7, wps=551316, ups=1.12, wpb=494452, bsz=16202.6, num_updates=24500, lr=0.000404061, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=22693 epoch 015: 893 / 1689 loss=3.762, nll_loss=2.234, ppl=4.7, wps=551316, ups=1.12, wpb=494452, bsz=16202.6, num_updates=24500, lr=0.000404061, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=22693 epoch 015: 893 / 1689 loss=3.762, nll_loss=2.234, ppl=4.7, wps=551316, ups=1.12, wpb=494452, bsz=16202.6, num_updates=24500, lr=0.000404061, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=22693 epoch 015: 893 / 1689 loss=3.762, nll_loss=2.234, ppl=4.7, wps=551316, ups=1.12, wpb=494452, bsz=16202.6, num_updates=24500, lr=0.000404061, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=22693 epoch 015: 893 / 1689 loss=3.762, nll_loss=2.234, ppl=4.7, wps=551316, ups=1.12, wpb=494452, bsz=16202.6, num_updates=24500, lr=0.000404061, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=22693 epoch 015: 893 / 1689 loss=3.762, nll_loss=2.234, ppl=4.7, wps=551316, ups=1.12, wpb=494452, bsz=16202.6, num_updates=24500, lr=0.000404061, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=22693 epoch 015: 893 / 1689 loss=3.762, nll_loss=2.234, ppl=4.7, wps=551316, ups=1.12, wpb=494452, bsz=16202.6, num_updates=24500, lr=0.000404061, gnorm=0.185, clip=0, loss_scale=1, train_wall=88, gb_free=22, wall=22693 epoch 015: 993 / 1689 loss=3.764, nll_loss=2.236, ppl=4.71, wps=553264, ups=1.11, wpb=497377, bsz=16264.6, num_updates=24600, lr=0.000403239, gnorm=0.182, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=22783 epoch 015: 993 / 1689 loss=3.764, nll_loss=2.236, ppl=4.71, wps=553264, ups=1.11, wpb=497377, bsz=16264.6, num_updates=24600, lr=0.000403239, gnorm=0.182, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=22783 epoch 015: 993 / 1689 loss=3.764, nll_loss=2.236, ppl=4.71, wps=553264, ups=1.11, wpb=497377, bsz=16264.6, num_updates=24600, lr=0.000403239, gnorm=0.182, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=22783 epoch 015: 993 / 1689 loss=3.764, nll_loss=2.236, ppl=4.71, wps=553264, ups=1.11, wpb=497377, bsz=16264.6, num_updates=24600, lr=0.000403239, gnorm=0.182, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=22783 epoch 015: 993 / 1689 loss=3.764, nll_loss=2.236, ppl=4.71, wps=553264, ups=1.11, wpb=497377, bsz=16264.6, num_updates=24600, lr=0.000403239, gnorm=0.182, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=22783 epoch 015: 993 / 1689 loss=3.764, nll_loss=2.236, ppl=4.71, wps=553264, ups=1.11, wpb=497377, bsz=16264.6, num_updates=24600, lr=0.000403239, gnorm=0.182, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=22783 epoch 015: 993 / 1689 loss=3.764, nll_loss=2.236, ppl=4.71, wps=553264, ups=1.11, wpb=497377, bsz=16264.6, num_updates=24600, lr=0.000403239, gnorm=0.182, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=22783 epoch 015: 993 / 1689 loss=3.764, nll_loss=2.236, ppl=4.71, wps=553264, ups=1.11, wpb=497377, bsz=16264.6, num_updates=24600, lr=0.000403239, gnorm=0.182, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=22783 epoch 015: 993 / 1689 loss=3.764, nll_loss=2.236, ppl=4.71, wps=553264, ups=1.11, wpb=497377, bsz=16264.6, num_updates=24600, lr=0.000403239, gnorm=0.182, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=22783 epoch 015: 993 / 1689 loss=3.764, nll_loss=2.236, ppl=4.71, wps=553264, ups=1.11, wpb=497377, bsz=16264.6, num_updates=24600, lr=0.000403239, gnorm=0.182, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=22783 epoch 015: 993 / 1689 loss=3.764, nll_loss=2.236, ppl=4.71, wps=553264, ups=1.11, wpb=497377, bsz=16264.6, num_updates=24600, lr=0.000403239, gnorm=0.182, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=22783 epoch 015: 993 / 1689 loss=3.764, nll_loss=2.236, ppl=4.71, wps=553264, ups=1.11, wpb=497377, bsz=16264.6, num_updates=24600, lr=0.000403239, gnorm=0.182, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=22783 epoch 015: 993 / 1689 loss=3.764, nll_loss=2.236, ppl=4.71, wps=553264, ups=1.11, wpb=497377, bsz=16264.6, num_updates=24600, lr=0.000403239, gnorm=0.182, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=22783 epoch 015: 993 / 1689 loss=3.764, nll_loss=2.236, ppl=4.71, wps=553264, ups=1.11, wpb=497377, bsz=16264.6, num_updates=24600, lr=0.000403239, gnorm=0.182, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=22783 epoch 015: 993 / 1689 loss=3.764, nll_loss=2.236, ppl=4.71, wps=553264, ups=1.11, wpb=497377, bsz=16264.6, num_updates=24600, lr=0.000403239, gnorm=0.182, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=22783 epoch 015: 1093 / 1689 loss=3.756, nll_loss=2.227, ppl=4.68, wps=557734, ups=1.13, wpb=495391, bsz=16800.3, num_updates=24700, lr=0.000402422, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=22872 epoch 015: 1093 / 1689 loss=3.756, nll_loss=2.227, ppl=4.68, wps=557734, ups=1.13, wpb=495391, bsz=16800.3, num_updates=24700, lr=0.000402422, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=22872 epoch 015: 1093 / 1689 loss=3.756, nll_loss=2.227, ppl=4.68, wps=557734, ups=1.13, wpb=495391, bsz=16800.3, num_updates=24700, lr=0.000402422, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=22872 epoch 015: 1093 / 1689 loss=3.756, nll_loss=2.227, ppl=4.68, wps=557734, ups=1.13, wpb=495391, bsz=16800.3, num_updates=24700, lr=0.000402422, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=22872 epoch 015: 1093 / 1689 loss=3.756, nll_loss=2.227, ppl=4.68, wps=557734, ups=1.13, wpb=495391, bsz=16800.3, num_updates=24700, lr=0.000402422, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=22872 epoch 015: 1093 / 1689 loss=3.756, nll_loss=2.227, ppl=4.68, wps=557734, ups=1.13, wpb=495391, bsz=16800.3, num_updates=24700, lr=0.000402422, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=22872 epoch 015: 1093 / 1689 loss=3.756, nll_loss=2.227, ppl=4.68, wps=557734, ups=1.13, wpb=495391, bsz=16800.3, num_updates=24700, lr=0.000402422, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=22872 epoch 015: 1093 / 1689 loss=3.756, nll_loss=2.227, ppl=4.68, wps=557734, ups=1.13, wpb=495391, bsz=16800.3, num_updates=24700, lr=0.000402422, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=22872 epoch 015: 1093 / 1689 loss=3.756, nll_loss=2.227, ppl=4.68, wps=557734, ups=1.13, wpb=495391, bsz=16800.3, num_updates=24700, lr=0.000402422, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=22872 epoch 015: 1093 / 1689 loss=3.756, nll_loss=2.227, ppl=4.68, wps=557734, ups=1.13, wpb=495391, bsz=16800.3, num_updates=24700, lr=0.000402422, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=22872 epoch 015: 1093 / 1689 loss=3.756, nll_loss=2.227, ppl=4.68, wps=557734, ups=1.13, wpb=495391, bsz=16800.3, num_updates=24700, lr=0.000402422, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=22872 epoch 015: 1093 / 1689 loss=3.756, nll_loss=2.227, ppl=4.68, wps=557734, ups=1.13, wpb=495391, bsz=16800.3, num_updates=24700, lr=0.000402422, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=22872 epoch 015: 1093 / 1689 loss=3.756, nll_loss=2.227, ppl=4.68, wps=557734, ups=1.13, wpb=495391, bsz=16800.3, num_updates=24700, lr=0.000402422, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=22872 epoch 015: 1093 / 1689 loss=3.756, nll_loss=2.227, ppl=4.68, wps=557734, ups=1.13, wpb=495391, bsz=16800.3, num_updates=24700, lr=0.000402422, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=22872 epoch 015: 1093 / 1689 loss=3.756, nll_loss=2.227, ppl=4.68, wps=557734, ups=1.13, wpb=495391, bsz=16800.3, num_updates=24700, lr=0.000402422, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=22872 epoch 015: 1193 / 1689 loss=3.771, nll_loss=2.244, ppl=4.74, wps=557522, ups=1.13, wpb=493901, bsz=16230, num_updates=24800, lr=0.00040161, gnorm=0.198, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=22961 epoch 015: 1193 / 1689 loss=3.771, nll_loss=2.244, ppl=4.74, wps=557522, ups=1.13, wpb=493901, bsz=16230, num_updates=24800, lr=0.00040161, gnorm=0.198, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=22961 epoch 015: 1193 / 1689 loss=3.771, nll_loss=2.244, ppl=4.74, wps=557522, ups=1.13, wpb=493901, bsz=16230, num_updates=24800, lr=0.00040161, gnorm=0.198, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=22961 epoch 015: 1193 / 1689 loss=3.771, nll_loss=2.244, ppl=4.74, wps=557522, ups=1.13, wpb=493901, bsz=16230, num_updates=24800, lr=0.00040161, gnorm=0.198, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=22961 epoch 015: 1193 / 1689 loss=3.771, nll_loss=2.244, ppl=4.74, wps=557522, ups=1.13, wpb=493901, bsz=16230, num_updates=24800, lr=0.00040161, gnorm=0.198, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=22961 epoch 015: 1193 / 1689 loss=3.771, nll_loss=2.244, ppl=4.74, wps=557522, ups=1.13, wpb=493901, bsz=16230, num_updates=24800, lr=0.00040161, gnorm=0.198, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=22961 epoch 015: 1193 / 1689 loss=3.771, nll_loss=2.244, ppl=4.74, wps=557522, ups=1.13, wpb=493901, bsz=16230, num_updates=24800, lr=0.00040161, gnorm=0.198, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=22961 epoch 015: 1193 / 1689 loss=3.771, nll_loss=2.244, ppl=4.74, wps=557522, ups=1.13, wpb=493901, bsz=16230, num_updates=24800, lr=0.00040161, gnorm=0.198, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=22961 epoch 015: 1193 / 1689 loss=3.771, nll_loss=2.244, ppl=4.74, wps=557522, ups=1.13, wpb=493901, bsz=16230, num_updates=24800, lr=0.00040161, gnorm=0.198, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=22961 epoch 015: 1193 / 1689 loss=3.771, nll_loss=2.244, ppl=4.74, wps=557522, ups=1.13, wpb=493901, bsz=16230, num_updates=24800, lr=0.00040161, gnorm=0.198, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=22961 epoch 015: 1193 / 1689 loss=3.771, nll_loss=2.244, ppl=4.74, wps=557522, ups=1.13, wpb=493901, bsz=16230, num_updates=24800, lr=0.00040161, gnorm=0.198, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=22961 epoch 015: 1193 / 1689 loss=3.771, nll_loss=2.244, ppl=4.74, wps=557522, ups=1.13, wpb=493901, bsz=16230, num_updates=24800, lr=0.00040161, gnorm=0.198, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=22961 epoch 015: 1193 / 1689 loss=3.771, nll_loss=2.244, ppl=4.74, wps=557522, ups=1.13, wpb=493901, bsz=16230, num_updates=24800, lr=0.00040161, gnorm=0.198, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=22961 epoch 015: 1193 / 1689 loss=3.771, nll_loss=2.244, ppl=4.74, wps=557522, ups=1.13, wpb=493901, bsz=16230, num_updates=24800, lr=0.00040161, gnorm=0.198, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=22961 epoch 015: 1193 / 1689 loss=3.771, nll_loss=2.244, ppl=4.74, wps=557522, ups=1.13, wpb=493901, bsz=16230, num_updates=24800, lr=0.00040161, gnorm=0.198, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=22961 epoch 015: 1293 / 1689 loss=3.754, nll_loss=2.225, ppl=4.68, wps=551899, ups=1.11, wpb=495180, bsz=17003.4, num_updates=24900, lr=0.000400802, gnorm=0.18, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=23050 epoch 015: 1293 / 1689 loss=3.754, nll_loss=2.225, ppl=4.68, wps=551899, ups=1.11, wpb=495180, bsz=17003.4, num_updates=24900, lr=0.000400802, gnorm=0.18, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=23050 epoch 015: 1293 / 1689 loss=3.754, nll_loss=2.225, ppl=4.68, wps=551899, ups=1.11, wpb=495180, bsz=17003.4, num_updates=24900, lr=0.000400802, gnorm=0.18, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=23050 epoch 015: 1293 / 1689 loss=3.754, nll_loss=2.225, ppl=4.68, wps=551899, ups=1.11, wpb=495180, bsz=17003.4, num_updates=24900, lr=0.000400802, gnorm=0.18, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=23050 epoch 015: 1293 / 1689 loss=3.754, nll_loss=2.225, ppl=4.68, wps=551899, ups=1.11, wpb=495180, bsz=17003.4, num_updates=24900, lr=0.000400802, gnorm=0.18, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=23050 epoch 015: 1293 / 1689 loss=3.754, nll_loss=2.225, ppl=4.68, wps=551899, ups=1.11, wpb=495180, bsz=17003.4, num_updates=24900, lr=0.000400802, gnorm=0.18, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=23050 epoch 015: 1293 / 1689 loss=3.754, nll_loss=2.225, ppl=4.68, wps=551899, ups=1.11, wpb=495180, bsz=17003.4, num_updates=24900, lr=0.000400802, gnorm=0.18, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=23050 epoch 015: 1293 / 1689 loss=3.754, nll_loss=2.225, ppl=4.68, wps=551899, ups=1.11, wpb=495180, bsz=17003.4, num_updates=24900, lr=0.000400802, gnorm=0.18, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=23050 epoch 015: 1293 / 1689 loss=3.754, nll_loss=2.225, ppl=4.68, wps=551899, ups=1.11, wpb=495180, bsz=17003.4, num_updates=24900, lr=0.000400802, gnorm=0.18, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=23050 epoch 015: 1293 / 1689 loss=3.754, nll_loss=2.225, ppl=4.68, wps=551899, ups=1.11, wpb=495180, bsz=17003.4, num_updates=24900, lr=0.000400802, gnorm=0.18, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=23050 epoch 015: 1293 / 1689 loss=3.754, nll_loss=2.225, ppl=4.68, wps=551899, ups=1.11, wpb=495180, bsz=17003.4, num_updates=24900, lr=0.000400802, gnorm=0.18, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=23050 epoch 015: 1293 / 1689 loss=3.754, nll_loss=2.225, ppl=4.68, wps=551899, ups=1.11, wpb=495180, bsz=17003.4, num_updates=24900, lr=0.000400802, gnorm=0.18, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=23050 epoch 015: 1293 / 1689 loss=3.754, nll_loss=2.225, ppl=4.68, wps=551899, ups=1.11, wpb=495180, bsz=17003.4, num_updates=24900, lr=0.000400802, gnorm=0.18, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=23050 epoch 015: 1293 / 1689 loss=3.754, nll_loss=2.225, ppl=4.68, wps=551899, ups=1.11, wpb=495180, bsz=17003.4, num_updates=24900, lr=0.000400802, gnorm=0.18, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=23050 epoch 015: 1293 / 1689 loss=3.754, nll_loss=2.225, ppl=4.68, wps=551899, ups=1.11, wpb=495180, bsz=17003.4, num_updates=24900, lr=0.000400802, gnorm=0.18, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=23050 epoch 015: 1393 / 1689 loss=3.758, nll_loss=2.229, ppl=4.69, wps=557582, ups=1.12, wpb=497054, bsz=16506.8, num_updates=25000, lr=0.0004, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=23140 epoch 015: 1393 / 1689 loss=3.758, nll_loss=2.229, ppl=4.69, wps=557582, ups=1.12, wpb=497054, bsz=16506.8, num_updates=25000, lr=0.0004, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=23140 epoch 015: 1393 / 1689 loss=3.758, nll_loss=2.229, ppl=4.69, wps=557582, ups=1.12, wpb=497054, bsz=16506.8, num_updates=25000, lr=0.0004, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=23140 epoch 015: 1393 / 1689 loss=3.758, nll_loss=2.229, ppl=4.69, wps=557582, ups=1.12, wpb=497054, bsz=16506.8, num_updates=25000, lr=0.0004, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=23140 epoch 015: 1393 / 1689 loss=3.758, nll_loss=2.229, ppl=4.69, wps=557582, ups=1.12, wpb=497054, bsz=16506.8, num_updates=25000, lr=0.0004, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=23140 epoch 015: 1393 / 1689 loss=3.758, nll_loss=2.229, ppl=4.69, wps=557582, ups=1.12, wpb=497054, bsz=16506.8, num_updates=25000, lr=0.0004, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=23140 epoch 015: 1393 / 1689 loss=3.758, nll_loss=2.229, ppl=4.69, wps=557582, ups=1.12, wpb=497054, bsz=16506.8, num_updates=25000, lr=0.0004, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=23140 epoch 015: 1393 / 1689 loss=3.758, nll_loss=2.229, ppl=4.69, wps=557582, ups=1.12, wpb=497054, bsz=16506.8, num_updates=25000, lr=0.0004, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=23140 epoch 015: 1393 / 1689 loss=3.758, nll_loss=2.229, ppl=4.69, wps=557582, ups=1.12, wpb=497054, bsz=16506.8, num_updates=25000, lr=0.0004, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=23140 epoch 015: 1393 / 1689 loss=3.758, nll_loss=2.229, ppl=4.69, wps=557582, ups=1.12, wpb=497054, bsz=16506.8, num_updates=25000, lr=0.0004, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=23140 epoch 015: 1393 / 1689 loss=3.758, nll_loss=2.229, ppl=4.69, wps=557582, ups=1.12, wpb=497054, bsz=16506.8, num_updates=25000, lr=0.0004, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=23140 epoch 015: 1393 / 1689 loss=3.758, nll_loss=2.229, ppl=4.69, wps=557582, ups=1.12, wpb=497054, bsz=16506.8, num_updates=25000, lr=0.0004, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=23140 epoch 015: 1393 / 1689 loss=3.758, nll_loss=2.229, ppl=4.69, wps=557582, ups=1.12, wpb=497054, bsz=16506.8, num_updates=25000, lr=0.0004, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=23140 epoch 015: 1393 / 1689 loss=3.758, nll_loss=2.229, ppl=4.69, wps=557582, ups=1.12, wpb=497054, bsz=16506.8, num_updates=25000, lr=0.0004, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=23140 epoch 015: 1393 / 1689 loss=3.758, nll_loss=2.229, ppl=4.69, wps=557582, ups=1.12, wpb=497054, bsz=16506.8, num_updates=25000, lr=0.0004, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=23140 begin validation on "valid" subset epoch 015 | valid on 'valid' subset | loss 3.753 | nll_loss 2.185 | ppl 4.55 | wps 0 | wpb 44526 | bsz 2008 | num_updates 25000 | best_loss 3.753 epoch 015 | valid on 'valid' subset | loss 3.753 | nll_loss 2.185 | ppl 4.55 | wps 0 | wpb 44526 | bsz 2008 | num_updates 25000 | best_loss 3.753 epoch 015 | valid on 'valid' subset | loss 3.753 | nll_loss 2.185 | ppl 4.55 | wps 0 | wpb 44526 | bsz 2008 | num_updates 25000 | best_loss 3.753 epoch 015 | valid on 'valid' subset | loss 3.753 | nll_loss 2.185 | ppl 4.55 | wps 0 | wpb 44526 | bsz 2008 | num_updates 25000 | best_loss 3.753 epoch 015 | valid on 'valid' subset | loss 3.753 | nll_loss 2.185 | ppl 4.55 | wps 0 | wpb 44526 | bsz 2008 | num_updates 25000 | best_loss 3.753 epoch 015 | valid on 'valid' subset | loss 3.753 | nll_loss 2.185 | ppl 4.55 | wps 0 | wpb 44526 | bsz 2008 | num_updates 25000 | best_loss 3.753 epoch 015 | valid on 'valid' subset | loss 3.753 | nll_loss 2.185 | ppl 4.55 | wps 0 | wpb 44526 | bsz 2008 | num_updates 25000 | best_loss 3.753 epoch 015 | valid on 'valid' subset | loss 3.753 | nll_loss 2.185 | ppl 4.55 | wps 0 | wpb 44526 | bsz 2008 | num_updates 25000 | best_loss 3.753 epoch 015 | valid on 'valid' subset | loss 3.753 | nll_loss 2.185 | ppl 4.55 | wps 0 | wpb 44526 | bsz 2008 | num_updates 25000 | best_loss 3.753 epoch 015 | valid on 'valid' subset | loss 3.753 | nll_loss 2.185 | ppl 4.55 | wps 0 | wpb 44526 | bsz 2008 | num_updates 25000 | best_loss 3.753 epoch 015 | valid on 'valid' subset | loss 3.753 | nll_loss 2.185 | ppl 4.55 | wps 0 | wpb 44526 | bsz 2008 | num_updates 25000 | best_loss 3.753 epoch 015 | valid on 'valid' subset | loss 3.753 | nll_loss 2.185 | ppl 4.55 | wps 0 | wpb 44526 | bsz 2008 | num_updates 25000 | best_loss 3.753 epoch 015 | valid on 'valid' subset | loss 3.753 | nll_loss 2.185 | ppl 4.55 | wps 0 | wpb 44526 | bsz 2008 | num_updates 25000 | best_loss 3.753 epoch 015 | valid on 'valid' subset | loss 3.753 | nll_loss 2.185 | ppl 4.55 | wps 0 | wpb 44526 | bsz 2008 | num_updates 25000 | best_loss 3.753 epoch 015 | valid on 'valid' subset | loss 3.753 | nll_loss 2.185 | ppl 4.55 | wps 0 | wpb 44526 | bsz 2008 | num_updates 25000 | best_loss 3.753 epoch 015: 1493 / 1689 loss=3.759, nll_loss=2.23, ppl=4.69, wps=450973, ups=0.91, wpb=494776, bsz=16904.3, num_updates=25100, lr=0.000399202, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=23249 epoch 015: 1493 / 1689 loss=3.759, nll_loss=2.23, ppl=4.69, wps=450973, ups=0.91, wpb=494776, bsz=16904.3, num_updates=25100, lr=0.000399202, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=23249 epoch 015: 1493 / 1689 loss=3.759, nll_loss=2.23, ppl=4.69, wps=450973, ups=0.91, wpb=494776, bsz=16904.3, num_updates=25100, lr=0.000399202, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=23249 epoch 015: 1493 / 1689 loss=3.759, nll_loss=2.23, ppl=4.69, wps=450973, ups=0.91, wpb=494776, bsz=16904.3, num_updates=25100, lr=0.000399202, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=23249 epoch 015: 1493 / 1689 loss=3.759, nll_loss=2.23, ppl=4.69, wps=450973, ups=0.91, wpb=494776, bsz=16904.3, num_updates=25100, lr=0.000399202, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=23249 epoch 015: 1493 / 1689 loss=3.759, nll_loss=2.23, ppl=4.69, wps=450973, ups=0.91, wpb=494776, bsz=16904.3, num_updates=25100, lr=0.000399202, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=23249 epoch 015: 1493 / 1689 loss=3.759, nll_loss=2.23, ppl=4.69, wps=450973, ups=0.91, wpb=494776, bsz=16904.3, num_updates=25100, lr=0.000399202, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=23249 epoch 015: 1493 / 1689 loss=3.759, nll_loss=2.23, ppl=4.69, wps=450973, ups=0.91, wpb=494776, bsz=16904.3, num_updates=25100, lr=0.000399202, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=23249 epoch 015: 1493 / 1689 loss=3.759, nll_loss=2.23, ppl=4.69, wps=450973, ups=0.91, wpb=494776, bsz=16904.3, num_updates=25100, lr=0.000399202, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=23249 epoch 015: 1493 / 1689 loss=3.759, nll_loss=2.23, ppl=4.69, wps=450973, ups=0.91, wpb=494776, bsz=16904.3, num_updates=25100, lr=0.000399202, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=23249 epoch 015: 1493 / 1689 loss=3.759, nll_loss=2.23, ppl=4.69, wps=450973, ups=0.91, wpb=494776, bsz=16904.3, num_updates=25100, lr=0.000399202, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=23249 epoch 015: 1493 / 1689 loss=3.759, nll_loss=2.23, ppl=4.69, wps=450973, ups=0.91, wpb=494776, bsz=16904.3, num_updates=25100, lr=0.000399202, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=23249 epoch 015: 1493 / 1689 loss=3.759, nll_loss=2.23, ppl=4.69, wps=450973, ups=0.91, wpb=494776, bsz=16904.3, num_updates=25100, lr=0.000399202, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=23249 epoch 015: 1493 / 1689 loss=3.759, nll_loss=2.23, ppl=4.69, wps=450973, ups=0.91, wpb=494776, bsz=16904.3, num_updates=25100, lr=0.000399202, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=23249 epoch 015: 1493 / 1689 loss=3.759, nll_loss=2.23, ppl=4.69, wps=450973, ups=0.91, wpb=494776, bsz=16904.3, num_updates=25100, lr=0.000399202, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=23249 epoch 015: 1593 / 1689 loss=3.768, nll_loss=2.24, ppl=4.73, wps=549559, ups=1.11, wpb=496105, bsz=16486.5, num_updates=25200, lr=0.00039841, gnorm=0.203, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=23339 epoch 015: 1593 / 1689 loss=3.768, nll_loss=2.24, ppl=4.73, wps=549559, ups=1.11, wpb=496105, bsz=16486.5, num_updates=25200, lr=0.00039841, gnorm=0.203, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=23339 epoch 015: 1593 / 1689 loss=3.768, nll_loss=2.24, ppl=4.73, wps=549559, ups=1.11, wpb=496105, bsz=16486.5, num_updates=25200, lr=0.00039841, gnorm=0.203, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=23339 epoch 015: 1593 / 1689 loss=3.768, nll_loss=2.24, ppl=4.73, wps=549559, ups=1.11, wpb=496105, bsz=16486.5, num_updates=25200, lr=0.00039841, gnorm=0.203, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=23339 epoch 015: 1593 / 1689 loss=3.768, nll_loss=2.24, ppl=4.73, wps=549559, ups=1.11, wpb=496105, bsz=16486.5, num_updates=25200, lr=0.00039841, gnorm=0.203, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=23339 epoch 015: 1593 / 1689 loss=3.768, nll_loss=2.24, ppl=4.73, wps=549559, ups=1.11, wpb=496105, bsz=16486.5, num_updates=25200, lr=0.00039841, gnorm=0.203, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=23339 epoch 015: 1593 / 1689 loss=3.768, nll_loss=2.24, ppl=4.73, wps=549559, ups=1.11, wpb=496105, bsz=16486.5, num_updates=25200, lr=0.00039841, gnorm=0.203, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=23339 epoch 015: 1593 / 1689 loss=3.768, nll_loss=2.24, ppl=4.73, wps=549559, ups=1.11, wpb=496105, bsz=16486.5, num_updates=25200, lr=0.00039841, gnorm=0.203, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=23339 epoch 015: 1593 / 1689 loss=3.768, nll_loss=2.24, ppl=4.73, wps=549559, ups=1.11, wpb=496105, bsz=16486.5, num_updates=25200, lr=0.00039841, gnorm=0.203, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=23339 epoch 015: 1593 / 1689 loss=3.768, nll_loss=2.24, ppl=4.73, wps=549559, ups=1.11, wpb=496105, bsz=16486.5, num_updates=25200, lr=0.00039841, gnorm=0.203, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=23339 epoch 015: 1593 / 1689 loss=3.768, nll_loss=2.24, ppl=4.73, wps=549559, ups=1.11, wpb=496105, bsz=16486.5, num_updates=25200, lr=0.00039841, gnorm=0.203, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=23339 epoch 015: 1593 / 1689 loss=3.768, nll_loss=2.24, ppl=4.73, wps=549559, ups=1.11, wpb=496105, bsz=16486.5, num_updates=25200, lr=0.00039841, gnorm=0.203, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=23339 epoch 015: 1593 / 1689 loss=3.768, nll_loss=2.24, ppl=4.73, wps=549559, ups=1.11, wpb=496105, bsz=16486.5, num_updates=25200, lr=0.00039841, gnorm=0.203, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=23339 epoch 015: 1593 / 1689 loss=3.768, nll_loss=2.24, ppl=4.73, wps=549559, ups=1.11, wpb=496105, bsz=16486.5, num_updates=25200, lr=0.00039841, gnorm=0.203, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=23339 epoch 015: 1593 / 1689 loss=3.768, nll_loss=2.24, ppl=4.73, wps=549559, ups=1.11, wpb=496105, bsz=16486.5, num_updates=25200, lr=0.00039841, gnorm=0.203, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=23339 end of epoch 15 (average epoch stats below) epoch 015 | loss 3.759 | nll_loss 2.23 | ppl 4.69 | wps 538686 | ups 1.09 | wpb 495108 | bsz 16506.4 | num_updates 25296 | lr 0.000397653 | gnorm 0.188 | clip 0 | loss_scale 4 | train_wall 1493 | gb_free 24.1 | wall 23425 epoch 015 | loss 3.759 | nll_loss 2.23 | ppl 4.69 | wps 538686 | ups 1.09 | wpb 495108 | bsz 16506.4 | num_updates 25296 | lr 0.000397653 | gnorm 0.188 | clip 0 | loss_scale 4 | train_wall 1493 | gb_free 24.1 | wall 23425 epoch 015 | loss 3.759 | nll_loss 2.23 | ppl 4.69 | wps 538686 | ups 1.09 | wpb 495108 | bsz 16506.4 | num_updates 25296 | lr 0.000397653 | gnorm 0.188 | clip 0 | loss_scale 4 | train_wall 1493 | gb_free 24.1 | wall 23425 epoch 015 | loss 3.759 | nll_loss 2.23 | ppl 4.69 | wps 538686 | ups 1.09 | wpb 495108 | bsz 16506.4 | num_updates 25296 | lr 0.000397653 | gnorm 0.188 | clip 0 | loss_scale 4 | train_wall 1493 | gb_free 24.1 | wall 23425 epoch 015 | loss 3.759 | nll_loss 2.23 | ppl 4.69 | wps 538686 | ups 1.09 | wpb 495108 | bsz 16506.4 | num_updates 25296 | lr 0.000397653 | gnorm 0.188 | clip 0 | loss_scale 4 | train_wall 1493 | gb_free 24.1 | wall 23425 epoch 015 | loss 3.759 | nll_loss 2.23 | ppl 4.69 | wps 538686 | ups 1.09 | wpb 495108 | bsz 16506.4 | num_updates 25296 | lr 0.000397653 | gnorm 0.188 | clip 0 | loss_scale 4 | train_wall 1493 | gb_free 24.1 | wall 23425 epoch 015 | loss 3.759 | nll_loss 2.23 | ppl 4.69 | wps 538686 | ups 1.09 | wpb 495108 | bsz 16506.4 | num_updates 25296 | lr 0.000397653 | gnorm 0.188 | clip 0 | loss_scale 4 | train_wall 1493 | gb_free 24.1 | wall 23425 epoch 015 | loss 3.759 | nll_loss 2.23 | ppl 4.69 | wps 538686 | ups 1.09 | wpb 495108 | bsz 16506.4 | num_updates 25296 | lr 0.000397653 | gnorm 0.188 | clip 0 | loss_scale 4 | train_wall 1493 | gb_free 24.1 | wall 23425 epoch 015 | loss 3.759 | nll_loss 2.23 | ppl 4.69 | wps 538686 | ups 1.09 | wpb 495108 | bsz 16506.4 | num_updates 25296 | lr 0.000397653 | gnorm 0.188 | clip 0 | loss_scale 4 | train_wall 1493 | gb_free 24.1 | wall 23425 epoch 015 | loss 3.759 | nll_loss 2.23 | ppl 4.69 | wps 538686 | ups 1.09 | wpb 495108 | bsz 16506.4 | num_updates 25296 | lr 0.000397653 | gnorm 0.188 | clip 0 | loss_scale 4 | train_wall 1493 | gb_free 24.1 | wall 23425 epoch 015 | loss 3.759 | nll_loss 2.23 | ppl 4.69 | wps 538686 | ups 1.09 | wpb 495108 | bsz 16506.4 | num_updates 25296 | lr 0.000397653 | gnorm 0.188 | clip 0 | loss_scale 4 | train_wall 1493 | gb_free 24.1 | wall 23425 epoch 015 | loss 3.759 | nll_loss 2.23 | ppl 4.69 | wps 538686 | ups 1.09 | wpb 495108 | bsz 16506.4 | num_updates 25296 | lr 0.000397653 | gnorm 0.188 | clip 0 | loss_scale 4 | train_wall 1493 | gb_free 24.1 | wall 23425 epoch 015 | loss 3.759 | nll_loss 2.23 | ppl 4.69 | wps 538686 | ups 1.09 | wpb 495108 | bsz 16506.4 | num_updates 25296 | lr 0.000397653 | gnorm 0.188 | clip 0 | loss_scale 4 | train_wall 1493 | gb_free 24.1 | wall 23425 epoch 015 | loss 3.759 | nll_loss 2.23 | ppl 4.69 | wps 538686 | ups 1.09 | wpb 495108 | bsz 16506.4 | num_updates 25296 | lr 0.000397653 | gnorm 0.188 | clip 0 | loss_scale 4 | train_wall 1493 | gb_free 24.1 | wall 23425 epoch 015 | loss 3.759 | nll_loss 2.23 | ppl 4.69 | wps 538686 | ups 1.09 | wpb 495108 | bsz 16506.4 | num_updates 25296 | lr 0.000397653 | gnorm 0.188 | clip 0 | loss_scale 4 | train_wall 1493 | gb_free 24.1 | wall 23425 Start iterating over samples epoch 016: 4 / 1689 loss=3.767, nll_loss=2.239, ppl=4.72, wps=549186, ups=1.12, wpb=491768, bsz=16016.6, num_updates=25300, lr=0.000397621, gnorm=0.179, clip=0, loss_scale=4, train_wall=87, gb_free=21.4, wall=23429 epoch 016: 4 / 1689 loss=3.767, nll_loss=2.239, ppl=4.72, wps=549186, ups=1.12, wpb=491768, bsz=16016.6, num_updates=25300, lr=0.000397621, gnorm=0.179, clip=0, loss_scale=4, train_wall=87, gb_free=21.4, wall=23429 epoch 016: 4 / 1689 loss=3.767, nll_loss=2.239, ppl=4.72, wps=549186, ups=1.12, wpb=491768, bsz=16016.6, num_updates=25300, lr=0.000397621, gnorm=0.179, clip=0, loss_scale=4, train_wall=87, gb_free=21.4, wall=23429 epoch 016: 4 / 1689 loss=3.767, nll_loss=2.239, ppl=4.72, wps=549186, ups=1.12, wpb=491768, bsz=16016.6, num_updates=25300, lr=0.000397621, gnorm=0.179, clip=0, loss_scale=4, train_wall=87, gb_free=21.4, wall=23429 epoch 016: 4 / 1689 loss=3.767, nll_loss=2.239, ppl=4.72, wps=549186, ups=1.12, wpb=491768, bsz=16016.6, num_updates=25300, lr=0.000397621, gnorm=0.179, clip=0, loss_scale=4, train_wall=87, gb_free=21.4, wall=23429 epoch 016: 4 / 1689 loss=3.767, nll_loss=2.239, ppl=4.72, wps=549186, ups=1.12, wpb=491768, bsz=16016.6, num_updates=25300, lr=0.000397621, gnorm=0.179, clip=0, loss_scale=4, train_wall=87, gb_free=21.4, wall=23429 epoch 016: 4 / 1689 loss=3.767, nll_loss=2.239, ppl=4.72, wps=549186, ups=1.12, wpb=491768, bsz=16016.6, num_updates=25300, lr=0.000397621, gnorm=0.179, clip=0, loss_scale=4, train_wall=87, gb_free=21.4, wall=23429 epoch 016: 4 / 1689 loss=3.767, nll_loss=2.239, ppl=4.72, wps=549186, ups=1.12, wpb=491768, bsz=16016.6, num_updates=25300, lr=0.000397621, gnorm=0.179, clip=0, loss_scale=4, train_wall=87, gb_free=21.4, wall=23429 epoch 016: 4 / 1689 loss=3.767, nll_loss=2.239, ppl=4.72, wps=549186, ups=1.12, wpb=491768, bsz=16016.6, num_updates=25300, lr=0.000397621, gnorm=0.179, clip=0, loss_scale=4, train_wall=87, gb_free=21.4, wall=23429 epoch 016: 4 / 1689 loss=3.767, nll_loss=2.239, ppl=4.72, wps=549186, ups=1.12, wpb=491768, bsz=16016.6, num_updates=25300, lr=0.000397621, gnorm=0.179, clip=0, loss_scale=4, train_wall=87, gb_free=21.4, wall=23429 epoch 016: 4 / 1689 loss=3.767, nll_loss=2.239, ppl=4.72, wps=549186, ups=1.12, wpb=491768, bsz=16016.6, num_updates=25300, lr=0.000397621, gnorm=0.179, clip=0, loss_scale=4, train_wall=87, gb_free=21.4, wall=23429 epoch 016: 4 / 1689 loss=3.767, nll_loss=2.239, ppl=4.72, wps=549186, ups=1.12, wpb=491768, bsz=16016.6, num_updates=25300, lr=0.000397621, gnorm=0.179, clip=0, loss_scale=4, train_wall=87, gb_free=21.4, wall=23429 epoch 016: 4 / 1689 loss=3.767, nll_loss=2.239, ppl=4.72, wps=549186, ups=1.12, wpb=491768, bsz=16016.6, num_updates=25300, lr=0.000397621, gnorm=0.179, clip=0, loss_scale=4, train_wall=87, gb_free=21.4, wall=23429 epoch 016: 4 / 1689 loss=3.767, nll_loss=2.239, ppl=4.72, wps=549186, ups=1.12, wpb=491768, bsz=16016.6, num_updates=25300, lr=0.000397621, gnorm=0.179, clip=0, loss_scale=4, train_wall=87, gb_free=21.4, wall=23429 epoch 016: 4 / 1689 loss=3.767, nll_loss=2.239, ppl=4.72, wps=549186, ups=1.12, wpb=491768, bsz=16016.6, num_updates=25300, lr=0.000397621, gnorm=0.179, clip=0, loss_scale=4, train_wall=87, gb_free=21.4, wall=23429 epoch 016: 4 / 1689 loss=3.767, nll_loss=2.239, ppl=4.72, wps=549186, ups=1.12, wpb=491768, bsz=16016.6, num_updates=25300, lr=0.000397621, gnorm=0.179, clip=0, loss_scale=4, train_wall=87, gb_free=21.4, wall=23429 epoch 016: 105 / 1689 loss=3.745, nll_loss=2.214, ppl=4.64, wps=546827, ups=1.1, wpb=495040, bsz=16338.9, num_updates=25400, lr=0.000396838, gnorm=0.188, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=23520 epoch 016: 105 / 1689 loss=3.745, nll_loss=2.214, ppl=4.64, wps=546827, ups=1.1, wpb=495040, bsz=16338.9, num_updates=25400, lr=0.000396838, gnorm=0.188, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=23520 epoch 016: 105 / 1689 loss=3.745, nll_loss=2.214, ppl=4.64, wps=546827, ups=1.1, wpb=495040, bsz=16338.9, num_updates=25400, lr=0.000396838, gnorm=0.188, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=23520 epoch 016: 105 / 1689 loss=3.745, nll_loss=2.214, ppl=4.64, wps=546827, ups=1.1, wpb=495040, bsz=16338.9, num_updates=25400, lr=0.000396838, gnorm=0.188, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=23520 epoch 016: 105 / 1689 loss=3.745, nll_loss=2.214, ppl=4.64, wps=546827, ups=1.1, wpb=495040, bsz=16338.9, num_updates=25400, lr=0.000396838, gnorm=0.188, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=23520 epoch 016: 105 / 1689 loss=3.745, nll_loss=2.214, ppl=4.64, wps=546827, ups=1.1, wpb=495040, bsz=16338.9, num_updates=25400, lr=0.000396838, gnorm=0.188, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=23520 epoch 016: 105 / 1689 loss=3.745, nll_loss=2.214, ppl=4.64, wps=546827, ups=1.1, wpb=495040, bsz=16338.9, num_updates=25400, lr=0.000396838, gnorm=0.188, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=23520 epoch 016: 105 / 1689 loss=3.745, nll_loss=2.214, ppl=4.64, wps=546827, ups=1.1, wpb=495040, bsz=16338.9, num_updates=25400, lr=0.000396838, gnorm=0.188, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=23520 epoch 016: 105 / 1689 loss=3.745, nll_loss=2.214, ppl=4.64, wps=546827, ups=1.1, wpb=495040, bsz=16338.9, num_updates=25400, lr=0.000396838, gnorm=0.188, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=23520 epoch 016: 105 / 1689 loss=3.745, nll_loss=2.214, ppl=4.64, wps=546827, ups=1.1, wpb=495040, bsz=16338.9, num_updates=25400, lr=0.000396838, gnorm=0.188, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=23520 epoch 016: 105 / 1689 loss=3.745, nll_loss=2.214, ppl=4.64, wps=546827, ups=1.1, wpb=495040, bsz=16338.9, num_updates=25400, lr=0.000396838, gnorm=0.188, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=23520 epoch 016: 105 / 1689 loss=3.745, nll_loss=2.214, ppl=4.64, wps=546827, ups=1.1, wpb=495040, bsz=16338.9, num_updates=25400, lr=0.000396838, gnorm=0.188, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=23520 epoch 016: 105 / 1689 loss=3.745, nll_loss=2.214, ppl=4.64, wps=546827, ups=1.1, wpb=495040, bsz=16338.9, num_updates=25400, lr=0.000396838, gnorm=0.188, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=23520 epoch 016: 105 / 1689 loss=3.745, nll_loss=2.214, ppl=4.64, wps=546827, ups=1.1, wpb=495040, bsz=16338.9, num_updates=25400, lr=0.000396838, gnorm=0.188, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=23520 epoch 016: 105 / 1689 loss=3.745, nll_loss=2.214, ppl=4.64, wps=546827, ups=1.1, wpb=495040, bsz=16338.9, num_updates=25400, lr=0.000396838, gnorm=0.188, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=23520 epoch 016: 105 / 1689 loss=3.745, nll_loss=2.214, ppl=4.64, wps=546827, ups=1.1, wpb=495040, bsz=16338.9, num_updates=25400, lr=0.000396838, gnorm=0.188, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=23520 epoch 016: 205 / 1689 loss=3.742, nll_loss=2.211, ppl=4.63, wps=546005, ups=1.1, wpb=496651, bsz=16672.2, num_updates=25500, lr=0.000396059, gnorm=0.181, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=23611 epoch 016: 205 / 1689 loss=3.742, nll_loss=2.211, ppl=4.63, wps=546005, ups=1.1, wpb=496651, bsz=16672.2, num_updates=25500, lr=0.000396059, gnorm=0.181, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=23611 epoch 016: 205 / 1689 loss=3.742, nll_loss=2.211, ppl=4.63, wps=546005, ups=1.1, wpb=496651, bsz=16672.2, num_updates=25500, lr=0.000396059, gnorm=0.181, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=23611 epoch 016: 205 / 1689 loss=3.742, nll_loss=2.211, ppl=4.63, wps=546005, ups=1.1, wpb=496651, bsz=16672.2, num_updates=25500, lr=0.000396059, gnorm=0.181, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=23611 epoch 016: 205 / 1689 loss=3.742, nll_loss=2.211, ppl=4.63, wps=546005, ups=1.1, wpb=496651, bsz=16672.2, num_updates=25500, lr=0.000396059, gnorm=0.181, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=23611 epoch 016: 205 / 1689 loss=3.742, nll_loss=2.211, ppl=4.63, wps=546005, ups=1.1, wpb=496651, bsz=16672.2, num_updates=25500, lr=0.000396059, gnorm=0.181, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=23611 epoch 016: 205 / 1689 loss=3.742, nll_loss=2.211, ppl=4.63, wps=546005, ups=1.1, wpb=496651, bsz=16672.2, num_updates=25500, lr=0.000396059, gnorm=0.181, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=23611 epoch 016: 205 / 1689 loss=3.742, nll_loss=2.211, ppl=4.63, wps=546005, ups=1.1, wpb=496651, bsz=16672.2, num_updates=25500, lr=0.000396059, gnorm=0.181, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=23611 epoch 016: 205 / 1689 loss=3.742, nll_loss=2.211, ppl=4.63, wps=546005, ups=1.1, wpb=496651, bsz=16672.2, num_updates=25500, lr=0.000396059, gnorm=0.181, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=23611 epoch 016: 205 / 1689 loss=3.742, nll_loss=2.211, ppl=4.63, wps=546005, ups=1.1, wpb=496651, bsz=16672.2, num_updates=25500, lr=0.000396059, gnorm=0.181, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=23611 epoch 016: 205 / 1689 loss=3.742, nll_loss=2.211, ppl=4.63, wps=546005, ups=1.1, wpb=496651, bsz=16672.2, num_updates=25500, lr=0.000396059, gnorm=0.181, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=23611 epoch 016: 205 / 1689 loss=3.742, nll_loss=2.211, ppl=4.63, wps=546005, ups=1.1, wpb=496651, bsz=16672.2, num_updates=25500, lr=0.000396059, gnorm=0.181, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=23611 epoch 016: 205 / 1689 loss=3.742, nll_loss=2.211, ppl=4.63, wps=546005, ups=1.1, wpb=496651, bsz=16672.2, num_updates=25500, lr=0.000396059, gnorm=0.181, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=23611 epoch 016: 205 / 1689 loss=3.742, nll_loss=2.211, ppl=4.63, wps=546005, ups=1.1, wpb=496651, bsz=16672.2, num_updates=25500, lr=0.000396059, gnorm=0.181, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=23611 epoch 016: 205 / 1689 loss=3.742, nll_loss=2.211, ppl=4.63, wps=546005, ups=1.1, wpb=496651, bsz=16672.2, num_updates=25500, lr=0.000396059, gnorm=0.181, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=23611 epoch 016: 205 / 1689 loss=3.742, nll_loss=2.211, ppl=4.63, wps=546005, ups=1.1, wpb=496651, bsz=16672.2, num_updates=25500, lr=0.000396059, gnorm=0.181, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=23611 epoch 016: 305 / 1689 loss=3.75, nll_loss=2.22, ppl=4.66, wps=548885, ups=1.11, wpb=493438, bsz=16198.4, num_updates=25600, lr=0.000395285, gnorm=0.188, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=23700 epoch 016: 305 / 1689 loss=3.75, nll_loss=2.22, ppl=4.66, wps=548885, ups=1.11, wpb=493438, bsz=16198.4, num_updates=25600, lr=0.000395285, gnorm=0.188, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=23700 epoch 016: 305 / 1689 loss=3.75, nll_loss=2.22, ppl=4.66, wps=548885, ups=1.11, wpb=493438, bsz=16198.4, num_updates=25600, lr=0.000395285, gnorm=0.188, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=23700 epoch 016: 305 / 1689 loss=3.75, nll_loss=2.22, ppl=4.66, wps=548885, ups=1.11, wpb=493438, bsz=16198.4, num_updates=25600, lr=0.000395285, gnorm=0.188, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=23700 epoch 016: 305 / 1689 loss=3.75, nll_loss=2.22, ppl=4.66, wps=548885, ups=1.11, wpb=493438, bsz=16198.4, num_updates=25600, lr=0.000395285, gnorm=0.188, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=23700 epoch 016: 305 / 1689 loss=3.75, nll_loss=2.22, ppl=4.66, wps=548885, ups=1.11, wpb=493438, bsz=16198.4, num_updates=25600, lr=0.000395285, gnorm=0.188, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=23700 epoch 016: 305 / 1689 loss=3.75, nll_loss=2.22, ppl=4.66, wps=548885, ups=1.11, wpb=493438, bsz=16198.4, num_updates=25600, lr=0.000395285, gnorm=0.188, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=23700 epoch 016: 305 / 1689 loss=3.75, nll_loss=2.22, ppl=4.66, wps=548885, ups=1.11, wpb=493438, bsz=16198.4, num_updates=25600, lr=0.000395285, gnorm=0.188, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=23700 epoch 016: 305 / 1689 loss=3.75, nll_loss=2.22, ppl=4.66, wps=548885, ups=1.11, wpb=493438, bsz=16198.4, num_updates=25600, lr=0.000395285, gnorm=0.188, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=23700 epoch 016: 305 / 1689 loss=3.75, nll_loss=2.22, ppl=4.66, wps=548885, ups=1.11, wpb=493438, bsz=16198.4, num_updates=25600, lr=0.000395285, gnorm=0.188, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=23700 epoch 016: 305 / 1689 loss=3.75, nll_loss=2.22, ppl=4.66, wps=548885, ups=1.11, wpb=493438, bsz=16198.4, num_updates=25600, lr=0.000395285, gnorm=0.188, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=23700 epoch 016: 305 / 1689 loss=3.75, nll_loss=2.22, ppl=4.66, wps=548885, ups=1.11, wpb=493438, bsz=16198.4, num_updates=25600, lr=0.000395285, gnorm=0.188, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=23700 epoch 016: 305 / 1689 loss=3.75, nll_loss=2.22, ppl=4.66, wps=548885, ups=1.11, wpb=493438, bsz=16198.4, num_updates=25600, lr=0.000395285, gnorm=0.188, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=23700 epoch 016: 305 / 1689 loss=3.75, nll_loss=2.22, ppl=4.66, wps=548885, ups=1.11, wpb=493438, bsz=16198.4, num_updates=25600, lr=0.000395285, gnorm=0.188, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=23700 epoch 016: 305 / 1689 loss=3.75, nll_loss=2.22, ppl=4.66, wps=548885, ups=1.11, wpb=493438, bsz=16198.4, num_updates=25600, lr=0.000395285, gnorm=0.188, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=23700 epoch 016: 305 / 1689 loss=3.75, nll_loss=2.22, ppl=4.66, wps=548885, ups=1.11, wpb=493438, bsz=16198.4, num_updates=25600, lr=0.000395285, gnorm=0.188, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=23700 epoch 016: 405 / 1689 loss=3.747, nll_loss=2.217, ppl=4.65, wps=551220, ups=1.11, wpb=494949, bsz=16288.2, num_updates=25700, lr=0.000394515, gnorm=0.189, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=23790 epoch 016: 405 / 1689 loss=3.747, nll_loss=2.217, ppl=4.65, wps=551220, ups=1.11, wpb=494949, bsz=16288.2, num_updates=25700, lr=0.000394515, gnorm=0.189, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=23790 epoch 016: 405 / 1689 loss=3.747, nll_loss=2.217, ppl=4.65, wps=551220, ups=1.11, wpb=494949, bsz=16288.2, num_updates=25700, lr=0.000394515, gnorm=0.189, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=23790 epoch 016: 405 / 1689 loss=3.747, nll_loss=2.217, ppl=4.65, wps=551220, ups=1.11, wpb=494949, bsz=16288.2, num_updates=25700, lr=0.000394515, gnorm=0.189, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=23790 epoch 016: 405 / 1689 loss=3.747, nll_loss=2.217, ppl=4.65, wps=551220, ups=1.11, wpb=494949, bsz=16288.2, num_updates=25700, lr=0.000394515, gnorm=0.189, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=23790 epoch 016: 405 / 1689 loss=3.747, nll_loss=2.217, ppl=4.65, wps=551220, ups=1.11, wpb=494949, bsz=16288.2, num_updates=25700, lr=0.000394515, gnorm=0.189, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=23790 epoch 016: 405 / 1689 loss=3.747, nll_loss=2.217, ppl=4.65, wps=551220, ups=1.11, wpb=494949, bsz=16288.2, num_updates=25700, lr=0.000394515, gnorm=0.189, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=23790 epoch 016: 405 / 1689 loss=3.747, nll_loss=2.217, ppl=4.65, wps=551220, ups=1.11, wpb=494949, bsz=16288.2, num_updates=25700, lr=0.000394515, gnorm=0.189, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=23790 epoch 016: 405 / 1689 loss=3.747, nll_loss=2.217, ppl=4.65, wps=551220, ups=1.11, wpb=494949, bsz=16288.2, num_updates=25700, lr=0.000394515, gnorm=0.189, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=23790 epoch 016: 405 / 1689 loss=3.747, nll_loss=2.217, ppl=4.65, wps=551220, ups=1.11, wpb=494949, bsz=16288.2, num_updates=25700, lr=0.000394515, gnorm=0.189, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=23790 epoch 016: 405 / 1689 loss=3.747, nll_loss=2.217, ppl=4.65, wps=551220, ups=1.11, wpb=494949, bsz=16288.2, num_updates=25700, lr=0.000394515, gnorm=0.189, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=23790 epoch 016: 405 / 1689 loss=3.747, nll_loss=2.217, ppl=4.65, wps=551220, ups=1.11, wpb=494949, bsz=16288.2, num_updates=25700, lr=0.000394515, gnorm=0.189, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=23790 epoch 016: 405 / 1689 loss=3.747, nll_loss=2.217, ppl=4.65, wps=551220, ups=1.11, wpb=494949, bsz=16288.2, num_updates=25700, lr=0.000394515, gnorm=0.189, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=23790 epoch 016: 405 / 1689 loss=3.747, nll_loss=2.217, ppl=4.65, wps=551220, ups=1.11, wpb=494949, bsz=16288.2, num_updates=25700, lr=0.000394515, gnorm=0.189, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=23790 epoch 016: 405 / 1689 loss=3.747, nll_loss=2.217, ppl=4.65, wps=551220, ups=1.11, wpb=494949, bsz=16288.2, num_updates=25700, lr=0.000394515, gnorm=0.189, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=23790 epoch 016: 405 / 1689 loss=3.747, nll_loss=2.217, ppl=4.65, wps=551220, ups=1.11, wpb=494949, bsz=16288.2, num_updates=25700, lr=0.000394515, gnorm=0.189, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=23790 epoch 016: 505 / 1689 loss=3.751, nll_loss=2.221, ppl=4.66, wps=549398, ups=1.11, wpb=495967, bsz=16123.8, num_updates=25800, lr=0.00039375, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=23881 epoch 016: 505 / 1689 loss=3.751, nll_loss=2.221, ppl=4.66, wps=549398, ups=1.11, wpb=495967, bsz=16123.8, num_updates=25800, lr=0.00039375, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=23881 epoch 016: 505 / 1689 loss=3.751, nll_loss=2.221, ppl=4.66, wps=549398, ups=1.11, wpb=495967, bsz=16123.8, num_updates=25800, lr=0.00039375, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=23881 epoch 016: 505 / 1689 loss=3.751, nll_loss=2.221, ppl=4.66, wps=549398, ups=1.11, wpb=495967, bsz=16123.8, num_updates=25800, lr=0.00039375, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=23881 epoch 016: 505 / 1689 loss=3.751, nll_loss=2.221, ppl=4.66, wps=549398, ups=1.11, wpb=495967, bsz=16123.8, num_updates=25800, lr=0.00039375, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=23881 epoch 016: 505 / 1689 loss=3.751, nll_loss=2.221, ppl=4.66, wps=549398, ups=1.11, wpb=495967, bsz=16123.8, num_updates=25800, lr=0.00039375, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=23881 epoch 016: 505 / 1689 loss=3.751, nll_loss=2.221, ppl=4.66, wps=549398, ups=1.11, wpb=495967, bsz=16123.8, num_updates=25800, lr=0.00039375, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=23881 epoch 016: 505 / 1689 loss=3.751, nll_loss=2.221, ppl=4.66, wps=549398, ups=1.11, wpb=495967, bsz=16123.8, num_updates=25800, lr=0.00039375, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=23881 epoch 016: 505 / 1689 loss=3.751, nll_loss=2.221, ppl=4.66, wps=549398, ups=1.11, wpb=495967, bsz=16123.8, num_updates=25800, lr=0.00039375, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=23881 epoch 016: 505 / 1689 loss=3.751, nll_loss=2.221, ppl=4.66, wps=549398, ups=1.11, wpb=495967, bsz=16123.8, num_updates=25800, lr=0.00039375, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=23881 epoch 016: 505 / 1689 loss=3.751, nll_loss=2.221, ppl=4.66, wps=549398, ups=1.11, wpb=495967, bsz=16123.8, num_updates=25800, lr=0.00039375, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=23881 epoch 016: 505 / 1689 loss=3.751, nll_loss=2.221, ppl=4.66, wps=549398, ups=1.11, wpb=495967, bsz=16123.8, num_updates=25800, lr=0.00039375, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=23881 epoch 016: 505 / 1689 loss=3.751, nll_loss=2.221, ppl=4.66, wps=549398, ups=1.11, wpb=495967, bsz=16123.8, num_updates=25800, lr=0.00039375, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=23881 epoch 016: 505 / 1689 loss=3.751, nll_loss=2.221, ppl=4.66, wps=549398, ups=1.11, wpb=495967, bsz=16123.8, num_updates=25800, lr=0.00039375, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=23881 epoch 016: 505 / 1689 loss=3.751, nll_loss=2.221, ppl=4.66, wps=549398, ups=1.11, wpb=495967, bsz=16123.8, num_updates=25800, lr=0.00039375, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=23881 epoch 016: 505 / 1689 loss=3.751, nll_loss=2.221, ppl=4.66, wps=549398, ups=1.11, wpb=495967, bsz=16123.8, num_updates=25800, lr=0.00039375, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=23881 epoch 016: 605 / 1689 loss=3.746, nll_loss=2.216, ppl=4.64, wps=547294, ups=1.11, wpb=495159, bsz=16584.2, num_updates=25900, lr=0.000392989, gnorm=0.181, clip=0, loss_scale=2, train_wall=89, gb_free=20.4, wall=23971 epoch 016: 605 / 1689 loss=3.746, nll_loss=2.216, ppl=4.64, wps=547294, ups=1.11, wpb=495159, bsz=16584.2, num_updates=25900, lr=0.000392989, gnorm=0.181, clip=0, loss_scale=2, train_wall=89, gb_free=20.4, wall=23971 epoch 016: 605 / 1689 loss=3.746, nll_loss=2.216, ppl=4.64, wps=547294, ups=1.11, wpb=495159, bsz=16584.2, num_updates=25900, lr=0.000392989, gnorm=0.181, clip=0, loss_scale=2, train_wall=89, gb_free=20.4, wall=23971 epoch 016: 605 / 1689 loss=3.746, nll_loss=2.216, ppl=4.64, wps=547294, ups=1.11, wpb=495159, bsz=16584.2, num_updates=25900, lr=0.000392989, gnorm=0.181, clip=0, loss_scale=2, train_wall=89, gb_free=20.4, wall=23971 epoch 016: 605 / 1689 loss=3.746, nll_loss=2.216, ppl=4.64, wps=547294, ups=1.11, wpb=495159, bsz=16584.2, num_updates=25900, lr=0.000392989, gnorm=0.181, clip=0, loss_scale=2, train_wall=89, gb_free=20.4, wall=23971 epoch 016: 605 / 1689 loss=3.746, nll_loss=2.216, ppl=4.64, wps=547294, ups=1.11, wpb=495159, bsz=16584.2, num_updates=25900, lr=0.000392989, gnorm=0.181, clip=0, loss_scale=2, train_wall=89, gb_free=20.4, wall=23971 epoch 016: 605 / 1689 loss=3.746, nll_loss=2.216, ppl=4.64, wps=547294, ups=1.11, wpb=495159, bsz=16584.2, num_updates=25900, lr=0.000392989, gnorm=0.181, clip=0, loss_scale=2, train_wall=89, gb_free=20.4, wall=23971 epoch 016: 605 / 1689 loss=3.746, nll_loss=2.216, ppl=4.64, wps=547294, ups=1.11, wpb=495159, bsz=16584.2, num_updates=25900, lr=0.000392989, gnorm=0.181, clip=0, loss_scale=2, train_wall=89, gb_free=20.4, wall=23971 epoch 016: 605 / 1689 loss=3.746, nll_loss=2.216, ppl=4.64, wps=547294, ups=1.11, wpb=495159, bsz=16584.2, num_updates=25900, lr=0.000392989, gnorm=0.181, clip=0, loss_scale=2, train_wall=89, gb_free=20.4, wall=23971 epoch 016: 605 / 1689 loss=3.746, nll_loss=2.216, ppl=4.64, wps=547294, ups=1.11, wpb=495159, bsz=16584.2, num_updates=25900, lr=0.000392989, gnorm=0.181, clip=0, loss_scale=2, train_wall=89, gb_free=20.4, wall=23971 epoch 016: 605 / 1689 loss=3.746, nll_loss=2.216, ppl=4.64, wps=547294, ups=1.11, wpb=495159, bsz=16584.2, num_updates=25900, lr=0.000392989, gnorm=0.181, clip=0, loss_scale=2, train_wall=89, gb_free=20.4, wall=23971 epoch 016: 605 / 1689 loss=3.746, nll_loss=2.216, ppl=4.64, wps=547294, ups=1.11, wpb=495159, bsz=16584.2, num_updates=25900, lr=0.000392989, gnorm=0.181, clip=0, loss_scale=2, train_wall=89, gb_free=20.4, wall=23971 epoch 016: 605 / 1689 loss=3.746, nll_loss=2.216, ppl=4.64, wps=547294, ups=1.11, wpb=495159, bsz=16584.2, num_updates=25900, lr=0.000392989, gnorm=0.181, clip=0, loss_scale=2, train_wall=89, gb_free=20.4, wall=23971 epoch 016: 605 / 1689 loss=3.746, nll_loss=2.216, ppl=4.64, wps=547294, ups=1.11, wpb=495159, bsz=16584.2, num_updates=25900, lr=0.000392989, gnorm=0.181, clip=0, loss_scale=2, train_wall=89, gb_free=20.4, wall=23971 epoch 016: 605 / 1689 loss=3.746, nll_loss=2.216, ppl=4.64, wps=547294, ups=1.11, wpb=495159, bsz=16584.2, num_updates=25900, lr=0.000392989, gnorm=0.181, clip=0, loss_scale=2, train_wall=89, gb_free=20.4, wall=23971 epoch 016: 605 / 1689 loss=3.746, nll_loss=2.216, ppl=4.64, wps=547294, ups=1.11, wpb=495159, bsz=16584.2, num_updates=25900, lr=0.000392989, gnorm=0.181, clip=0, loss_scale=2, train_wall=89, gb_free=20.4, wall=23971 epoch 016: 705 / 1689 loss=3.75, nll_loss=2.22, ppl=4.66, wps=545270, ups=1.1, wpb=493974, bsz=16717.8, num_updates=26000, lr=0.000392232, gnorm=0.188, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=24062 epoch 016: 705 / 1689 loss=3.75, nll_loss=2.22, ppl=4.66, wps=545270, ups=1.1, wpb=493974, bsz=16717.8, num_updates=26000, lr=0.000392232, gnorm=0.188, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=24062 epoch 016: 705 / 1689 loss=3.75, nll_loss=2.22, ppl=4.66, wps=545270, ups=1.1, wpb=493974, bsz=16717.8, num_updates=26000, lr=0.000392232, gnorm=0.188, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=24062 epoch 016: 705 / 1689 loss=3.75, nll_loss=2.22, ppl=4.66, wps=545270, ups=1.1, wpb=493974, bsz=16717.8, num_updates=26000, lr=0.000392232, gnorm=0.188, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=24062 epoch 016: 705 / 1689 loss=3.75, nll_loss=2.22, ppl=4.66, wps=545270, ups=1.1, wpb=493974, bsz=16717.8, num_updates=26000, lr=0.000392232, gnorm=0.188, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=24062 epoch 016: 705 / 1689 loss=3.75, nll_loss=2.22, ppl=4.66, wps=545270, ups=1.1, wpb=493974, bsz=16717.8, num_updates=26000, lr=0.000392232, gnorm=0.188, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=24062 epoch 016: 705 / 1689 loss=3.75, nll_loss=2.22, ppl=4.66, wps=545270, ups=1.1, wpb=493974, bsz=16717.8, num_updates=26000, lr=0.000392232, gnorm=0.188, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=24062 epoch 016: 705 / 1689 loss=3.75, nll_loss=2.22, ppl=4.66, wps=545270, ups=1.1, wpb=493974, bsz=16717.8, num_updates=26000, lr=0.000392232, gnorm=0.188, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=24062 epoch 016: 705 / 1689 loss=3.75, nll_loss=2.22, ppl=4.66, wps=545270, ups=1.1, wpb=493974, bsz=16717.8, num_updates=26000, lr=0.000392232, gnorm=0.188, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=24062 epoch 016: 705 / 1689 loss=3.75, nll_loss=2.22, ppl=4.66, wps=545270, ups=1.1, wpb=493974, bsz=16717.8, num_updates=26000, lr=0.000392232, gnorm=0.188, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=24062 epoch 016: 705 / 1689 loss=3.75, nll_loss=2.22, ppl=4.66, wps=545270, ups=1.1, wpb=493974, bsz=16717.8, num_updates=26000, lr=0.000392232, gnorm=0.188, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=24062 epoch 016: 705 / 1689 loss=3.75, nll_loss=2.22, ppl=4.66, wps=545270, ups=1.1, wpb=493974, bsz=16717.8, num_updates=26000, lr=0.000392232, gnorm=0.188, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=24062 epoch 016: 705 / 1689 loss=3.75, nll_loss=2.22, ppl=4.66, wps=545270, ups=1.1, wpb=493974, bsz=16717.8, num_updates=26000, lr=0.000392232, gnorm=0.188, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=24062 epoch 016: 705 / 1689 loss=3.75, nll_loss=2.22, ppl=4.66, wps=545270, ups=1.1, wpb=493974, bsz=16717.8, num_updates=26000, lr=0.000392232, gnorm=0.188, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=24062 epoch 016: 705 / 1689 loss=3.75, nll_loss=2.22, ppl=4.66, wps=545270, ups=1.1, wpb=493974, bsz=16717.8, num_updates=26000, lr=0.000392232, gnorm=0.188, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=24062 epoch 016: 705 / 1689 loss=3.75, nll_loss=2.22, ppl=4.66, wps=545270, ups=1.1, wpb=493974, bsz=16717.8, num_updates=26000, lr=0.000392232, gnorm=0.188, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=24062 begin validation on "valid" subset epoch 016 | valid on 'valid' subset | loss 3.764 | nll_loss 2.194 | ppl 4.58 | wps 0 | wpb 44526 | bsz 2008 | num_updates 26000 | best_loss 3.753 epoch 016 | valid on 'valid' subset | loss 3.764 | nll_loss 2.194 | ppl 4.58 | wps 0 | wpb 44526 | bsz 2008 | num_updates 26000 | best_loss 3.753 epoch 016 | valid on 'valid' subset | loss 3.764 | nll_loss 2.194 | ppl 4.58 | wps 0 | wpb 44526 | bsz 2008 | num_updates 26000 | best_loss 3.753 epoch 016 | valid on 'valid' subset | loss 3.764 | nll_loss 2.194 | ppl 4.58 | wps 0 | wpb 44526 | bsz 2008 | num_updates 26000 | best_loss 3.753 epoch 016 | valid on 'valid' subset | loss 3.764 | nll_loss 2.194 | ppl 4.58 | wps 0 | wpb 44526 | bsz 2008 | num_updates 26000 | best_loss 3.753 epoch 016 | valid on 'valid' subset | loss 3.764 | nll_loss 2.194 | ppl 4.58 | wps 0 | wpb 44526 | bsz 2008 | num_updates 26000 | best_loss 3.753 epoch 016 | valid on 'valid' subset | loss 3.764 | nll_loss 2.194 | ppl 4.58 | wps 0 | wpb 44526 | bsz 2008 | num_updates 26000 | best_loss 3.753 epoch 016 | valid on 'valid' subset | loss 3.764 | nll_loss 2.194 | ppl 4.58 | wps 0 | wpb 44526 | bsz 2008 | num_updates 26000 | best_loss 3.753 epoch 016 | valid on 'valid' subset | loss 3.764 | nll_loss 2.194 | ppl 4.58 | wps 0 | wpb 44526 | bsz 2008 | num_updates 26000 | best_loss 3.753 epoch 016 | valid on 'valid' subset | loss 3.764 | nll_loss 2.194 | ppl 4.58 | wps 0 | wpb 44526 | bsz 2008 | num_updates 26000 | best_loss 3.753 epoch 016 | valid on 'valid' subset | loss 3.764 | nll_loss 2.194 | ppl 4.58 | wps 0 | wpb 44526 | bsz 2008 | num_updates 26000 | best_loss 3.753 epoch 016 | valid on 'valid' subset | loss 3.764 | nll_loss 2.194 | ppl 4.58 | wps 0 | wpb 44526 | bsz 2008 | num_updates 26000 | best_loss 3.753 epoch 016 | valid on 'valid' subset | loss 3.764 | nll_loss 2.194 | ppl 4.58 | wps 0 | wpb 44526 | bsz 2008 | num_updates 26000 | best_loss 3.753 epoch 016 | valid on 'valid' subset | loss 3.764 | nll_loss 2.194 | ppl 4.58 | wps 0 | wpb 44526 | bsz 2008 | num_updates 26000 | best_loss 3.753 epoch 016 | valid on 'valid' subset | loss 3.764 | nll_loss 2.194 | ppl 4.58 | wps 0 | wpb 44526 | bsz 2008 | num_updates 26000 | best_loss 3.753 epoch 016 | valid on 'valid' subset | loss 3.764 | nll_loss 2.194 | ppl 4.58 | wps 0 | wpb 44526 | bsz 2008 | num_updates 26000 | best_loss 3.753 epoch 016: 806 / 1689 loss=3.751, nll_loss=2.221, ppl=4.66, wps=483143, ups=0.98, wpb=494180, bsz=16560.4, num_updates=26100, lr=0.00039148, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=24164 epoch 016: 806 / 1689 loss=3.751, nll_loss=2.221, ppl=4.66, wps=483143, ups=0.98, wpb=494180, bsz=16560.4, num_updates=26100, lr=0.00039148, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=24164 epoch 016: 806 / 1689 loss=3.751, nll_loss=2.221, ppl=4.66, wps=483143, ups=0.98, wpb=494180, bsz=16560.4, num_updates=26100, lr=0.00039148, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=24164 epoch 016: 806 / 1689 loss=3.751, nll_loss=2.221, ppl=4.66, wps=483143, ups=0.98, wpb=494180, bsz=16560.4, num_updates=26100, lr=0.00039148, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=24164 epoch 016: 806 / 1689 loss=3.751, nll_loss=2.221, ppl=4.66, wps=483143, ups=0.98, wpb=494180, bsz=16560.4, num_updates=26100, lr=0.00039148, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=24164 epoch 016: 806 / 1689 loss=3.751, nll_loss=2.221, ppl=4.66, wps=483143, ups=0.98, wpb=494180, bsz=16560.4, num_updates=26100, lr=0.00039148, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=24164 epoch 016: 806 / 1689 loss=3.751, nll_loss=2.221, ppl=4.66, wps=483143, ups=0.98, wpb=494180, bsz=16560.4, num_updates=26100, lr=0.00039148, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=24164 epoch 016: 806 / 1689 loss=3.751, nll_loss=2.221, ppl=4.66, wps=483143, ups=0.98, wpb=494180, bsz=16560.4, num_updates=26100, lr=0.00039148, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=24164 epoch 016: 806 / 1689 loss=3.751, nll_loss=2.221, ppl=4.66, wps=483143, ups=0.98, wpb=494180, bsz=16560.4, num_updates=26100, lr=0.00039148, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=24164 epoch 016: 806 / 1689 loss=3.751, nll_loss=2.221, ppl=4.66, wps=483143, ups=0.98, wpb=494180, bsz=16560.4, num_updates=26100, lr=0.00039148, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=24164 epoch 016: 806 / 1689 loss=3.751, nll_loss=2.221, ppl=4.66, wps=483143, ups=0.98, wpb=494180, bsz=16560.4, num_updates=26100, lr=0.00039148, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=24164 epoch 016: 806 / 1689 loss=3.751, nll_loss=2.221, ppl=4.66, wps=483143, ups=0.98, wpb=494180, bsz=16560.4, num_updates=26100, lr=0.00039148, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=24164 epoch 016: 806 / 1689 loss=3.751, nll_loss=2.221, ppl=4.66, wps=483143, ups=0.98, wpb=494180, bsz=16560.4, num_updates=26100, lr=0.00039148, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=24164 epoch 016: 806 / 1689 loss=3.751, nll_loss=2.221, ppl=4.66, wps=483143, ups=0.98, wpb=494180, bsz=16560.4, num_updates=26100, lr=0.00039148, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=24164 epoch 016: 806 / 1689 loss=3.751, nll_loss=2.221, ppl=4.66, wps=483143, ups=0.98, wpb=494180, bsz=16560.4, num_updates=26100, lr=0.00039148, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=24164 epoch 016: 806 / 1689 loss=3.751, nll_loss=2.221, ppl=4.66, wps=483143, ups=0.98, wpb=494180, bsz=16560.4, num_updates=26100, lr=0.00039148, gnorm=0.183, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=24164 epoch 016: 906 / 1689 loss=3.752, nll_loss=2.222, ppl=4.67, wps=548332, ups=1.11, wpb=496020, bsz=16671, num_updates=26200, lr=0.000390732, gnorm=0.185, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=24254 epoch 016: 906 / 1689 loss=3.752, nll_loss=2.222, ppl=4.67, wps=548332, ups=1.11, wpb=496020, bsz=16671, num_updates=26200, lr=0.000390732, gnorm=0.185, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=24254 epoch 016: 906 / 1689 loss=3.752, nll_loss=2.222, ppl=4.67, wps=548332, ups=1.11, wpb=496020, bsz=16671, num_updates=26200, lr=0.000390732, gnorm=0.185, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=24254 epoch 016: 906 / 1689 loss=3.752, nll_loss=2.222, ppl=4.67, wps=548332, ups=1.11, wpb=496020, bsz=16671, num_updates=26200, lr=0.000390732, gnorm=0.185, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=24254 epoch 016: 906 / 1689 loss=3.752, nll_loss=2.222, ppl=4.67, wps=548332, ups=1.11, wpb=496020, bsz=16671, num_updates=26200, lr=0.000390732, gnorm=0.185, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=24254 epoch 016: 906 / 1689 loss=3.752, nll_loss=2.222, ppl=4.67, wps=548332, ups=1.11, wpb=496020, bsz=16671, num_updates=26200, lr=0.000390732, gnorm=0.185, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=24254 epoch 016: 906 / 1689 loss=3.752, nll_loss=2.222, ppl=4.67, wps=548332, ups=1.11, wpb=496020, bsz=16671, num_updates=26200, lr=0.000390732, gnorm=0.185, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=24254 epoch 016: 906 / 1689 loss=3.752, nll_loss=2.222, ppl=4.67, wps=548332, ups=1.11, wpb=496020, bsz=16671, num_updates=26200, lr=0.000390732, gnorm=0.185, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=24254 epoch 016: 906 / 1689 loss=3.752, nll_loss=2.222, ppl=4.67, wps=548332, ups=1.11, wpb=496020, bsz=16671, num_updates=26200, lr=0.000390732, gnorm=0.185, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=24254 epoch 016: 906 / 1689 loss=3.752, nll_loss=2.222, ppl=4.67, wps=548332, ups=1.11, wpb=496020, bsz=16671, num_updates=26200, lr=0.000390732, gnorm=0.185, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=24254 epoch 016: 906 / 1689 loss=3.752, nll_loss=2.222, ppl=4.67, wps=548332, ups=1.11, wpb=496020, bsz=16671, num_updates=26200, lr=0.000390732, gnorm=0.185, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=24254 epoch 016: 906 / 1689 loss=3.752, nll_loss=2.222, ppl=4.67, wps=548332, ups=1.11, wpb=496020, bsz=16671, num_updates=26200, lr=0.000390732, gnorm=0.185, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=24254 epoch 016: 906 / 1689 loss=3.752, nll_loss=2.222, ppl=4.67, wps=548332, ups=1.11, wpb=496020, bsz=16671, num_updates=26200, lr=0.000390732, gnorm=0.185, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=24254 epoch 016: 906 / 1689 loss=3.752, nll_loss=2.222, ppl=4.67, wps=548332, ups=1.11, wpb=496020, bsz=16671, num_updates=26200, lr=0.000390732, gnorm=0.185, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=24254 epoch 016: 906 / 1689 loss=3.752, nll_loss=2.222, ppl=4.67, wps=548332, ups=1.11, wpb=496020, bsz=16671, num_updates=26200, lr=0.000390732, gnorm=0.185, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=24254 epoch 016: 906 / 1689 loss=3.752, nll_loss=2.222, ppl=4.67, wps=548332, ups=1.11, wpb=496020, bsz=16671, num_updates=26200, lr=0.000390732, gnorm=0.185, clip=0, loss_scale=2, train_wall=89, gb_free=22, wall=24254 epoch 016: 1006 / 1689 loss=3.749, nll_loss=2.219, ppl=4.66, wps=551606, ups=1.11, wpb=495537, bsz=16293.8, num_updates=26300, lr=0.000389989, gnorm=0.178, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=24344 epoch 016: 1006 / 1689 loss=3.749, nll_loss=2.219, ppl=4.66, wps=551606, ups=1.11, wpb=495537, bsz=16293.8, num_updates=26300, lr=0.000389989, gnorm=0.178, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=24344 epoch 016: 1006 / 1689 loss=3.749, nll_loss=2.219, ppl=4.66, wps=551606, ups=1.11, wpb=495537, bsz=16293.8, num_updates=26300, lr=0.000389989, gnorm=0.178, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=24344 epoch 016: 1006 / 1689 loss=3.749, nll_loss=2.219, ppl=4.66, wps=551606, ups=1.11, wpb=495537, bsz=16293.8, num_updates=26300, lr=0.000389989, gnorm=0.178, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=24344 epoch 016: 1006 / 1689 loss=3.749, nll_loss=2.219, ppl=4.66, wps=551606, ups=1.11, wpb=495537, bsz=16293.8, num_updates=26300, lr=0.000389989, gnorm=0.178, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=24344 epoch 016: 1006 / 1689 loss=3.749, nll_loss=2.219, ppl=4.66, wps=551606, ups=1.11, wpb=495537, bsz=16293.8, num_updates=26300, lr=0.000389989, gnorm=0.178, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=24344 epoch 016: 1006 / 1689 loss=3.749, nll_loss=2.219, ppl=4.66, wps=551606, ups=1.11, wpb=495537, bsz=16293.8, num_updates=26300, lr=0.000389989, gnorm=0.178, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=24344 epoch 016: 1006 / 1689 loss=3.749, nll_loss=2.219, ppl=4.66, wps=551606, ups=1.11, wpb=495537, bsz=16293.8, num_updates=26300, lr=0.000389989, gnorm=0.178, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=24344 epoch 016: 1006 / 1689 loss=3.749, nll_loss=2.219, ppl=4.66, wps=551606, ups=1.11, wpb=495537, bsz=16293.8, num_updates=26300, lr=0.000389989, gnorm=0.178, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=24344 epoch 016: 1006 / 1689 loss=3.749, nll_loss=2.219, ppl=4.66, wps=551606, ups=1.11, wpb=495537, bsz=16293.8, num_updates=26300, lr=0.000389989, gnorm=0.178, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=24344 epoch 016: 1006 / 1689 loss=3.749, nll_loss=2.219, ppl=4.66, wps=551606, ups=1.11, wpb=495537, bsz=16293.8, num_updates=26300, lr=0.000389989, gnorm=0.178, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=24344 epoch 016: 1006 / 1689 loss=3.749, nll_loss=2.219, ppl=4.66, wps=551606, ups=1.11, wpb=495537, bsz=16293.8, num_updates=26300, lr=0.000389989, gnorm=0.178, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=24344 epoch 016: 1006 / 1689 loss=3.749, nll_loss=2.219, ppl=4.66, wps=551606, ups=1.11, wpb=495537, bsz=16293.8, num_updates=26300, lr=0.000389989, gnorm=0.178, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=24344 epoch 016: 1006 / 1689 loss=3.749, nll_loss=2.219, ppl=4.66, wps=551606, ups=1.11, wpb=495537, bsz=16293.8, num_updates=26300, lr=0.000389989, gnorm=0.178, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=24344 epoch 016: 1006 / 1689 loss=3.749, nll_loss=2.219, ppl=4.66, wps=551606, ups=1.11, wpb=495537, bsz=16293.8, num_updates=26300, lr=0.000389989, gnorm=0.178, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=24344 epoch 016: 1006 / 1689 loss=3.749, nll_loss=2.219, ppl=4.66, wps=551606, ups=1.11, wpb=495537, bsz=16293.8, num_updates=26300, lr=0.000389989, gnorm=0.178, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=24344 epoch 016: 1107 / 1689 loss=3.749, nll_loss=2.219, ppl=4.66, wps=544761, ups=1.1, wpb=496474, bsz=16558.6, num_updates=26400, lr=0.000389249, gnorm=0.181, clip=0, loss_scale=1, train_wall=90, gb_free=21.8, wall=24435 epoch 016: 1107 / 1689 loss=3.749, nll_loss=2.219, ppl=4.66, wps=544761, ups=1.1, wpb=496474, bsz=16558.6, num_updates=26400, lr=0.000389249, gnorm=0.181, clip=0, loss_scale=1, train_wall=90, gb_free=21.8, wall=24435 epoch 016: 1107 / 1689 loss=3.749, nll_loss=2.219, ppl=4.66, wps=544761, ups=1.1, wpb=496474, bsz=16558.6, num_updates=26400, lr=0.000389249, gnorm=0.181, clip=0, loss_scale=1, train_wall=90, gb_free=21.8, wall=24435 epoch 016: 1107 / 1689 loss=3.749, nll_loss=2.219, ppl=4.66, wps=544761, ups=1.1, wpb=496474, bsz=16558.6, num_updates=26400, lr=0.000389249, gnorm=0.181, clip=0, loss_scale=1, train_wall=90, gb_free=21.8, wall=24435 epoch 016: 1107 / 1689 loss=3.749, nll_loss=2.219, ppl=4.66, wps=544761, ups=1.1, wpb=496474, bsz=16558.6, num_updates=26400, lr=0.000389249, gnorm=0.181, clip=0, loss_scale=1, train_wall=90, gb_free=21.8, wall=24435 epoch 016: 1107 / 1689 loss=3.749, nll_loss=2.219, ppl=4.66, wps=544761, ups=1.1, wpb=496474, bsz=16558.6, num_updates=26400, lr=0.000389249, gnorm=0.181, clip=0, loss_scale=1, train_wall=90, gb_free=21.8, wall=24435 epoch 016: 1107 / 1689 loss=3.749, nll_loss=2.219, ppl=4.66, wps=544761, ups=1.1, wpb=496474, bsz=16558.6, num_updates=26400, lr=0.000389249, gnorm=0.181, clip=0, loss_scale=1, train_wall=90, gb_free=21.8, wall=24435 epoch 016: 1107 / 1689 loss=3.749, nll_loss=2.219, ppl=4.66, wps=544761, ups=1.1, wpb=496474, bsz=16558.6, num_updates=26400, lr=0.000389249, gnorm=0.181, clip=0, loss_scale=1, train_wall=90, gb_free=21.8, wall=24435 epoch 016: 1107 / 1689 loss=3.749, nll_loss=2.219, ppl=4.66, wps=544761, ups=1.1, wpb=496474, bsz=16558.6, num_updates=26400, lr=0.000389249, gnorm=0.181, clip=0, loss_scale=1, train_wall=90, gb_free=21.8, wall=24435 epoch 016: 1107 / 1689 loss=3.749, nll_loss=2.219, ppl=4.66, wps=544761, ups=1.1, wpb=496474, bsz=16558.6, num_updates=26400, lr=0.000389249, gnorm=0.181, clip=0, loss_scale=1, train_wall=90, gb_free=21.8, wall=24435 epoch 016: 1107 / 1689 loss=3.749, nll_loss=2.219, ppl=4.66, wps=544761, ups=1.1, wpb=496474, bsz=16558.6, num_updates=26400, lr=0.000389249, gnorm=0.181, clip=0, loss_scale=1, train_wall=90, gb_free=21.8, wall=24435 epoch 016: 1107 / 1689 loss=3.749, nll_loss=2.219, ppl=4.66, wps=544761, ups=1.1, wpb=496474, bsz=16558.6, num_updates=26400, lr=0.000389249, gnorm=0.181, clip=0, loss_scale=1, train_wall=90, gb_free=21.8, wall=24435 epoch 016: 1107 / 1689 loss=3.749, nll_loss=2.219, ppl=4.66, wps=544761, ups=1.1, wpb=496474, bsz=16558.6, num_updates=26400, lr=0.000389249, gnorm=0.181, clip=0, loss_scale=1, train_wall=90, gb_free=21.8, wall=24435 epoch 016: 1107 / 1689 loss=3.749, nll_loss=2.219, ppl=4.66, wps=544761, ups=1.1, wpb=496474, bsz=16558.6, num_updates=26400, lr=0.000389249, gnorm=0.181, clip=0, loss_scale=1, train_wall=90, gb_free=21.8, wall=24435 epoch 016: 1107 / 1689 loss=3.749, nll_loss=2.219, ppl=4.66, wps=544761, ups=1.1, wpb=496474, bsz=16558.6, num_updates=26400, lr=0.000389249, gnorm=0.181, clip=0, loss_scale=1, train_wall=90, gb_free=21.8, wall=24435 epoch 016: 1107 / 1689 loss=3.749, nll_loss=2.219, ppl=4.66, wps=544761, ups=1.1, wpb=496474, bsz=16558.6, num_updates=26400, lr=0.000389249, gnorm=0.181, clip=0, loss_scale=1, train_wall=90, gb_free=21.8, wall=24435 epoch 016: 1207 / 1689 loss=3.751, nll_loss=2.222, ppl=4.67, wps=545008, ups=1.1, wpb=494836, bsz=17115, num_updates=26500, lr=0.000388514, gnorm=0.177, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=24526 epoch 016: 1207 / 1689 loss=3.751, nll_loss=2.222, ppl=4.67, wps=545008, ups=1.1, wpb=494836, bsz=17115, num_updates=26500, lr=0.000388514, gnorm=0.177, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=24526 epoch 016: 1207 / 1689 loss=3.751, nll_loss=2.222, ppl=4.67, wps=545008, ups=1.1, wpb=494836, bsz=17115, num_updates=26500, lr=0.000388514, gnorm=0.177, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=24526 epoch 016: 1207 / 1689 loss=3.751, nll_loss=2.222, ppl=4.67, wps=545008, ups=1.1, wpb=494836, bsz=17115, num_updates=26500, lr=0.000388514, gnorm=0.177, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=24526 epoch 016: 1207 / 1689 loss=3.751, nll_loss=2.222, ppl=4.67, wps=545008, ups=1.1, wpb=494836, bsz=17115, num_updates=26500, lr=0.000388514, gnorm=0.177, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=24526 epoch 016: 1207 / 1689 loss=3.751, nll_loss=2.222, ppl=4.67, wps=545008, ups=1.1, wpb=494836, bsz=17115, num_updates=26500, lr=0.000388514, gnorm=0.177, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=24526 epoch 016: 1207 / 1689 loss=3.751, nll_loss=2.222, ppl=4.67, wps=545008, ups=1.1, wpb=494836, bsz=17115, num_updates=26500, lr=0.000388514, gnorm=0.177, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=24526 epoch 016: 1207 / 1689 loss=3.751, nll_loss=2.222, ppl=4.67, wps=545008, ups=1.1, wpb=494836, bsz=17115, num_updates=26500, lr=0.000388514, gnorm=0.177, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=24526 epoch 016: 1207 / 1689 loss=3.751, nll_loss=2.222, ppl=4.67, wps=545008, ups=1.1, wpb=494836, bsz=17115, num_updates=26500, lr=0.000388514, gnorm=0.177, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=24526 epoch 016: 1207 / 1689 loss=3.751, nll_loss=2.222, ppl=4.67, wps=545008, ups=1.1, wpb=494836, bsz=17115, num_updates=26500, lr=0.000388514, gnorm=0.177, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=24526 epoch 016: 1207 / 1689 loss=3.751, nll_loss=2.222, ppl=4.67, wps=545008, ups=1.1, wpb=494836, bsz=17115, num_updates=26500, lr=0.000388514, gnorm=0.177, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=24526 epoch 016: 1207 / 1689 loss=3.751, nll_loss=2.222, ppl=4.67, wps=545008, ups=1.1, wpb=494836, bsz=17115, num_updates=26500, lr=0.000388514, gnorm=0.177, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=24526 epoch 016: 1207 / 1689 loss=3.751, nll_loss=2.222, ppl=4.67, wps=545008, ups=1.1, wpb=494836, bsz=17115, num_updates=26500, lr=0.000388514, gnorm=0.177, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=24526 epoch 016: 1207 / 1689 loss=3.751, nll_loss=2.222, ppl=4.67, wps=545008, ups=1.1, wpb=494836, bsz=17115, num_updates=26500, lr=0.000388514, gnorm=0.177, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=24526 epoch 016: 1207 / 1689 loss=3.751, nll_loss=2.222, ppl=4.67, wps=545008, ups=1.1, wpb=494836, bsz=17115, num_updates=26500, lr=0.000388514, gnorm=0.177, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=24526 epoch 016: 1207 / 1689 loss=3.751, nll_loss=2.222, ppl=4.67, wps=545008, ups=1.1, wpb=494836, bsz=17115, num_updates=26500, lr=0.000388514, gnorm=0.177, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=24526 epoch 016: 1307 / 1689 loss=3.75, nll_loss=2.22, ppl=4.66, wps=544722, ups=1.1, wpb=496356, bsz=16694.7, num_updates=26600, lr=0.000387783, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=24617 epoch 016: 1307 / 1689 loss=3.75, nll_loss=2.22, ppl=4.66, wps=544722, ups=1.1, wpb=496356, bsz=16694.7, num_updates=26600, lr=0.000387783, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=24617 epoch 016: 1307 / 1689 loss=3.75, nll_loss=2.22, ppl=4.66, wps=544722, ups=1.1, wpb=496356, bsz=16694.7, num_updates=26600, lr=0.000387783, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=24617 epoch 016: 1307 / 1689 loss=3.75, nll_loss=2.22, ppl=4.66, wps=544722, ups=1.1, wpb=496356, bsz=16694.7, num_updates=26600, lr=0.000387783, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=24617 epoch 016: 1307 / 1689 loss=3.75, nll_loss=2.22, ppl=4.66, wps=544722, ups=1.1, wpb=496356, bsz=16694.7, num_updates=26600, lr=0.000387783, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=24617 epoch 016: 1307 / 1689 loss=3.75, nll_loss=2.22, ppl=4.66, wps=544722, ups=1.1, wpb=496356, bsz=16694.7, num_updates=26600, lr=0.000387783, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=24617 epoch 016: 1307 / 1689 loss=3.75, nll_loss=2.22, ppl=4.66, wps=544722, ups=1.1, wpb=496356, bsz=16694.7, num_updates=26600, lr=0.000387783, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=24617 epoch 016: 1307 / 1689 loss=3.75, nll_loss=2.22, ppl=4.66, wps=544722, ups=1.1, wpb=496356, bsz=16694.7, num_updates=26600, lr=0.000387783, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=24617 epoch 016: 1307 / 1689 loss=3.75, nll_loss=2.22, ppl=4.66, wps=544722, ups=1.1, wpb=496356, bsz=16694.7, num_updates=26600, lr=0.000387783, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=24617 epoch 016: 1307 / 1689 loss=3.75, nll_loss=2.22, ppl=4.66, wps=544722, ups=1.1, wpb=496356, bsz=16694.7, num_updates=26600, lr=0.000387783, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=24617 epoch 016: 1307 / 1689 loss=3.75, nll_loss=2.22, ppl=4.66, wps=544722, ups=1.1, wpb=496356, bsz=16694.7, num_updates=26600, lr=0.000387783, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=24617 epoch 016: 1307 / 1689 loss=3.75, nll_loss=2.22, ppl=4.66, wps=544722, ups=1.1, wpb=496356, bsz=16694.7, num_updates=26600, lr=0.000387783, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=24617 epoch 016: 1307 / 1689 loss=3.75, nll_loss=2.22, ppl=4.66, wps=544722, ups=1.1, wpb=496356, bsz=16694.7, num_updates=26600, lr=0.000387783, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=24617 epoch 016: 1307 / 1689 loss=3.75, nll_loss=2.22, ppl=4.66, wps=544722, ups=1.1, wpb=496356, bsz=16694.7, num_updates=26600, lr=0.000387783, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=24617 epoch 016: 1307 / 1689 loss=3.75, nll_loss=2.22, ppl=4.66, wps=544722, ups=1.1, wpb=496356, bsz=16694.7, num_updates=26600, lr=0.000387783, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=24617 epoch 016: 1307 / 1689 loss=3.75, nll_loss=2.22, ppl=4.66, wps=544722, ups=1.1, wpb=496356, bsz=16694.7, num_updates=26600, lr=0.000387783, gnorm=0.186, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=24617 epoch 016: 1407 / 1689 loss=3.748, nll_loss=2.218, ppl=4.65, wps=552559, ups=1.12, wpb=495269, bsz=16465.6, num_updates=26700, lr=0.000387056, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=24707 epoch 016: 1407 / 1689 loss=3.748, nll_loss=2.218, ppl=4.65, wps=552559, ups=1.12, wpb=495269, bsz=16465.6, num_updates=26700, lr=0.000387056, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=24707 epoch 016: 1407 / 1689 loss=3.748, nll_loss=2.218, ppl=4.65, wps=552559, ups=1.12, wpb=495269, bsz=16465.6, num_updates=26700, lr=0.000387056, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=24707 epoch 016: 1407 / 1689 loss=3.748, nll_loss=2.218, ppl=4.65, wps=552559, ups=1.12, wpb=495269, bsz=16465.6, num_updates=26700, lr=0.000387056, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=24707 epoch 016: 1407 / 1689 loss=3.748, nll_loss=2.218, ppl=4.65, wps=552559, ups=1.12, wpb=495269, bsz=16465.6, num_updates=26700, lr=0.000387056, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=24707 epoch 016: 1407 / 1689 loss=3.748, nll_loss=2.218, ppl=4.65, wps=552559, ups=1.12, wpb=495269, bsz=16465.6, num_updates=26700, lr=0.000387056, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=24707 epoch 016: 1407 / 1689 loss=3.748, nll_loss=2.218, ppl=4.65, wps=552559, ups=1.12, wpb=495269, bsz=16465.6, num_updates=26700, lr=0.000387056, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=24707 epoch 016: 1407 / 1689 loss=3.748, nll_loss=2.218, ppl=4.65, wps=552559, ups=1.12, wpb=495269, bsz=16465.6, num_updates=26700, lr=0.000387056, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=24707 epoch 016: 1407 / 1689 loss=3.748, nll_loss=2.218, ppl=4.65, wps=552559, ups=1.12, wpb=495269, bsz=16465.6, num_updates=26700, lr=0.000387056, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=24707 epoch 016: 1407 / 1689 loss=3.748, nll_loss=2.218, ppl=4.65, wps=552559, ups=1.12, wpb=495269, bsz=16465.6, num_updates=26700, lr=0.000387056, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=24707 epoch 016: 1407 / 1689 loss=3.748, nll_loss=2.218, ppl=4.65, wps=552559, ups=1.12, wpb=495269, bsz=16465.6, num_updates=26700, lr=0.000387056, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=24707 epoch 016: 1407 / 1689 loss=3.748, nll_loss=2.218, ppl=4.65, wps=552559, ups=1.12, wpb=495269, bsz=16465.6, num_updates=26700, lr=0.000387056, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=24707 epoch 016: 1407 / 1689 loss=3.748, nll_loss=2.218, ppl=4.65, wps=552559, ups=1.12, wpb=495269, bsz=16465.6, num_updates=26700, lr=0.000387056, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=24707 epoch 016: 1407 / 1689 loss=3.748, nll_loss=2.218, ppl=4.65, wps=552559, ups=1.12, wpb=495269, bsz=16465.6, num_updates=26700, lr=0.000387056, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=24707 epoch 016: 1407 / 1689 loss=3.748, nll_loss=2.218, ppl=4.65, wps=552559, ups=1.12, wpb=495269, bsz=16465.6, num_updates=26700, lr=0.000387056, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=24707 epoch 016: 1407 / 1689 loss=3.748, nll_loss=2.218, ppl=4.65, wps=552559, ups=1.12, wpb=495269, bsz=16465.6, num_updates=26700, lr=0.000387056, gnorm=0.189, clip=0, loss_scale=1, train_wall=88, gb_free=21.9, wall=24707 epoch 016: 1507 / 1689 loss=3.751, nll_loss=2.222, ppl=4.66, wps=550072, ups=1.11, wpb=495387, bsz=16447.7, num_updates=26800, lr=0.000386334, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=24797 epoch 016: 1507 / 1689 loss=3.751, nll_loss=2.222, ppl=4.66, wps=550072, ups=1.11, wpb=495387, bsz=16447.7, num_updates=26800, lr=0.000386334, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=24797 epoch 016: 1507 / 1689 loss=3.751, nll_loss=2.222, ppl=4.66, wps=550072, ups=1.11, wpb=495387, bsz=16447.7, num_updates=26800, lr=0.000386334, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=24797 epoch 016: 1507 / 1689 loss=3.751, nll_loss=2.222, ppl=4.66, wps=550072, ups=1.11, wpb=495387, bsz=16447.7, num_updates=26800, lr=0.000386334, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=24797 epoch 016: 1507 / 1689 loss=3.751, nll_loss=2.222, ppl=4.66, wps=550072, ups=1.11, wpb=495387, bsz=16447.7, num_updates=26800, lr=0.000386334, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=24797 epoch 016: 1507 / 1689 loss=3.751, nll_loss=2.222, ppl=4.66, wps=550072, ups=1.11, wpb=495387, bsz=16447.7, num_updates=26800, lr=0.000386334, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=24797 epoch 016: 1507 / 1689 loss=3.751, nll_loss=2.222, ppl=4.66, wps=550072, ups=1.11, wpb=495387, bsz=16447.7, num_updates=26800, lr=0.000386334, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=24797 epoch 016: 1507 / 1689 loss=3.751, nll_loss=2.222, ppl=4.66, wps=550072, ups=1.11, wpb=495387, bsz=16447.7, num_updates=26800, lr=0.000386334, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=24797 epoch 016: 1507 / 1689 loss=3.751, nll_loss=2.222, ppl=4.66, wps=550072, ups=1.11, wpb=495387, bsz=16447.7, num_updates=26800, lr=0.000386334, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=24797 epoch 016: 1507 / 1689 loss=3.751, nll_loss=2.222, ppl=4.66, wps=550072, ups=1.11, wpb=495387, bsz=16447.7, num_updates=26800, lr=0.000386334, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=24797 epoch 016: 1507 / 1689 loss=3.751, nll_loss=2.222, ppl=4.66, wps=550072, ups=1.11, wpb=495387, bsz=16447.7, num_updates=26800, lr=0.000386334, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=24797 epoch 016: 1507 / 1689 loss=3.751, nll_loss=2.222, ppl=4.66, wps=550072, ups=1.11, wpb=495387, bsz=16447.7, num_updates=26800, lr=0.000386334, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=24797 epoch 016: 1507 / 1689 loss=3.751, nll_loss=2.222, ppl=4.66, wps=550072, ups=1.11, wpb=495387, bsz=16447.7, num_updates=26800, lr=0.000386334, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=24797 epoch 016: 1507 / 1689 loss=3.751, nll_loss=2.222, ppl=4.66, wps=550072, ups=1.11, wpb=495387, bsz=16447.7, num_updates=26800, lr=0.000386334, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=24797 epoch 016: 1507 / 1689 loss=3.751, nll_loss=2.222, ppl=4.66, wps=550072, ups=1.11, wpb=495387, bsz=16447.7, num_updates=26800, lr=0.000386334, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=24797 epoch 016: 1507 / 1689 loss=3.751, nll_loss=2.222, ppl=4.66, wps=550072, ups=1.11, wpb=495387, bsz=16447.7, num_updates=26800, lr=0.000386334, gnorm=0.179, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=24797 epoch 016: 1607 / 1689 loss=3.751, nll_loss=2.222, ppl=4.67, wps=556016, ups=1.12, wpb=496636, bsz=16313.9, num_updates=26900, lr=0.000385615, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=24886 epoch 016: 1607 / 1689 loss=3.751, nll_loss=2.222, ppl=4.67, wps=556016, ups=1.12, wpb=496636, bsz=16313.9, num_updates=26900, lr=0.000385615, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=24886 epoch 016: 1607 / 1689 loss=3.751, nll_loss=2.222, ppl=4.67, wps=556016, ups=1.12, wpb=496636, bsz=16313.9, num_updates=26900, lr=0.000385615, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=24886 epoch 016: 1607 / 1689 loss=3.751, nll_loss=2.222, ppl=4.67, wps=556016, ups=1.12, wpb=496636, bsz=16313.9, num_updates=26900, lr=0.000385615, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=24886 epoch 016: 1607 / 1689 loss=3.751, nll_loss=2.222, ppl=4.67, wps=556016, ups=1.12, wpb=496636, bsz=16313.9, num_updates=26900, lr=0.000385615, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=24886 epoch 016: 1607 / 1689 loss=3.751, nll_loss=2.222, ppl=4.67, wps=556016, ups=1.12, wpb=496636, bsz=16313.9, num_updates=26900, lr=0.000385615, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=24886 epoch 016: 1607 / 1689 loss=3.751, nll_loss=2.222, ppl=4.67, wps=556016, ups=1.12, wpb=496636, bsz=16313.9, num_updates=26900, lr=0.000385615, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=24886 epoch 016: 1607 / 1689 loss=3.751, nll_loss=2.222, ppl=4.67, wps=556016, ups=1.12, wpb=496636, bsz=16313.9, num_updates=26900, lr=0.000385615, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=24886 epoch 016: 1607 / 1689 loss=3.751, nll_loss=2.222, ppl=4.67, wps=556016, ups=1.12, wpb=496636, bsz=16313.9, num_updates=26900, lr=0.000385615, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=24886 epoch 016: 1607 / 1689 loss=3.751, nll_loss=2.222, ppl=4.67, wps=556016, ups=1.12, wpb=496636, bsz=16313.9, num_updates=26900, lr=0.000385615, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=24886 epoch 016: 1607 / 1689 loss=3.751, nll_loss=2.222, ppl=4.67, wps=556016, ups=1.12, wpb=496636, bsz=16313.9, num_updates=26900, lr=0.000385615, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=24886 epoch 016: 1607 / 1689 loss=3.751, nll_loss=2.222, ppl=4.67, wps=556016, ups=1.12, wpb=496636, bsz=16313.9, num_updates=26900, lr=0.000385615, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=24886 epoch 016: 1607 / 1689 loss=3.751, nll_loss=2.222, ppl=4.67, wps=556016, ups=1.12, wpb=496636, bsz=16313.9, num_updates=26900, lr=0.000385615, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=24886 epoch 016: 1607 / 1689 loss=3.751, nll_loss=2.222, ppl=4.67, wps=556016, ups=1.12, wpb=496636, bsz=16313.9, num_updates=26900, lr=0.000385615, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=24886 epoch 016: 1607 / 1689 loss=3.751, nll_loss=2.222, ppl=4.67, wps=556016, ups=1.12, wpb=496636, bsz=16313.9, num_updates=26900, lr=0.000385615, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=24886 epoch 016: 1607 / 1689 loss=3.751, nll_loss=2.222, ppl=4.67, wps=556016, ups=1.12, wpb=496636, bsz=16313.9, num_updates=26900, lr=0.000385615, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=24886 end of epoch 16 (average epoch stats below) epoch 016 | loss 3.749 | nll_loss 2.219 | ppl 4.66 | wps 544039 | ups 1.1 | wpb 495119 | bsz 16505.7 | num_updates 26982 | lr 0.000385029 | gnorm 0.184 | clip 0 | loss_scale 2 | train_wall 1495 | gb_free 23.1 | wall 24959 epoch 016 | loss 3.749 | nll_loss 2.219 | ppl 4.66 | wps 544039 | ups 1.1 | wpb 495119 | bsz 16505.7 | num_updates 26982 | lr 0.000385029 | gnorm 0.184 | clip 0 | loss_scale 2 | train_wall 1495 | gb_free 23.1 | wall 24959 epoch 016 | loss 3.749 | nll_loss 2.219 | ppl 4.66 | wps 544039 | ups 1.1 | wpb 495119 | bsz 16505.7 | num_updates 26982 | lr 0.000385029 | gnorm 0.184 | clip 0 | loss_scale 2 | train_wall 1495 | gb_free 23.1 | wall 24959 epoch 016 | loss 3.749 | nll_loss 2.219 | ppl 4.66 | wps 544039 | ups 1.1 | wpb 495119 | bsz 16505.7 | num_updates 26982 | lr 0.000385029 | gnorm 0.184 | clip 0 | loss_scale 2 | train_wall 1495 | gb_free 23.1 | wall 24959 epoch 016 | loss 3.749 | nll_loss 2.219 | ppl 4.66 | wps 544039 | ups 1.1 | wpb 495119 | bsz 16505.7 | num_updates 26982 | lr 0.000385029 | gnorm 0.184 | clip 0 | loss_scale 2 | train_wall 1495 | gb_free 23.1 | wall 24959 epoch 016 | loss 3.749 | nll_loss 2.219 | ppl 4.66 | wps 544039 | ups 1.1 | wpb 495119 | bsz 16505.7 | num_updates 26982 | lr 0.000385029 | gnorm 0.184 | clip 0 | loss_scale 2 | train_wall 1495 | gb_free 23.1 | wall 24959 epoch 016 | loss 3.749 | nll_loss 2.219 | ppl 4.66 | wps 544039 | ups 1.1 | wpb 495119 | bsz 16505.7 | num_updates 26982 | lr 0.000385029 | gnorm 0.184 | clip 0 | loss_scale 2 | train_wall 1495 | gb_free 23.1 | wall 24959 epoch 016 | loss 3.749 | nll_loss 2.219 | ppl 4.66 | wps 544039 | ups 1.1 | wpb 495119 | bsz 16505.7 | num_updates 26982 | lr 0.000385029 | gnorm 0.184 | clip 0 | loss_scale 2 | train_wall 1495 | gb_free 23.1 | wall 24959 epoch 016 | loss 3.749 | nll_loss 2.219 | ppl 4.66 | wps 544039 | ups 1.1 | wpb 495119 | bsz 16505.7 | num_updates 26982 | lr 0.000385029 | gnorm 0.184 | clip 0 | loss_scale 2 | train_wall 1495 | gb_free 23.1 | wall 24959 epoch 016 | loss 3.749 | nll_loss 2.219 | ppl 4.66 | wps 544039 | ups 1.1 | wpb 495119 | bsz 16505.7 | num_updates 26982 | lr 0.000385029 | gnorm 0.184 | clip 0 | loss_scale 2 | train_wall 1495 | gb_free 23.1 | wall 24959 epoch 016 | loss 3.749 | nll_loss 2.219 | ppl 4.66 | wps 544039 | ups 1.1 | wpb 495119 | bsz 16505.7 | num_updates 26982 | lr 0.000385029 | gnorm 0.184 | clip 0 | loss_scale 2 | train_wall 1495 | gb_free 23.1 | wall 24959 epoch 016 | loss 3.749 | nll_loss 2.219 | ppl 4.66 | wps 544039 | ups 1.1 | wpb 495119 | bsz 16505.7 | num_updates 26982 | lr 0.000385029 | gnorm 0.184 | clip 0 | loss_scale 2 | train_wall 1495 | gb_free 23.1 | wall 24959 epoch 016 | loss 3.749 | nll_loss 2.219 | ppl 4.66 | wps 544039 | ups 1.1 | wpb 495119 | bsz 16505.7 | num_updates 26982 | lr 0.000385029 | gnorm 0.184 | clip 0 | loss_scale 2 | train_wall 1495 | gb_free 23.1 | wall 24959 epoch 016 | loss 3.749 | nll_loss 2.219 | ppl 4.66 | wps 544039 | ups 1.1 | wpb 495119 | bsz 16505.7 | num_updates 26982 | lr 0.000385029 | gnorm 0.184 | clip 0 | loss_scale 2 | train_wall 1495 | gb_free 23.1 | wall 24959 epoch 016 | loss 3.749 | nll_loss 2.219 | ppl 4.66 | wps 544039 | ups 1.1 | wpb 495119 | bsz 16505.7 | num_updates 26982 | lr 0.000385029 | gnorm 0.184 | clip 0 | loss_scale 2 | train_wall 1495 | gb_free 23.1 | wall 24959 epoch 016 | loss 3.749 | nll_loss 2.219 | ppl 4.66 | wps 544039 | ups 1.1 | wpb 495119 | bsz 16505.7 | num_updates 26982 | lr 0.000385029 | gnorm 0.184 | clip 0 | loss_scale 2 | train_wall 1495 | gb_free 23.1 | wall 24959 Start iterating over samples epoch 017: 18 / 1689 loss=3.75, nll_loss=2.221, ppl=4.66, wps=541737, ups=1.1, wpb=490691, bsz=16612.4, num_updates=27000, lr=0.0003849, gnorm=0.187, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=24977 epoch 017: 18 / 1689 loss=3.75, nll_loss=2.221, ppl=4.66, wps=541737, ups=1.1, wpb=490691, bsz=16612.4, num_updates=27000, lr=0.0003849, gnorm=0.187, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=24977 epoch 017: 18 / 1689 loss=3.75, nll_loss=2.221, ppl=4.66, wps=541737, ups=1.1, wpb=490691, bsz=16612.4, num_updates=27000, lr=0.0003849, gnorm=0.187, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=24977 epoch 017: 18 / 1689 loss=3.75, nll_loss=2.221, ppl=4.66, wps=541737, ups=1.1, wpb=490691, bsz=16612.4, num_updates=27000, lr=0.0003849, gnorm=0.187, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=24977 epoch 017: 18 / 1689 loss=3.75, nll_loss=2.221, ppl=4.66, wps=541737, ups=1.1, wpb=490691, bsz=16612.4, num_updates=27000, lr=0.0003849, gnorm=0.187, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=24977 epoch 017: 18 / 1689 loss=3.75, nll_loss=2.221, ppl=4.66, wps=541737, ups=1.1, wpb=490691, bsz=16612.4, num_updates=27000, lr=0.0003849, gnorm=0.187, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=24977 epoch 017: 18 / 1689 loss=3.75, nll_loss=2.221, ppl=4.66, wps=541737, ups=1.1, wpb=490691, bsz=16612.4, num_updates=27000, lr=0.0003849, gnorm=0.187, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=24977 epoch 017: 18 / 1689 loss=3.75, nll_loss=2.221, ppl=4.66, wps=541737, ups=1.1, wpb=490691, bsz=16612.4, num_updates=27000, lr=0.0003849, gnorm=0.187, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=24977 epoch 017: 18 / 1689 loss=3.75, nll_loss=2.221, ppl=4.66, wps=541737, ups=1.1, wpb=490691, bsz=16612.4, num_updates=27000, lr=0.0003849, gnorm=0.187, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=24977 epoch 017: 18 / 1689 loss=3.75, nll_loss=2.221, ppl=4.66, wps=541737, ups=1.1, wpb=490691, bsz=16612.4, num_updates=27000, lr=0.0003849, gnorm=0.187, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=24977 epoch 017: 18 / 1689 loss=3.75, nll_loss=2.221, ppl=4.66, wps=541737, ups=1.1, wpb=490691, bsz=16612.4, num_updates=27000, lr=0.0003849, gnorm=0.187, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=24977 epoch 017: 18 / 1689 loss=3.75, nll_loss=2.221, ppl=4.66, wps=541737, ups=1.1, wpb=490691, bsz=16612.4, num_updates=27000, lr=0.0003849, gnorm=0.187, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=24977 epoch 017: 18 / 1689 loss=3.75, nll_loss=2.221, ppl=4.66, wps=541737, ups=1.1, wpb=490691, bsz=16612.4, num_updates=27000, lr=0.0003849, gnorm=0.187, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=24977 epoch 017: 18 / 1689 loss=3.75, nll_loss=2.221, ppl=4.66, wps=541737, ups=1.1, wpb=490691, bsz=16612.4, num_updates=27000, lr=0.0003849, gnorm=0.187, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=24977 epoch 017: 18 / 1689 loss=3.75, nll_loss=2.221, ppl=4.66, wps=541737, ups=1.1, wpb=490691, bsz=16612.4, num_updates=27000, lr=0.0003849, gnorm=0.187, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=24977 epoch 017: 18 / 1689 loss=3.75, nll_loss=2.221, ppl=4.66, wps=541737, ups=1.1, wpb=490691, bsz=16612.4, num_updates=27000, lr=0.0003849, gnorm=0.187, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=24977 epoch 017: 18 / 1689 loss=3.75, nll_loss=2.221, ppl=4.66, wps=541737, ups=1.1, wpb=490691, bsz=16612.4, num_updates=27000, lr=0.0003849, gnorm=0.187, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=24977 begin validation on "valid" subset epoch 017 | valid on 'valid' subset | loss 3.759 | nll_loss 2.191 | ppl 4.57 | wps 0 | wpb 44526 | bsz 2008 | num_updates 27000 | best_loss 3.753 epoch 017 | valid on 'valid' subset | loss 3.759 | nll_loss 2.191 | ppl 4.57 | wps 0 | wpb 44526 | bsz 2008 | num_updates 27000 | best_loss 3.753 epoch 017 | valid on 'valid' subset | loss 3.759 | nll_loss 2.191 | ppl 4.57 | wps 0 | wpb 44526 | bsz 2008 | num_updates 27000 | best_loss 3.753 epoch 017 | valid on 'valid' subset | loss 3.759 | nll_loss 2.191 | ppl 4.57 | wps 0 | wpb 44526 | bsz 2008 | num_updates 27000 | best_loss 3.753 epoch 017 | valid on 'valid' subset | loss 3.759 | nll_loss 2.191 | ppl 4.57 | wps 0 | wpb 44526 | bsz 2008 | num_updates 27000 | best_loss 3.753 epoch 017 | valid on 'valid' subset | loss 3.759 | nll_loss 2.191 | ppl 4.57 | wps 0 | wpb 44526 | bsz 2008 | num_updates 27000 | best_loss 3.753 epoch 017 | valid on 'valid' subset | loss 3.759 | nll_loss 2.191 | ppl 4.57 | wps 0 | wpb 44526 | bsz 2008 | num_updates 27000 | best_loss 3.753 epoch 017 | valid on 'valid' subset | loss 3.759 | nll_loss 2.191 | ppl 4.57 | wps 0 | wpb 44526 | bsz 2008 | num_updates 27000 | best_loss 3.753 epoch 017 | valid on 'valid' subset | loss 3.759 | nll_loss 2.191 | ppl 4.57 | wps 0 | wpb 44526 | bsz 2008 | num_updates 27000 | best_loss 3.753 epoch 017 | valid on 'valid' subset | loss 3.759 | nll_loss 2.191 | ppl 4.57 | wps 0 | wpb 44526 | bsz 2008 | num_updates 27000 | best_loss 3.753 epoch 017 | valid on 'valid' subset | loss 3.759 | nll_loss 2.191 | ppl 4.57 | wps 0 | wpb 44526 | bsz 2008 | num_updates 27000 | best_loss 3.753 epoch 017 | valid on 'valid' subset | loss 3.759 | nll_loss 2.191 | ppl 4.57 | wps 0 | wpb 44526 | bsz 2008 | num_updates 27000 | best_loss 3.753 epoch 017 | valid on 'valid' subset | loss 3.759 | nll_loss 2.191 | ppl 4.57 | wps 0 | wpb 44526 | bsz 2008 | num_updates 27000 | best_loss 3.753 epoch 017 | valid on 'valid' subset | loss 3.759 | nll_loss 2.191 | ppl 4.57 | wps 0 | wpb 44526 | bsz 2008 | num_updates 27000 | best_loss 3.753 epoch 017 | valid on 'valid' subset | loss 3.759 | nll_loss 2.191 | ppl 4.57 | wps 0 | wpb 44526 | bsz 2008 | num_updates 27000 | best_loss 3.753 epoch 017 | valid on 'valid' subset | loss 3.759 | nll_loss 2.191 | ppl 4.57 | wps 0 | wpb 44526 | bsz 2008 | num_updates 27000 | best_loss 3.753 epoch 017 | valid on 'valid' subset | loss 3.759 | nll_loss 2.191 | ppl 4.57 | wps 0 | wpb 44526 | bsz 2008 | num_updates 27000 | best_loss 3.753 epoch 017: 118 / 1689 loss=3.737, nll_loss=2.206, ppl=4.61, wps=482148, ups=0.97, wpb=496080, bsz=16491.1, num_updates=27100, lr=0.000384189, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=25080 epoch 017: 118 / 1689 loss=3.737, nll_loss=2.206, ppl=4.61, wps=482148, ups=0.97, wpb=496080, bsz=16491.1, num_updates=27100, lr=0.000384189, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=25080 epoch 017: 118 / 1689 loss=3.737, nll_loss=2.206, ppl=4.61, wps=482148, ups=0.97, wpb=496080, bsz=16491.1, num_updates=27100, lr=0.000384189, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=25080 epoch 017: 118 / 1689 loss=3.737, nll_loss=2.206, ppl=4.61, wps=482148, ups=0.97, wpb=496080, bsz=16491.1, num_updates=27100, lr=0.000384189, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=25080 epoch 017: 118 / 1689 loss=3.737, nll_loss=2.206, ppl=4.61, wps=482148, ups=0.97, wpb=496080, bsz=16491.1, num_updates=27100, lr=0.000384189, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=25080 epoch 017: 118 / 1689 loss=3.737, nll_loss=2.206, ppl=4.61, wps=482148, ups=0.97, wpb=496080, bsz=16491.1, num_updates=27100, lr=0.000384189, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=25080 epoch 017: 118 / 1689 loss=3.737, nll_loss=2.206, ppl=4.61, wps=482148, ups=0.97, wpb=496080, bsz=16491.1, num_updates=27100, lr=0.000384189, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=25080 epoch 017: 118 / 1689 loss=3.737, nll_loss=2.206, ppl=4.61, wps=482148, ups=0.97, wpb=496080, bsz=16491.1, num_updates=27100, lr=0.000384189, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=25080 epoch 017: 118 / 1689 loss=3.737, nll_loss=2.206, ppl=4.61, wps=482148, ups=0.97, wpb=496080, bsz=16491.1, num_updates=27100, lr=0.000384189, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=25080 epoch 017: 118 / 1689 loss=3.737, nll_loss=2.206, ppl=4.61, wps=482148, ups=0.97, wpb=496080, bsz=16491.1, num_updates=27100, lr=0.000384189, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=25080 epoch 017: 118 / 1689 loss=3.737, nll_loss=2.206, ppl=4.61, wps=482148, ups=0.97, wpb=496080, bsz=16491.1, num_updates=27100, lr=0.000384189, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=25080 epoch 017: 118 / 1689 loss=3.737, nll_loss=2.206, ppl=4.61, wps=482148, ups=0.97, wpb=496080, bsz=16491.1, num_updates=27100, lr=0.000384189, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=25080 epoch 017: 118 / 1689 loss=3.737, nll_loss=2.206, ppl=4.61, wps=482148, ups=0.97, wpb=496080, bsz=16491.1, num_updates=27100, lr=0.000384189, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=25080 epoch 017: 118 / 1689 loss=3.737, nll_loss=2.206, ppl=4.61, wps=482148, ups=0.97, wpb=496080, bsz=16491.1, num_updates=27100, lr=0.000384189, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=25080 epoch 017: 118 / 1689 loss=3.737, nll_loss=2.206, ppl=4.61, wps=482148, ups=0.97, wpb=496080, bsz=16491.1, num_updates=27100, lr=0.000384189, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=25080 epoch 017: 118 / 1689 loss=3.737, nll_loss=2.206, ppl=4.61, wps=482148, ups=0.97, wpb=496080, bsz=16491.1, num_updates=27100, lr=0.000384189, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=25080 epoch 017: 118 / 1689 loss=3.737, nll_loss=2.206, ppl=4.61, wps=482148, ups=0.97, wpb=496080, bsz=16491.1, num_updates=27100, lr=0.000384189, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.1, wall=25080 epoch 017: 218 / 1689 loss=3.741, nll_loss=2.21, ppl=4.63, wps=551516, ups=1.11, wpb=495141, bsz=16344.7, num_updates=27200, lr=0.000383482, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=25169 epoch 017: 218 / 1689 loss=3.741, nll_loss=2.21, ppl=4.63, wps=551516, ups=1.11, wpb=495141, bsz=16344.7, num_updates=27200, lr=0.000383482, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=25169 epoch 017: 218 / 1689 loss=3.741, nll_loss=2.21, ppl=4.63, wps=551516, ups=1.11, wpb=495141, bsz=16344.7, num_updates=27200, lr=0.000383482, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=25169 epoch 017: 218 / 1689 loss=3.741, nll_loss=2.21, ppl=4.63, wps=551516, ups=1.11, wpb=495141, bsz=16344.7, num_updates=27200, lr=0.000383482, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=25169 epoch 017: 218 / 1689 loss=3.741, nll_loss=2.21, ppl=4.63, wps=551516, ups=1.11, wpb=495141, bsz=16344.7, num_updates=27200, lr=0.000383482, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=25169 epoch 017: 218 / 1689 loss=3.741, nll_loss=2.21, ppl=4.63, wps=551516, ups=1.11, wpb=495141, bsz=16344.7, num_updates=27200, lr=0.000383482, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=25169 epoch 017: 218 / 1689 loss=3.741, nll_loss=2.21, ppl=4.63, wps=551516, ups=1.11, wpb=495141, bsz=16344.7, num_updates=27200, lr=0.000383482, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=25169 epoch 017: 218 / 1689 loss=3.741, nll_loss=2.21, ppl=4.63, wps=551516, ups=1.11, wpb=495141, bsz=16344.7, num_updates=27200, lr=0.000383482, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=25169 epoch 017: 218 / 1689 loss=3.741, nll_loss=2.21, ppl=4.63, wps=551516, ups=1.11, wpb=495141, bsz=16344.7, num_updates=27200, lr=0.000383482, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=25169 epoch 017: 218 / 1689 loss=3.741, nll_loss=2.21, ppl=4.63, wps=551516, ups=1.11, wpb=495141, bsz=16344.7, num_updates=27200, lr=0.000383482, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=25169 epoch 017: 218 / 1689 loss=3.741, nll_loss=2.21, ppl=4.63, wps=551516, ups=1.11, wpb=495141, bsz=16344.7, num_updates=27200, lr=0.000383482, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=25169 epoch 017: 218 / 1689 loss=3.741, nll_loss=2.21, ppl=4.63, wps=551516, ups=1.11, wpb=495141, bsz=16344.7, num_updates=27200, lr=0.000383482, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=25169 epoch 017: 218 / 1689 loss=3.741, nll_loss=2.21, ppl=4.63, wps=551516, ups=1.11, wpb=495141, bsz=16344.7, num_updates=27200, lr=0.000383482, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=25169 epoch 017: 218 / 1689 loss=3.741, nll_loss=2.21, ppl=4.63, wps=551516, ups=1.11, wpb=495141, bsz=16344.7, num_updates=27200, lr=0.000383482, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=25169 epoch 017: 218 / 1689 loss=3.741, nll_loss=2.21, ppl=4.63, wps=551516, ups=1.11, wpb=495141, bsz=16344.7, num_updates=27200, lr=0.000383482, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=25169 epoch 017: 218 / 1689 loss=3.741, nll_loss=2.21, ppl=4.63, wps=551516, ups=1.11, wpb=495141, bsz=16344.7, num_updates=27200, lr=0.000383482, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=25169 epoch 017: 218 / 1689 loss=3.741, nll_loss=2.21, ppl=4.63, wps=551516, ups=1.11, wpb=495141, bsz=16344.7, num_updates=27200, lr=0.000383482, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=25169 epoch 017: 318 / 1689 loss=3.733, nll_loss=2.202, ppl=4.6, wps=546336, ups=1.11, wpb=494354, bsz=16529.4, num_updates=27300, lr=0.00038278, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=25260 epoch 017: 318 / 1689 loss=3.733, nll_loss=2.202, ppl=4.6, wps=546336, ups=1.11, wpb=494354, bsz=16529.4, num_updates=27300, lr=0.00038278, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=25260 epoch 017: 318 / 1689 loss=3.733, nll_loss=2.202, ppl=4.6, wps=546336, ups=1.11, wpb=494354, bsz=16529.4, num_updates=27300, lr=0.00038278, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=25260 epoch 017: 318 / 1689 loss=3.733, nll_loss=2.202, ppl=4.6, wps=546336, ups=1.11, wpb=494354, bsz=16529.4, num_updates=27300, lr=0.00038278, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=25260 epoch 017: 318 / 1689 loss=3.733, nll_loss=2.202, ppl=4.6, wps=546336, ups=1.11, wpb=494354, bsz=16529.4, num_updates=27300, lr=0.00038278, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=25260 epoch 017: 318 / 1689 loss=3.733, nll_loss=2.202, ppl=4.6, wps=546336, ups=1.11, wpb=494354, bsz=16529.4, num_updates=27300, lr=0.00038278, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=25260 epoch 017: 318 / 1689 loss=3.733, nll_loss=2.202, ppl=4.6, wps=546336, ups=1.11, wpb=494354, bsz=16529.4, num_updates=27300, lr=0.00038278, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=25260 epoch 017: 318 / 1689 loss=3.733, nll_loss=2.202, ppl=4.6, wps=546336, ups=1.11, wpb=494354, bsz=16529.4, num_updates=27300, lr=0.00038278, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=25260 epoch 017: 318 / 1689 loss=3.733, nll_loss=2.202, ppl=4.6, wps=546336, ups=1.11, wpb=494354, bsz=16529.4, num_updates=27300, lr=0.00038278, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=25260 epoch 017: 318 / 1689 loss=3.733, nll_loss=2.202, ppl=4.6, wps=546336, ups=1.11, wpb=494354, bsz=16529.4, num_updates=27300, lr=0.00038278, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=25260 epoch 017: 318 / 1689 loss=3.733, nll_loss=2.202, ppl=4.6, wps=546336, ups=1.11, wpb=494354, bsz=16529.4, num_updates=27300, lr=0.00038278, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=25260 epoch 017: 318 / 1689 loss=3.733, nll_loss=2.202, ppl=4.6, wps=546336, ups=1.11, wpb=494354, bsz=16529.4, num_updates=27300, lr=0.00038278, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=25260 epoch 017: 318 / 1689 loss=3.733, nll_loss=2.202, ppl=4.6, wps=546336, ups=1.11, wpb=494354, bsz=16529.4, num_updates=27300, lr=0.00038278, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=25260 epoch 017: 318 / 1689 loss=3.733, nll_loss=2.202, ppl=4.6, wps=546336, ups=1.11, wpb=494354, bsz=16529.4, num_updates=27300, lr=0.00038278, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=25260 epoch 017: 318 / 1689 loss=3.733, nll_loss=2.202, ppl=4.6, wps=546336, ups=1.11, wpb=494354, bsz=16529.4, num_updates=27300, lr=0.00038278, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=25260 epoch 017: 318 / 1689 loss=3.733, nll_loss=2.202, ppl=4.6, wps=546336, ups=1.11, wpb=494354, bsz=16529.4, num_updates=27300, lr=0.00038278, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=25260 epoch 017: 318 / 1689 loss=3.733, nll_loss=2.202, ppl=4.6, wps=546336, ups=1.11, wpb=494354, bsz=16529.4, num_updates=27300, lr=0.00038278, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=25260 epoch 017: 418 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=550148, ups=1.11, wpb=495941, bsz=16605, num_updates=27400, lr=0.00038208, gnorm=0.19, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=25350 epoch 017: 418 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=550148, ups=1.11, wpb=495941, bsz=16605, num_updates=27400, lr=0.00038208, gnorm=0.19, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=25350 epoch 017: 418 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=550148, ups=1.11, wpb=495941, bsz=16605, num_updates=27400, lr=0.00038208, gnorm=0.19, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=25350 epoch 017: 418 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=550148, ups=1.11, wpb=495941, bsz=16605, num_updates=27400, lr=0.00038208, gnorm=0.19, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=25350 epoch 017: 418 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=550148, ups=1.11, wpb=495941, bsz=16605, num_updates=27400, lr=0.00038208, gnorm=0.19, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=25350 epoch 017: 418 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=550148, ups=1.11, wpb=495941, bsz=16605, num_updates=27400, lr=0.00038208, gnorm=0.19, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=25350 epoch 017: 418 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=550148, ups=1.11, wpb=495941, bsz=16605, num_updates=27400, lr=0.00038208, gnorm=0.19, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=25350 epoch 017: 418 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=550148, ups=1.11, wpb=495941, bsz=16605, num_updates=27400, lr=0.00038208, gnorm=0.19, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=25350 epoch 017: 418 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=550148, ups=1.11, wpb=495941, bsz=16605, num_updates=27400, lr=0.00038208, gnorm=0.19, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=25350 epoch 017: 418 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=550148, ups=1.11, wpb=495941, bsz=16605, num_updates=27400, lr=0.00038208, gnorm=0.19, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=25350 epoch 017: 418 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=550148, ups=1.11, wpb=495941, bsz=16605, num_updates=27400, lr=0.00038208, gnorm=0.19, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=25350 epoch 017: 418 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=550148, ups=1.11, wpb=495941, bsz=16605, num_updates=27400, lr=0.00038208, gnorm=0.19, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=25350 epoch 017: 418 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=550148, ups=1.11, wpb=495941, bsz=16605, num_updates=27400, lr=0.00038208, gnorm=0.19, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=25350 epoch 017: 418 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=550148, ups=1.11, wpb=495941, bsz=16605, num_updates=27400, lr=0.00038208, gnorm=0.19, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=25350 epoch 017: 418 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=550148, ups=1.11, wpb=495941, bsz=16605, num_updates=27400, lr=0.00038208, gnorm=0.19, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=25350 epoch 017: 418 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=550148, ups=1.11, wpb=495941, bsz=16605, num_updates=27400, lr=0.00038208, gnorm=0.19, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=25350 epoch 017: 418 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=550148, ups=1.11, wpb=495941, bsz=16605, num_updates=27400, lr=0.00038208, gnorm=0.19, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=25350 epoch 017: 518 / 1689 loss=3.748, nll_loss=2.218, ppl=4.65, wps=549125, ups=1.11, wpb=494230, bsz=16426.9, num_updates=27500, lr=0.000381385, gnorm=0.184, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=25440 epoch 017: 518 / 1689 loss=3.748, nll_loss=2.218, ppl=4.65, wps=549125, ups=1.11, wpb=494230, bsz=16426.9, num_updates=27500, lr=0.000381385, gnorm=0.184, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=25440 epoch 017: 518 / 1689 loss=3.748, nll_loss=2.218, ppl=4.65, wps=549125, ups=1.11, wpb=494230, bsz=16426.9, num_updates=27500, lr=0.000381385, gnorm=0.184, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=25440 epoch 017: 518 / 1689 loss=3.748, nll_loss=2.218, ppl=4.65, wps=549125, ups=1.11, wpb=494230, bsz=16426.9, num_updates=27500, lr=0.000381385, gnorm=0.184, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=25440 epoch 017: 518 / 1689 loss=3.748, nll_loss=2.218, ppl=4.65, wps=549125, ups=1.11, wpb=494230, bsz=16426.9, num_updates=27500, lr=0.000381385, gnorm=0.184, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=25440 epoch 017: 518 / 1689 loss=3.748, nll_loss=2.218, ppl=4.65, wps=549125, ups=1.11, wpb=494230, bsz=16426.9, num_updates=27500, lr=0.000381385, gnorm=0.184, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=25440 epoch 017: 518 / 1689 loss=3.748, nll_loss=2.218, ppl=4.65, wps=549125, ups=1.11, wpb=494230, bsz=16426.9, num_updates=27500, lr=0.000381385, gnorm=0.184, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=25440 epoch 017: 518 / 1689 loss=3.748, nll_loss=2.218, ppl=4.65, wps=549125, ups=1.11, wpb=494230, bsz=16426.9, num_updates=27500, lr=0.000381385, gnorm=0.184, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=25440 epoch 017: 518 / 1689 loss=3.748, nll_loss=2.218, ppl=4.65, wps=549125, ups=1.11, wpb=494230, bsz=16426.9, num_updates=27500, lr=0.000381385, gnorm=0.184, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=25440 epoch 017: 518 / 1689 loss=3.748, nll_loss=2.218, ppl=4.65, wps=549125, ups=1.11, wpb=494230, bsz=16426.9, num_updates=27500, lr=0.000381385, gnorm=0.184, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=25440 epoch 017: 518 / 1689 loss=3.748, nll_loss=2.218, ppl=4.65, wps=549125, ups=1.11, wpb=494230, bsz=16426.9, num_updates=27500, lr=0.000381385, gnorm=0.184, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=25440 epoch 017: 518 / 1689 loss=3.748, nll_loss=2.218, ppl=4.65, wps=549125, ups=1.11, wpb=494230, bsz=16426.9, num_updates=27500, lr=0.000381385, gnorm=0.184, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=25440 epoch 017: 518 / 1689 loss=3.748, nll_loss=2.218, ppl=4.65, wps=549125, ups=1.11, wpb=494230, bsz=16426.9, num_updates=27500, lr=0.000381385, gnorm=0.184, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=25440 epoch 017: 518 / 1689 loss=3.748, nll_loss=2.218, ppl=4.65, wps=549125, ups=1.11, wpb=494230, bsz=16426.9, num_updates=27500, lr=0.000381385, gnorm=0.184, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=25440 epoch 017: 518 / 1689 loss=3.748, nll_loss=2.218, ppl=4.65, wps=549125, ups=1.11, wpb=494230, bsz=16426.9, num_updates=27500, lr=0.000381385, gnorm=0.184, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=25440 epoch 017: 518 / 1689 loss=3.748, nll_loss=2.218, ppl=4.65, wps=549125, ups=1.11, wpb=494230, bsz=16426.9, num_updates=27500, lr=0.000381385, gnorm=0.184, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=25440 epoch 017: 518 / 1689 loss=3.748, nll_loss=2.218, ppl=4.65, wps=549125, ups=1.11, wpb=494230, bsz=16426.9, num_updates=27500, lr=0.000381385, gnorm=0.184, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=25440 epoch 017: 618 / 1689 loss=3.742, nll_loss=2.211, ppl=4.63, wps=545911, ups=1.11, wpb=493679, bsz=16239, num_updates=27600, lr=0.000380693, gnorm=0.185, clip=0, loss_scale=4, train_wall=89, gb_free=22.4, wall=25531 epoch 017: 618 / 1689 loss=3.742, nll_loss=2.211, ppl=4.63, wps=545911, ups=1.11, wpb=493679, bsz=16239, num_updates=27600, lr=0.000380693, gnorm=0.185, clip=0, loss_scale=4, train_wall=89, gb_free=22.4, wall=25531 epoch 017: 618 / 1689 loss=3.742, nll_loss=2.211, ppl=4.63, wps=545911, ups=1.11, wpb=493679, bsz=16239, num_updates=27600, lr=0.000380693, gnorm=0.185, clip=0, loss_scale=4, train_wall=89, gb_free=22.4, wall=25531 epoch 017: 618 / 1689 loss=3.742, nll_loss=2.211, ppl=4.63, wps=545911, ups=1.11, wpb=493679, bsz=16239, num_updates=27600, lr=0.000380693, gnorm=0.185, clip=0, loss_scale=4, train_wall=89, gb_free=22.4, wall=25531 epoch 017: 618 / 1689 loss=3.742, nll_loss=2.211, ppl=4.63, wps=545911, ups=1.11, wpb=493679, bsz=16239, num_updates=27600, lr=0.000380693, gnorm=0.185, clip=0, loss_scale=4, train_wall=89, gb_free=22.4, wall=25531 epoch 017: 618 / 1689 loss=3.742, nll_loss=2.211, ppl=4.63, wps=545911, ups=1.11, wpb=493679, bsz=16239, num_updates=27600, lr=0.000380693, gnorm=0.185, clip=0, loss_scale=4, train_wall=89, gb_free=22.4, wall=25531 epoch 017: 618 / 1689 loss=3.742, nll_loss=2.211, ppl=4.63, wps=545911, ups=1.11, wpb=493679, bsz=16239, num_updates=27600, lr=0.000380693, gnorm=0.185, clip=0, loss_scale=4, train_wall=89, gb_free=22.4, wall=25531 epoch 017: 618 / 1689 loss=3.742, nll_loss=2.211, ppl=4.63, wps=545911, ups=1.11, wpb=493679, bsz=16239, num_updates=27600, lr=0.000380693, gnorm=0.185, clip=0, loss_scale=4, train_wall=89, gb_free=22.4, wall=25531 epoch 017: 618 / 1689 loss=3.742, nll_loss=2.211, ppl=4.63, wps=545911, ups=1.11, wpb=493679, bsz=16239, num_updates=27600, lr=0.000380693, gnorm=0.185, clip=0, loss_scale=4, train_wall=89, gb_free=22.4, wall=25531 epoch 017: 618 / 1689 loss=3.742, nll_loss=2.211, ppl=4.63, wps=545911, ups=1.11, wpb=493679, bsz=16239, num_updates=27600, lr=0.000380693, gnorm=0.185, clip=0, loss_scale=4, train_wall=89, gb_free=22.4, wall=25531 epoch 017: 618 / 1689 loss=3.742, nll_loss=2.211, ppl=4.63, wps=545911, ups=1.11, wpb=493679, bsz=16239, num_updates=27600, lr=0.000380693, gnorm=0.185, clip=0, loss_scale=4, train_wall=89, gb_free=22.4, wall=25531 epoch 017: 618 / 1689 loss=3.742, nll_loss=2.211, ppl=4.63, wps=545911, ups=1.11, wpb=493679, bsz=16239, num_updates=27600, lr=0.000380693, gnorm=0.185, clip=0, loss_scale=4, train_wall=89, gb_free=22.4, wall=25531 epoch 017: 618 / 1689 loss=3.742, nll_loss=2.211, ppl=4.63, wps=545911, ups=1.11, wpb=493679, bsz=16239, num_updates=27600, lr=0.000380693, gnorm=0.185, clip=0, loss_scale=4, train_wall=89, gb_free=22.4, wall=25531 epoch 017: 618 / 1689 loss=3.742, nll_loss=2.211, ppl=4.63, wps=545911, ups=1.11, wpb=493679, bsz=16239, num_updates=27600, lr=0.000380693, gnorm=0.185, clip=0, loss_scale=4, train_wall=89, gb_free=22.4, wall=25531 epoch 017: 618 / 1689 loss=3.742, nll_loss=2.211, ppl=4.63, wps=545911, ups=1.11, wpb=493679, bsz=16239, num_updates=27600, lr=0.000380693, gnorm=0.185, clip=0, loss_scale=4, train_wall=89, gb_free=22.4, wall=25531 epoch 017: 618 / 1689 loss=3.742, nll_loss=2.211, ppl=4.63, wps=545911, ups=1.11, wpb=493679, bsz=16239, num_updates=27600, lr=0.000380693, gnorm=0.185, clip=0, loss_scale=4, train_wall=89, gb_free=22.4, wall=25531 epoch 017: 618 / 1689 loss=3.742, nll_loss=2.211, ppl=4.63, wps=545911, ups=1.11, wpb=493679, bsz=16239, num_updates=27600, lr=0.000380693, gnorm=0.185, clip=0, loss_scale=4, train_wall=89, gb_free=22.4, wall=25531 epoch 017: 718 / 1689 loss=3.742, nll_loss=2.211, ppl=4.63, wps=552804, ups=1.12, wpb=495536, bsz=16267.2, num_updates=27700, lr=0.000380006, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=25620 epoch 017: 718 / 1689 loss=3.742, nll_loss=2.211, ppl=4.63, wps=552804, ups=1.12, wpb=495536, bsz=16267.2, num_updates=27700, lr=0.000380006, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=25620 epoch 017: 718 / 1689 loss=3.742, nll_loss=2.211, ppl=4.63, wps=552804, ups=1.12, wpb=495536, bsz=16267.2, num_updates=27700, lr=0.000380006, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=25620 epoch 017: 718 / 1689 loss=3.742, nll_loss=2.211, ppl=4.63, wps=552804, ups=1.12, wpb=495536, bsz=16267.2, num_updates=27700, lr=0.000380006, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=25620 epoch 017: 718 / 1689 loss=3.742, nll_loss=2.211, ppl=4.63, wps=552804, ups=1.12, wpb=495536, bsz=16267.2, num_updates=27700, lr=0.000380006, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=25620 epoch 017: 718 / 1689 loss=3.742, nll_loss=2.211, ppl=4.63, wps=552804, ups=1.12, wpb=495536, bsz=16267.2, num_updates=27700, lr=0.000380006, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=25620 epoch 017: 718 / 1689 loss=3.742, nll_loss=2.211, ppl=4.63, wps=552804, ups=1.12, wpb=495536, bsz=16267.2, num_updates=27700, lr=0.000380006, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=25620 epoch 017: 718 / 1689 loss=3.742, nll_loss=2.211, ppl=4.63, wps=552804, ups=1.12, wpb=495536, bsz=16267.2, num_updates=27700, lr=0.000380006, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=25620 epoch 017: 718 / 1689 loss=3.742, nll_loss=2.211, ppl=4.63, wps=552804, ups=1.12, wpb=495536, bsz=16267.2, num_updates=27700, lr=0.000380006, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=25620 epoch 017: 718 / 1689 loss=3.742, nll_loss=2.211, ppl=4.63, wps=552804, ups=1.12, wpb=495536, bsz=16267.2, num_updates=27700, lr=0.000380006, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=25620 epoch 017: 718 / 1689 loss=3.742, nll_loss=2.211, ppl=4.63, wps=552804, ups=1.12, wpb=495536, bsz=16267.2, num_updates=27700, lr=0.000380006, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=25620 epoch 017: 718 / 1689 loss=3.742, nll_loss=2.211, ppl=4.63, wps=552804, ups=1.12, wpb=495536, bsz=16267.2, num_updates=27700, lr=0.000380006, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=25620 epoch 017: 718 / 1689 loss=3.742, nll_loss=2.211, ppl=4.63, wps=552804, ups=1.12, wpb=495536, bsz=16267.2, num_updates=27700, lr=0.000380006, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=25620 epoch 017: 718 / 1689 loss=3.742, nll_loss=2.211, ppl=4.63, wps=552804, ups=1.12, wpb=495536, bsz=16267.2, num_updates=27700, lr=0.000380006, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=25620 epoch 017: 718 / 1689 loss=3.742, nll_loss=2.211, ppl=4.63, wps=552804, ups=1.12, wpb=495536, bsz=16267.2, num_updates=27700, lr=0.000380006, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=25620 epoch 017: 718 / 1689 loss=3.742, nll_loss=2.211, ppl=4.63, wps=552804, ups=1.12, wpb=495536, bsz=16267.2, num_updates=27700, lr=0.000380006, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=25620 epoch 017: 718 / 1689 loss=3.742, nll_loss=2.211, ppl=4.63, wps=552804, ups=1.12, wpb=495536, bsz=16267.2, num_updates=27700, lr=0.000380006, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=25620 epoch 017: 818 / 1689 loss=3.741, nll_loss=2.211, ppl=4.63, wps=550432, ups=1.11, wpb=496444, bsz=17029.9, num_updates=27800, lr=0.000379322, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=25710 epoch 017: 818 / 1689 loss=3.741, nll_loss=2.211, ppl=4.63, wps=550432, ups=1.11, wpb=496444, bsz=17029.9, num_updates=27800, lr=0.000379322, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=25710 epoch 017: 818 / 1689 loss=3.741, nll_loss=2.211, ppl=4.63, wps=550432, ups=1.11, wpb=496444, bsz=17029.9, num_updates=27800, lr=0.000379322, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=25710 epoch 017: 818 / 1689 loss=3.741, nll_loss=2.211, ppl=4.63, wps=550432, ups=1.11, wpb=496444, bsz=17029.9, num_updates=27800, lr=0.000379322, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=25710 epoch 017: 818 / 1689 loss=3.741, nll_loss=2.211, ppl=4.63, wps=550432, ups=1.11, wpb=496444, bsz=17029.9, num_updates=27800, lr=0.000379322, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=25710 epoch 017: 818 / 1689 loss=3.741, nll_loss=2.211, ppl=4.63, wps=550432, ups=1.11, wpb=496444, bsz=17029.9, num_updates=27800, lr=0.000379322, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=25710 epoch 017: 818 / 1689 loss=3.741, nll_loss=2.211, ppl=4.63, wps=550432, ups=1.11, wpb=496444, bsz=17029.9, num_updates=27800, lr=0.000379322, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=25710 epoch 017: 818 / 1689 loss=3.741, nll_loss=2.211, ppl=4.63, wps=550432, ups=1.11, wpb=496444, bsz=17029.9, num_updates=27800, lr=0.000379322, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=25710 epoch 017: 818 / 1689 loss=3.741, nll_loss=2.211, ppl=4.63, wps=550432, ups=1.11, wpb=496444, bsz=17029.9, num_updates=27800, lr=0.000379322, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=25710 epoch 017: 818 / 1689 loss=3.741, nll_loss=2.211, ppl=4.63, wps=550432, ups=1.11, wpb=496444, bsz=17029.9, num_updates=27800, lr=0.000379322, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=25710 epoch 017: 818 / 1689 loss=3.741, nll_loss=2.211, ppl=4.63, wps=550432, ups=1.11, wpb=496444, bsz=17029.9, num_updates=27800, lr=0.000379322, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=25710 epoch 017: 818 / 1689 loss=3.741, nll_loss=2.211, ppl=4.63, wps=550432, ups=1.11, wpb=496444, bsz=17029.9, num_updates=27800, lr=0.000379322, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=25710 epoch 017: 818 / 1689 loss=3.741, nll_loss=2.211, ppl=4.63, wps=550432, ups=1.11, wpb=496444, bsz=17029.9, num_updates=27800, lr=0.000379322, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=25710 epoch 017: 818 / 1689 loss=3.741, nll_loss=2.211, ppl=4.63, wps=550432, ups=1.11, wpb=496444, bsz=17029.9, num_updates=27800, lr=0.000379322, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=25710 epoch 017: 818 / 1689 loss=3.741, nll_loss=2.211, ppl=4.63, wps=550432, ups=1.11, wpb=496444, bsz=17029.9, num_updates=27800, lr=0.000379322, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=25710 epoch 017: 818 / 1689 loss=3.741, nll_loss=2.211, ppl=4.63, wps=550432, ups=1.11, wpb=496444, bsz=17029.9, num_updates=27800, lr=0.000379322, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=25710 epoch 017: 818 / 1689 loss=3.741, nll_loss=2.211, ppl=4.63, wps=550432, ups=1.11, wpb=496444, bsz=17029.9, num_updates=27800, lr=0.000379322, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=25710 epoch 017: 918 / 1689 loss=3.742, nll_loss=2.212, ppl=4.63, wps=548239, ups=1.11, wpb=495619, bsz=16670, num_updates=27900, lr=0.000378641, gnorm=0.194, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=25801 epoch 017: 918 / 1689 loss=3.742, nll_loss=2.212, ppl=4.63, wps=548239, ups=1.11, wpb=495619, bsz=16670, num_updates=27900, lr=0.000378641, gnorm=0.194, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=25801 epoch 017: 918 / 1689 loss=3.742, nll_loss=2.212, ppl=4.63, wps=548239, ups=1.11, wpb=495619, bsz=16670, num_updates=27900, lr=0.000378641, gnorm=0.194, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=25801 epoch 017: 918 / 1689 loss=3.742, nll_loss=2.212, ppl=4.63, wps=548239, ups=1.11, wpb=495619, bsz=16670, num_updates=27900, lr=0.000378641, gnorm=0.194, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=25801 epoch 017: 918 / 1689 loss=3.742, nll_loss=2.212, ppl=4.63, wps=548239, ups=1.11, wpb=495619, bsz=16670, num_updates=27900, lr=0.000378641, gnorm=0.194, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=25801 epoch 017: 918 / 1689 loss=3.742, nll_loss=2.212, ppl=4.63, wps=548239, ups=1.11, wpb=495619, bsz=16670, num_updates=27900, lr=0.000378641, gnorm=0.194, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=25801 epoch 017: 918 / 1689 loss=3.742, nll_loss=2.212, ppl=4.63, wps=548239, ups=1.11, wpb=495619, bsz=16670, num_updates=27900, lr=0.000378641, gnorm=0.194, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=25801 epoch 017: 918 / 1689 loss=3.742, nll_loss=2.212, ppl=4.63, wps=548239, ups=1.11, wpb=495619, bsz=16670, num_updates=27900, lr=0.000378641, gnorm=0.194, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=25801 epoch 017: 918 / 1689 loss=3.742, nll_loss=2.212, ppl=4.63, wps=548239, ups=1.11, wpb=495619, bsz=16670, num_updates=27900, lr=0.000378641, gnorm=0.194, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=25801 epoch 017: 918 / 1689 loss=3.742, nll_loss=2.212, ppl=4.63, wps=548239, ups=1.11, wpb=495619, bsz=16670, num_updates=27900, lr=0.000378641, gnorm=0.194, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=25801 epoch 017: 918 / 1689 loss=3.742, nll_loss=2.212, ppl=4.63, wps=548239, ups=1.11, wpb=495619, bsz=16670, num_updates=27900, lr=0.000378641, gnorm=0.194, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=25801 epoch 017: 918 / 1689 loss=3.742, nll_loss=2.212, ppl=4.63, wps=548239, ups=1.11, wpb=495619, bsz=16670, num_updates=27900, lr=0.000378641, gnorm=0.194, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=25801 epoch 017: 918 / 1689 loss=3.742, nll_loss=2.212, ppl=4.63, wps=548239, ups=1.11, wpb=495619, bsz=16670, num_updates=27900, lr=0.000378641, gnorm=0.194, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=25801 epoch 017: 918 / 1689 loss=3.742, nll_loss=2.212, ppl=4.63, wps=548239, ups=1.11, wpb=495619, bsz=16670, num_updates=27900, lr=0.000378641, gnorm=0.194, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=25801 epoch 017: 918 / 1689 loss=3.742, nll_loss=2.212, ppl=4.63, wps=548239, ups=1.11, wpb=495619, bsz=16670, num_updates=27900, lr=0.000378641, gnorm=0.194, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=25801 epoch 017: 918 / 1689 loss=3.742, nll_loss=2.212, ppl=4.63, wps=548239, ups=1.11, wpb=495619, bsz=16670, num_updates=27900, lr=0.000378641, gnorm=0.194, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=25801 epoch 017: 918 / 1689 loss=3.742, nll_loss=2.212, ppl=4.63, wps=548239, ups=1.11, wpb=495619, bsz=16670, num_updates=27900, lr=0.000378641, gnorm=0.194, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=25801 epoch 017: 1019 / 1689 loss=3.746, nll_loss=2.216, ppl=4.65, wps=544277, ups=1.1, wpb=494469, bsz=16335.1, num_updates=28000, lr=0.000377964, gnorm=0.182, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=25892 epoch 017: 1019 / 1689 loss=3.746, nll_loss=2.216, ppl=4.65, wps=544277, ups=1.1, wpb=494469, bsz=16335.1, num_updates=28000, lr=0.000377964, gnorm=0.182, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=25892 epoch 017: 1019 / 1689 loss=3.746, nll_loss=2.216, ppl=4.65, wps=544277, ups=1.1, wpb=494469, bsz=16335.1, num_updates=28000, lr=0.000377964, gnorm=0.182, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=25892 epoch 017: 1019 / 1689 loss=3.746, nll_loss=2.216, ppl=4.65, wps=544277, ups=1.1, wpb=494469, bsz=16335.1, num_updates=28000, lr=0.000377964, gnorm=0.182, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=25892 epoch 017: 1019 / 1689 loss=3.746, nll_loss=2.216, ppl=4.65, wps=544277, ups=1.1, wpb=494469, bsz=16335.1, num_updates=28000, lr=0.000377964, gnorm=0.182, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=25892 epoch 017: 1019 / 1689 loss=3.746, nll_loss=2.216, ppl=4.65, wps=544277, ups=1.1, wpb=494469, bsz=16335.1, num_updates=28000, lr=0.000377964, gnorm=0.182, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=25892 epoch 017: 1019 / 1689 loss=3.746, nll_loss=2.216, ppl=4.65, wps=544277, ups=1.1, wpb=494469, bsz=16335.1, num_updates=28000, lr=0.000377964, gnorm=0.182, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=25892 epoch 017: 1019 / 1689 loss=3.746, nll_loss=2.216, ppl=4.65, wps=544277, ups=1.1, wpb=494469, bsz=16335.1, num_updates=28000, lr=0.000377964, gnorm=0.182, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=25892 epoch 017: 1019 / 1689 loss=3.746, nll_loss=2.216, ppl=4.65, wps=544277, ups=1.1, wpb=494469, bsz=16335.1, num_updates=28000, lr=0.000377964, gnorm=0.182, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=25892 epoch 017: 1019 / 1689 loss=3.746, nll_loss=2.216, ppl=4.65, wps=544277, ups=1.1, wpb=494469, bsz=16335.1, num_updates=28000, lr=0.000377964, gnorm=0.182, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=25892 epoch 017: 1019 / 1689 loss=3.746, nll_loss=2.216, ppl=4.65, wps=544277, ups=1.1, wpb=494469, bsz=16335.1, num_updates=28000, lr=0.000377964, gnorm=0.182, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=25892 epoch 017: 1019 / 1689 loss=3.746, nll_loss=2.216, ppl=4.65, wps=544277, ups=1.1, wpb=494469, bsz=16335.1, num_updates=28000, lr=0.000377964, gnorm=0.182, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=25892 epoch 017: 1019 / 1689 loss=3.746, nll_loss=2.216, ppl=4.65, wps=544277, ups=1.1, wpb=494469, bsz=16335.1, num_updates=28000, lr=0.000377964, gnorm=0.182, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=25892 epoch 017: 1019 / 1689 loss=3.746, nll_loss=2.216, ppl=4.65, wps=544277, ups=1.1, wpb=494469, bsz=16335.1, num_updates=28000, lr=0.000377964, gnorm=0.182, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=25892 epoch 017: 1019 / 1689 loss=3.746, nll_loss=2.216, ppl=4.65, wps=544277, ups=1.1, wpb=494469, bsz=16335.1, num_updates=28000, lr=0.000377964, gnorm=0.182, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=25892 epoch 017: 1019 / 1689 loss=3.746, nll_loss=2.216, ppl=4.65, wps=544277, ups=1.1, wpb=494469, bsz=16335.1, num_updates=28000, lr=0.000377964, gnorm=0.182, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=25892 epoch 017: 1019 / 1689 loss=3.746, nll_loss=2.216, ppl=4.65, wps=544277, ups=1.1, wpb=494469, bsz=16335.1, num_updates=28000, lr=0.000377964, gnorm=0.182, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=25892 begin validation on "valid" subset epoch 017 | valid on 'valid' subset | loss 3.743 | nll_loss 2.174 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 28000 | best_loss 3.743 epoch 017 | valid on 'valid' subset | loss 3.743 | nll_loss 2.174 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 28000 | best_loss 3.743 epoch 017 | valid on 'valid' subset | loss 3.743 | nll_loss 2.174 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 28000 | best_loss 3.743 epoch 017 | valid on 'valid' subset | loss 3.743 | nll_loss 2.174 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 28000 | best_loss 3.743 epoch 017 | valid on 'valid' subset | loss 3.743 | nll_loss 2.174 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 28000 | best_loss 3.743 epoch 017 | valid on 'valid' subset | loss 3.743 | nll_loss 2.174 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 28000 | best_loss 3.743 epoch 017 | valid on 'valid' subset | loss 3.743 | nll_loss 2.174 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 28000 | best_loss 3.743 epoch 017 | valid on 'valid' subset | loss 3.743 | nll_loss 2.174 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 28000 | best_loss 3.743 epoch 017 | valid on 'valid' subset | loss 3.743 | nll_loss 2.174 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 28000 | best_loss 3.743 epoch 017 | valid on 'valid' subset | loss 3.743 | nll_loss 2.174 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 28000 | best_loss 3.743 epoch 017 | valid on 'valid' subset | loss 3.743 | nll_loss 2.174 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 28000 | best_loss 3.743 epoch 017 | valid on 'valid' subset | loss 3.743 | nll_loss 2.174 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 28000 | best_loss 3.743 epoch 017 | valid on 'valid' subset | loss 3.743 | nll_loss 2.174 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 28000 | best_loss 3.743 epoch 017 | valid on 'valid' subset | loss 3.743 | nll_loss 2.174 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 28000 | best_loss 3.743 epoch 017 | valid on 'valid' subset | loss 3.743 | nll_loss 2.174 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 28000 | best_loss 3.743 epoch 017 | valid on 'valid' subset | loss 3.743 | nll_loss 2.174 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 28000 | best_loss 3.743 epoch 017 | valid on 'valid' subset | loss 3.743 | nll_loss 2.174 | ppl 4.51 | wps 0 | wpb 44526 | bsz 2008 | num_updates 28000 | best_loss 3.743 epoch 017: 1119 / 1689 loss=3.739, nll_loss=2.209, ppl=4.62, wps=405912, ups=0.82, wpb=496736, bsz=16783.2, num_updates=28100, lr=0.000377291, gnorm=0.185, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=26014 epoch 017: 1119 / 1689 loss=3.739, nll_loss=2.209, ppl=4.62, wps=405912, ups=0.82, wpb=496736, bsz=16783.2, num_updates=28100, lr=0.000377291, gnorm=0.185, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=26014 epoch 017: 1119 / 1689 loss=3.739, nll_loss=2.209, ppl=4.62, wps=405912, ups=0.82, wpb=496736, bsz=16783.2, num_updates=28100, lr=0.000377291, gnorm=0.185, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=26014 epoch 017: 1119 / 1689 loss=3.739, nll_loss=2.209, ppl=4.62, wps=405912, ups=0.82, wpb=496736, bsz=16783.2, num_updates=28100, lr=0.000377291, gnorm=0.185, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=26014 epoch 017: 1119 / 1689 loss=3.739, nll_loss=2.209, ppl=4.62, wps=405912, ups=0.82, wpb=496736, bsz=16783.2, num_updates=28100, lr=0.000377291, gnorm=0.185, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=26014 epoch 017: 1119 / 1689 loss=3.739, nll_loss=2.209, ppl=4.62, wps=405912, ups=0.82, wpb=496736, bsz=16783.2, num_updates=28100, lr=0.000377291, gnorm=0.185, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=26014 epoch 017: 1119 / 1689 loss=3.739, nll_loss=2.209, ppl=4.62, wps=405912, ups=0.82, wpb=496736, bsz=16783.2, num_updates=28100, lr=0.000377291, gnorm=0.185, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=26014 epoch 017: 1119 / 1689 loss=3.739, nll_loss=2.209, ppl=4.62, wps=405912, ups=0.82, wpb=496736, bsz=16783.2, num_updates=28100, lr=0.000377291, gnorm=0.185, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=26014 epoch 017: 1119 / 1689 loss=3.739, nll_loss=2.209, ppl=4.62, wps=405912, ups=0.82, wpb=496736, bsz=16783.2, num_updates=28100, lr=0.000377291, gnorm=0.185, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=26014 epoch 017: 1119 / 1689 loss=3.739, nll_loss=2.209, ppl=4.62, wps=405912, ups=0.82, wpb=496736, bsz=16783.2, num_updates=28100, lr=0.000377291, gnorm=0.185, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=26014 epoch 017: 1119 / 1689 loss=3.739, nll_loss=2.209, ppl=4.62, wps=405912, ups=0.82, wpb=496736, bsz=16783.2, num_updates=28100, lr=0.000377291, gnorm=0.185, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=26014 epoch 017: 1119 / 1689 loss=3.739, nll_loss=2.209, ppl=4.62, wps=405912, ups=0.82, wpb=496736, bsz=16783.2, num_updates=28100, lr=0.000377291, gnorm=0.185, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=26014 epoch 017: 1119 / 1689 loss=3.739, nll_loss=2.209, ppl=4.62, wps=405912, ups=0.82, wpb=496736, bsz=16783.2, num_updates=28100, lr=0.000377291, gnorm=0.185, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=26014 epoch 017: 1119 / 1689 loss=3.739, nll_loss=2.209, ppl=4.62, wps=405912, ups=0.82, wpb=496736, bsz=16783.2, num_updates=28100, lr=0.000377291, gnorm=0.185, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=26014 epoch 017: 1119 / 1689 loss=3.739, nll_loss=2.209, ppl=4.62, wps=405912, ups=0.82, wpb=496736, bsz=16783.2, num_updates=28100, lr=0.000377291, gnorm=0.185, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=26014 epoch 017: 1119 / 1689 loss=3.739, nll_loss=2.209, ppl=4.62, wps=405912, ups=0.82, wpb=496736, bsz=16783.2, num_updates=28100, lr=0.000377291, gnorm=0.185, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=26014 epoch 017: 1119 / 1689 loss=3.739, nll_loss=2.209, ppl=4.62, wps=405912, ups=0.82, wpb=496736, bsz=16783.2, num_updates=28100, lr=0.000377291, gnorm=0.185, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=26014 epoch 017: 1219 / 1689 loss=3.739, nll_loss=2.208, ppl=4.62, wps=558440, ups=1.13, wpb=495026, bsz=16272.9, num_updates=28200, lr=0.000376622, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=26103 epoch 017: 1219 / 1689 loss=3.739, nll_loss=2.208, ppl=4.62, wps=558440, ups=1.13, wpb=495026, bsz=16272.9, num_updates=28200, lr=0.000376622, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=26103 epoch 017: 1219 / 1689 loss=3.739, nll_loss=2.208, ppl=4.62, wps=558440, ups=1.13, wpb=495026, bsz=16272.9, num_updates=28200, lr=0.000376622, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=26103 epoch 017: 1219 / 1689 loss=3.739, nll_loss=2.208, ppl=4.62, wps=558440, ups=1.13, wpb=495026, bsz=16272.9, num_updates=28200, lr=0.000376622, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=26103 epoch 017: 1219 / 1689 loss=3.739, nll_loss=2.208, ppl=4.62, wps=558440, ups=1.13, wpb=495026, bsz=16272.9, num_updates=28200, lr=0.000376622, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=26103 epoch 017: 1219 / 1689 loss=3.739, nll_loss=2.208, ppl=4.62, wps=558440, ups=1.13, wpb=495026, bsz=16272.9, num_updates=28200, lr=0.000376622, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=26103 epoch 017: 1219 / 1689 loss=3.739, nll_loss=2.208, ppl=4.62, wps=558440, ups=1.13, wpb=495026, bsz=16272.9, num_updates=28200, lr=0.000376622, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=26103 epoch 017: 1219 / 1689 loss=3.739, nll_loss=2.208, ppl=4.62, wps=558440, ups=1.13, wpb=495026, bsz=16272.9, num_updates=28200, lr=0.000376622, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=26103 epoch 017: 1219 / 1689 loss=3.739, nll_loss=2.208, ppl=4.62, wps=558440, ups=1.13, wpb=495026, bsz=16272.9, num_updates=28200, lr=0.000376622, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=26103 epoch 017: 1219 / 1689 loss=3.739, nll_loss=2.208, ppl=4.62, wps=558440, ups=1.13, wpb=495026, bsz=16272.9, num_updates=28200, lr=0.000376622, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=26103 epoch 017: 1219 / 1689 loss=3.739, nll_loss=2.208, ppl=4.62, wps=558440, ups=1.13, wpb=495026, bsz=16272.9, num_updates=28200, lr=0.000376622, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=26103 epoch 017: 1219 / 1689 loss=3.739, nll_loss=2.208, ppl=4.62, wps=558440, ups=1.13, wpb=495026, bsz=16272.9, num_updates=28200, lr=0.000376622, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=26103 epoch 017: 1219 / 1689 loss=3.739, nll_loss=2.208, ppl=4.62, wps=558440, ups=1.13, wpb=495026, bsz=16272.9, num_updates=28200, lr=0.000376622, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=26103 epoch 017: 1219 / 1689 loss=3.739, nll_loss=2.208, ppl=4.62, wps=558440, ups=1.13, wpb=495026, bsz=16272.9, num_updates=28200, lr=0.000376622, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=26103 epoch 017: 1219 / 1689 loss=3.739, nll_loss=2.208, ppl=4.62, wps=558440, ups=1.13, wpb=495026, bsz=16272.9, num_updates=28200, lr=0.000376622, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=26103 epoch 017: 1219 / 1689 loss=3.739, nll_loss=2.208, ppl=4.62, wps=558440, ups=1.13, wpb=495026, bsz=16272.9, num_updates=28200, lr=0.000376622, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=26103 epoch 017: 1219 / 1689 loss=3.739, nll_loss=2.208, ppl=4.62, wps=558440, ups=1.13, wpb=495026, bsz=16272.9, num_updates=28200, lr=0.000376622, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=26103 epoch 017: 1319 / 1689 loss=3.741, nll_loss=2.21, ppl=4.63, wps=556140, ups=1.12, wpb=494919, bsz=16414.6, num_updates=28300, lr=0.000375956, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=20.6, wall=26192 epoch 017: 1319 / 1689 loss=3.741, nll_loss=2.21, ppl=4.63, wps=556140, ups=1.12, wpb=494919, bsz=16414.6, num_updates=28300, lr=0.000375956, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=20.6, wall=26192 epoch 017: 1319 / 1689 loss=3.741, nll_loss=2.21, ppl=4.63, wps=556140, ups=1.12, wpb=494919, bsz=16414.6, num_updates=28300, lr=0.000375956, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=20.6, wall=26192 epoch 017: 1319 / 1689 loss=3.741, nll_loss=2.21, ppl=4.63, wps=556140, ups=1.12, wpb=494919, bsz=16414.6, num_updates=28300, lr=0.000375956, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=20.6, wall=26192 epoch 017: 1319 / 1689 loss=3.741, nll_loss=2.21, ppl=4.63, wps=556140, ups=1.12, wpb=494919, bsz=16414.6, num_updates=28300, lr=0.000375956, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=20.6, wall=26192 epoch 017: 1319 / 1689 loss=3.741, nll_loss=2.21, ppl=4.63, wps=556140, ups=1.12, wpb=494919, bsz=16414.6, num_updates=28300, lr=0.000375956, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=20.6, wall=26192 epoch 017: 1319 / 1689 loss=3.741, nll_loss=2.21, ppl=4.63, wps=556140, ups=1.12, wpb=494919, bsz=16414.6, num_updates=28300, lr=0.000375956, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=20.6, wall=26192 epoch 017: 1319 / 1689 loss=3.741, nll_loss=2.21, ppl=4.63, wps=556140, ups=1.12, wpb=494919, bsz=16414.6, num_updates=28300, lr=0.000375956, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=20.6, wall=26192 epoch 017: 1319 / 1689 loss=3.741, nll_loss=2.21, ppl=4.63, wps=556140, ups=1.12, wpb=494919, bsz=16414.6, num_updates=28300, lr=0.000375956, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=20.6, wall=26192 epoch 017: 1319 / 1689 loss=3.741, nll_loss=2.21, ppl=4.63, wps=556140, ups=1.12, wpb=494919, bsz=16414.6, num_updates=28300, lr=0.000375956, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=20.6, wall=26192 epoch 017: 1319 / 1689 loss=3.741, nll_loss=2.21, ppl=4.63, wps=556140, ups=1.12, wpb=494919, bsz=16414.6, num_updates=28300, lr=0.000375956, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=20.6, wall=26192 epoch 017: 1319 / 1689 loss=3.741, nll_loss=2.21, ppl=4.63, wps=556140, ups=1.12, wpb=494919, bsz=16414.6, num_updates=28300, lr=0.000375956, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=20.6, wall=26192 epoch 017: 1319 / 1689 loss=3.741, nll_loss=2.21, ppl=4.63, wps=556140, ups=1.12, wpb=494919, bsz=16414.6, num_updates=28300, lr=0.000375956, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=20.6, wall=26192 epoch 017: 1319 / 1689 loss=3.741, nll_loss=2.21, ppl=4.63, wps=556140, ups=1.12, wpb=494919, bsz=16414.6, num_updates=28300, lr=0.000375956, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=20.6, wall=26192 epoch 017: 1319 / 1689 loss=3.741, nll_loss=2.21, ppl=4.63, wps=556140, ups=1.12, wpb=494919, bsz=16414.6, num_updates=28300, lr=0.000375956, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=20.6, wall=26192 epoch 017: 1319 / 1689 loss=3.741, nll_loss=2.21, ppl=4.63, wps=556140, ups=1.12, wpb=494919, bsz=16414.6, num_updates=28300, lr=0.000375956, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=20.6, wall=26192 epoch 017: 1319 / 1689 loss=3.741, nll_loss=2.21, ppl=4.63, wps=556140, ups=1.12, wpb=494919, bsz=16414.6, num_updates=28300, lr=0.000375956, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=20.6, wall=26192 epoch 017: 1419 / 1689 loss=3.739, nll_loss=2.208, ppl=4.62, wps=556678, ups=1.12, wpb=495686, bsz=16598.6, num_updates=28400, lr=0.000375293, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=20.1, wall=26281 epoch 017: 1419 / 1689 loss=3.739, nll_loss=2.208, ppl=4.62, wps=556678, ups=1.12, wpb=495686, bsz=16598.6, num_updates=28400, lr=0.000375293, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=20.1, wall=26281 epoch 017: 1419 / 1689 loss=3.739, nll_loss=2.208, ppl=4.62, wps=556678, ups=1.12, wpb=495686, bsz=16598.6, num_updates=28400, lr=0.000375293, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=20.1, wall=26281 epoch 017: 1419 / 1689 loss=3.739, nll_loss=2.208, ppl=4.62, wps=556678, ups=1.12, wpb=495686, bsz=16598.6, num_updates=28400, lr=0.000375293, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=20.1, wall=26281 epoch 017: 1419 / 1689 loss=3.739, nll_loss=2.208, ppl=4.62, wps=556678, ups=1.12, wpb=495686, bsz=16598.6, num_updates=28400, lr=0.000375293, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=20.1, wall=26281 epoch 017: 1419 / 1689 loss=3.739, nll_loss=2.208, ppl=4.62, wps=556678, ups=1.12, wpb=495686, bsz=16598.6, num_updates=28400, lr=0.000375293, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=20.1, wall=26281 epoch 017: 1419 / 1689 loss=3.739, nll_loss=2.208, ppl=4.62, wps=556678, ups=1.12, wpb=495686, bsz=16598.6, num_updates=28400, lr=0.000375293, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=20.1, wall=26281 epoch 017: 1419 / 1689 loss=3.739, nll_loss=2.208, ppl=4.62, wps=556678, ups=1.12, wpb=495686, bsz=16598.6, num_updates=28400, lr=0.000375293, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=20.1, wall=26281 epoch 017: 1419 / 1689 loss=3.739, nll_loss=2.208, ppl=4.62, wps=556678, ups=1.12, wpb=495686, bsz=16598.6, num_updates=28400, lr=0.000375293, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=20.1, wall=26281 epoch 017: 1419 / 1689 loss=3.739, nll_loss=2.208, ppl=4.62, wps=556678, ups=1.12, wpb=495686, bsz=16598.6, num_updates=28400, lr=0.000375293, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=20.1, wall=26281 epoch 017: 1419 / 1689 loss=3.739, nll_loss=2.208, ppl=4.62, wps=556678, ups=1.12, wpb=495686, bsz=16598.6, num_updates=28400, lr=0.000375293, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=20.1, wall=26281 epoch 017: 1419 / 1689 loss=3.739, nll_loss=2.208, ppl=4.62, wps=556678, ups=1.12, wpb=495686, bsz=16598.6, num_updates=28400, lr=0.000375293, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=20.1, wall=26281 epoch 017: 1419 / 1689 loss=3.739, nll_loss=2.208, ppl=4.62, wps=556678, ups=1.12, wpb=495686, bsz=16598.6, num_updates=28400, lr=0.000375293, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=20.1, wall=26281 epoch 017: 1419 / 1689 loss=3.739, nll_loss=2.208, ppl=4.62, wps=556678, ups=1.12, wpb=495686, bsz=16598.6, num_updates=28400, lr=0.000375293, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=20.1, wall=26281 epoch 017: 1419 / 1689 loss=3.739, nll_loss=2.208, ppl=4.62, wps=556678, ups=1.12, wpb=495686, bsz=16598.6, num_updates=28400, lr=0.000375293, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=20.1, wall=26281 epoch 017: 1419 / 1689 loss=3.739, nll_loss=2.208, ppl=4.62, wps=556678, ups=1.12, wpb=495686, bsz=16598.6, num_updates=28400, lr=0.000375293, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=20.1, wall=26281 epoch 017: 1419 / 1689 loss=3.739, nll_loss=2.208, ppl=4.62, wps=556678, ups=1.12, wpb=495686, bsz=16598.6, num_updates=28400, lr=0.000375293, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=20.1, wall=26281 epoch 017: 1519 / 1689 loss=3.74, nll_loss=2.21, ppl=4.63, wps=551140, ups=1.11, wpb=496805, bsz=16616.4, num_updates=28500, lr=0.000374634, gnorm=0.196, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=26371 epoch 017: 1519 / 1689 loss=3.74, nll_loss=2.21, ppl=4.63, wps=551140, ups=1.11, wpb=496805, bsz=16616.4, num_updates=28500, lr=0.000374634, gnorm=0.196, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=26371 epoch 017: 1519 / 1689 loss=3.74, nll_loss=2.21, ppl=4.63, wps=551140, ups=1.11, wpb=496805, bsz=16616.4, num_updates=28500, lr=0.000374634, gnorm=0.196, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=26371 epoch 017: 1519 / 1689 loss=3.74, nll_loss=2.21, ppl=4.63, wps=551140, ups=1.11, wpb=496805, bsz=16616.4, num_updates=28500, lr=0.000374634, gnorm=0.196, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=26371 epoch 017: 1519 / 1689 loss=3.74, nll_loss=2.21, ppl=4.63, wps=551140, ups=1.11, wpb=496805, bsz=16616.4, num_updates=28500, lr=0.000374634, gnorm=0.196, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=26371 epoch 017: 1519 / 1689 loss=3.74, nll_loss=2.21, ppl=4.63, wps=551140, ups=1.11, wpb=496805, bsz=16616.4, num_updates=28500, lr=0.000374634, gnorm=0.196, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=26371 epoch 017: 1519 / 1689 loss=3.74, nll_loss=2.21, ppl=4.63, wps=551140, ups=1.11, wpb=496805, bsz=16616.4, num_updates=28500, lr=0.000374634, gnorm=0.196, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=26371 epoch 017: 1519 / 1689 loss=3.74, nll_loss=2.21, ppl=4.63, wps=551140, ups=1.11, wpb=496805, bsz=16616.4, num_updates=28500, lr=0.000374634, gnorm=0.196, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=26371 epoch 017: 1519 / 1689 loss=3.74, nll_loss=2.21, ppl=4.63, wps=551140, ups=1.11, wpb=496805, bsz=16616.4, num_updates=28500, lr=0.000374634, gnorm=0.196, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=26371 epoch 017: 1519 / 1689 loss=3.74, nll_loss=2.21, ppl=4.63, wps=551140, ups=1.11, wpb=496805, bsz=16616.4, num_updates=28500, lr=0.000374634, gnorm=0.196, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=26371 epoch 017: 1519 / 1689 loss=3.74, nll_loss=2.21, ppl=4.63, wps=551140, ups=1.11, wpb=496805, bsz=16616.4, num_updates=28500, lr=0.000374634, gnorm=0.196, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=26371 epoch 017: 1519 / 1689 loss=3.74, nll_loss=2.21, ppl=4.63, wps=551140, ups=1.11, wpb=496805, bsz=16616.4, num_updates=28500, lr=0.000374634, gnorm=0.196, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=26371 epoch 017: 1519 / 1689 loss=3.74, nll_loss=2.21, ppl=4.63, wps=551140, ups=1.11, wpb=496805, bsz=16616.4, num_updates=28500, lr=0.000374634, gnorm=0.196, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=26371 epoch 017: 1519 / 1689 loss=3.74, nll_loss=2.21, ppl=4.63, wps=551140, ups=1.11, wpb=496805, bsz=16616.4, num_updates=28500, lr=0.000374634, gnorm=0.196, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=26371 epoch 017: 1519 / 1689 loss=3.74, nll_loss=2.21, ppl=4.63, wps=551140, ups=1.11, wpb=496805, bsz=16616.4, num_updates=28500, lr=0.000374634, gnorm=0.196, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=26371 epoch 017: 1519 / 1689 loss=3.74, nll_loss=2.21, ppl=4.63, wps=551140, ups=1.11, wpb=496805, bsz=16616.4, num_updates=28500, lr=0.000374634, gnorm=0.196, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=26371 epoch 017: 1519 / 1689 loss=3.74, nll_loss=2.21, ppl=4.63, wps=551140, ups=1.11, wpb=496805, bsz=16616.4, num_updates=28500, lr=0.000374634, gnorm=0.196, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=26371 epoch 017: 1619 / 1689 loss=3.746, nll_loss=2.216, ppl=4.65, wps=549233, ups=1.11, wpb=495724, bsz=16864.8, num_updates=28600, lr=0.000373979, gnorm=0.182, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=26461 epoch 017: 1619 / 1689 loss=3.746, nll_loss=2.216, ppl=4.65, wps=549233, ups=1.11, wpb=495724, bsz=16864.8, num_updates=28600, lr=0.000373979, gnorm=0.182, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=26461 epoch 017: 1619 / 1689 loss=3.746, nll_loss=2.216, ppl=4.65, wps=549233, ups=1.11, wpb=495724, bsz=16864.8, num_updates=28600, lr=0.000373979, gnorm=0.182, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=26461 epoch 017: 1619 / 1689 loss=3.746, nll_loss=2.216, ppl=4.65, wps=549233, ups=1.11, wpb=495724, bsz=16864.8, num_updates=28600, lr=0.000373979, gnorm=0.182, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=26461 epoch 017: 1619 / 1689 loss=3.746, nll_loss=2.216, ppl=4.65, wps=549233, ups=1.11, wpb=495724, bsz=16864.8, num_updates=28600, lr=0.000373979, gnorm=0.182, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=26461 epoch 017: 1619 / 1689 loss=3.746, nll_loss=2.216, ppl=4.65, wps=549233, ups=1.11, wpb=495724, bsz=16864.8, num_updates=28600, lr=0.000373979, gnorm=0.182, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=26461 epoch 017: 1619 / 1689 loss=3.746, nll_loss=2.216, ppl=4.65, wps=549233, ups=1.11, wpb=495724, bsz=16864.8, num_updates=28600, lr=0.000373979, gnorm=0.182, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=26461 epoch 017: 1619 / 1689 loss=3.746, nll_loss=2.216, ppl=4.65, wps=549233, ups=1.11, wpb=495724, bsz=16864.8, num_updates=28600, lr=0.000373979, gnorm=0.182, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=26461 epoch 017: 1619 / 1689 loss=3.746, nll_loss=2.216, ppl=4.65, wps=549233, ups=1.11, wpb=495724, bsz=16864.8, num_updates=28600, lr=0.000373979, gnorm=0.182, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=26461 epoch 017: 1619 / 1689 loss=3.746, nll_loss=2.216, ppl=4.65, wps=549233, ups=1.11, wpb=495724, bsz=16864.8, num_updates=28600, lr=0.000373979, gnorm=0.182, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=26461 epoch 017: 1619 / 1689 loss=3.746, nll_loss=2.216, ppl=4.65, wps=549233, ups=1.11, wpb=495724, bsz=16864.8, num_updates=28600, lr=0.000373979, gnorm=0.182, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=26461 epoch 017: 1619 / 1689 loss=3.746, nll_loss=2.216, ppl=4.65, wps=549233, ups=1.11, wpb=495724, bsz=16864.8, num_updates=28600, lr=0.000373979, gnorm=0.182, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=26461 epoch 017: 1619 / 1689 loss=3.746, nll_loss=2.216, ppl=4.65, wps=549233, ups=1.11, wpb=495724, bsz=16864.8, num_updates=28600, lr=0.000373979, gnorm=0.182, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=26461 epoch 017: 1619 / 1689 loss=3.746, nll_loss=2.216, ppl=4.65, wps=549233, ups=1.11, wpb=495724, bsz=16864.8, num_updates=28600, lr=0.000373979, gnorm=0.182, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=26461 epoch 017: 1619 / 1689 loss=3.746, nll_loss=2.216, ppl=4.65, wps=549233, ups=1.11, wpb=495724, bsz=16864.8, num_updates=28600, lr=0.000373979, gnorm=0.182, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=26461 epoch 017: 1619 / 1689 loss=3.746, nll_loss=2.216, ppl=4.65, wps=549233, ups=1.11, wpb=495724, bsz=16864.8, num_updates=28600, lr=0.000373979, gnorm=0.182, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=26461 epoch 017: 1619 / 1689 loss=3.746, nll_loss=2.216, ppl=4.65, wps=549233, ups=1.11, wpb=495724, bsz=16864.8, num_updates=28600, lr=0.000373979, gnorm=0.182, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=26461 end of epoch 17 (average epoch stats below) epoch 017 | loss 3.741 | nll_loss 2.21 | ppl 4.63 | wps 534292 | ups 1.08 | wpb 495111 | bsz 16508.5 | num_updates 28670 | lr 0.000373522 | gnorm 0.183 | clip 0 | loss_scale 4 | train_wall 1497 | gb_free 20.7 | wall 26523 epoch 017 | loss 3.741 | nll_loss 2.21 | ppl 4.63 | wps 534292 | ups 1.08 | wpb 495111 | bsz 16508.5 | num_updates 28670 | lr 0.000373522 | gnorm 0.183 | clip 0 | loss_scale 4 | train_wall 1497 | gb_free 20.7 | wall 26523 epoch 017 | loss 3.741 | nll_loss 2.21 | ppl 4.63 | wps 534292 | ups 1.08 | wpb 495111 | bsz 16508.5 | num_updates 28670 | lr 0.000373522 | gnorm 0.183 | clip 0 | loss_scale 4 | train_wall 1497 | gb_free 20.7 | wall 26523 epoch 017 | loss 3.741 | nll_loss 2.21 | ppl 4.63 | wps 534292 | ups 1.08 | wpb 495111 | bsz 16508.5 | num_updates 28670 | lr 0.000373522 | gnorm 0.183 | clip 0 | loss_scale 4 | train_wall 1497 | gb_free 20.7 | wall 26523 epoch 017 | loss 3.741 | nll_loss 2.21 | ppl 4.63 | wps 534292 | ups 1.08 | wpb 495111 | bsz 16508.5 | num_updates 28670 | lr 0.000373522 | gnorm 0.183 | clip 0 | loss_scale 4 | train_wall 1497 | gb_free 20.7 | wall 26523 epoch 017 | loss 3.741 | nll_loss 2.21 | ppl 4.63 | wps 534292 | ups 1.08 | wpb 495111 | bsz 16508.5 | num_updates 28670 | lr 0.000373522 | gnorm 0.183 | clip 0 | loss_scale 4 | train_wall 1497 | gb_free 20.7 | wall 26523 epoch 017 | loss 3.741 | nll_loss 2.21 | ppl 4.63 | wps 534292 | ups 1.08 | wpb 495111 | bsz 16508.5 | num_updates 28670 | lr 0.000373522 | gnorm 0.183 | clip 0 | loss_scale 4 | train_wall 1497 | gb_free 20.7 | wall 26523 epoch 017 | loss 3.741 | nll_loss 2.21 | ppl 4.63 | wps 534292 | ups 1.08 | wpb 495111 | bsz 16508.5 | num_updates 28670 | lr 0.000373522 | gnorm 0.183 | clip 0 | loss_scale 4 | train_wall 1497 | gb_free 20.7 | wall 26523 epoch 017 | loss 3.741 | nll_loss 2.21 | ppl 4.63 | wps 534292 | ups 1.08 | wpb 495111 | bsz 16508.5 | num_updates 28670 | lr 0.000373522 | gnorm 0.183 | clip 0 | loss_scale 4 | train_wall 1497 | gb_free 20.7 | wall 26523 epoch 017 | loss 3.741 | nll_loss 2.21 | ppl 4.63 | wps 534292 | ups 1.08 | wpb 495111 | bsz 16508.5 | num_updates 28670 | lr 0.000373522 | gnorm 0.183 | clip 0 | loss_scale 4 | train_wall 1497 | gb_free 20.7 | wall 26523 epoch 017 | loss 3.741 | nll_loss 2.21 | ppl 4.63 | wps 534292 | ups 1.08 | wpb 495111 | bsz 16508.5 | num_updates 28670 | lr 0.000373522 | gnorm 0.183 | clip 0 | loss_scale 4 | train_wall 1497 | gb_free 20.7 | wall 26523 epoch 017 | loss 3.741 | nll_loss 2.21 | ppl 4.63 | wps 534292 | ups 1.08 | wpb 495111 | bsz 16508.5 | num_updates 28670 | lr 0.000373522 | gnorm 0.183 | clip 0 | loss_scale 4 | train_wall 1497 | gb_free 20.7 | wall 26523 epoch 017 | loss 3.741 | nll_loss 2.21 | ppl 4.63 | wps 534292 | ups 1.08 | wpb 495111 | bsz 16508.5 | num_updates 28670 | lr 0.000373522 | gnorm 0.183 | clip 0 | loss_scale 4 | train_wall 1497 | gb_free 20.7 | wall 26523 epoch 017 | loss 3.741 | nll_loss 2.21 | ppl 4.63 | wps 534292 | ups 1.08 | wpb 495111 | bsz 16508.5 | num_updates 28670 | lr 0.000373522 | gnorm 0.183 | clip 0 | loss_scale 4 | train_wall 1497 | gb_free 20.7 | wall 26523 epoch 017 | loss 3.741 | nll_loss 2.21 | ppl 4.63 | wps 534292 | ups 1.08 | wpb 495111 | bsz 16508.5 | num_updates 28670 | lr 0.000373522 | gnorm 0.183 | clip 0 | loss_scale 4 | train_wall 1497 | gb_free 20.7 | wall 26523 epoch 017 | loss 3.741 | nll_loss 2.21 | ppl 4.63 | wps 534292 | ups 1.08 | wpb 495111 | bsz 16508.5 | num_updates 28670 | lr 0.000373522 | gnorm 0.183 | clip 0 | loss_scale 4 | train_wall 1497 | gb_free 20.7 | wall 26523 epoch 017 | loss 3.741 | nll_loss 2.21 | ppl 4.63 | wps 534292 | ups 1.08 | wpb 495111 | bsz 16508.5 | num_updates 28670 | lr 0.000373522 | gnorm 0.183 | clip 0 | loss_scale 4 | train_wall 1497 | gb_free 20.7 | wall 26523 Start iterating over samples epoch 018: 30 / 1689 loss=3.739, nll_loss=2.208, ppl=4.62, wps=535979, ups=1.09, wpb=490216, bsz=16046.6, num_updates=28700, lr=0.000373327, gnorm=0.181, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=26553 epoch 018: 30 / 1689 loss=3.739, nll_loss=2.208, ppl=4.62, wps=535979, ups=1.09, wpb=490216, bsz=16046.6, num_updates=28700, lr=0.000373327, gnorm=0.181, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=26553 epoch 018: 30 / 1689 loss=3.739, nll_loss=2.208, ppl=4.62, wps=535979, ups=1.09, wpb=490216, bsz=16046.6, num_updates=28700, lr=0.000373327, gnorm=0.181, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=26553 epoch 018: 30 / 1689 loss=3.739, nll_loss=2.208, ppl=4.62, wps=535979, ups=1.09, wpb=490216, bsz=16046.6, num_updates=28700, lr=0.000373327, gnorm=0.181, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=26553 epoch 018: 30 / 1689 loss=3.739, nll_loss=2.208, ppl=4.62, wps=535979, ups=1.09, wpb=490216, bsz=16046.6, num_updates=28700, lr=0.000373327, gnorm=0.181, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=26553 epoch 018: 30 / 1689 loss=3.739, nll_loss=2.208, ppl=4.62, wps=535979, ups=1.09, wpb=490216, bsz=16046.6, num_updates=28700, lr=0.000373327, gnorm=0.181, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=26553 epoch 018: 30 / 1689 loss=3.739, nll_loss=2.208, ppl=4.62, wps=535979, ups=1.09, wpb=490216, bsz=16046.6, num_updates=28700, lr=0.000373327, gnorm=0.181, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=26553 epoch 018: 30 / 1689 loss=3.739, nll_loss=2.208, ppl=4.62, wps=535979, ups=1.09, wpb=490216, bsz=16046.6, num_updates=28700, lr=0.000373327, gnorm=0.181, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=26553 epoch 018: 30 / 1689 loss=3.739, nll_loss=2.208, ppl=4.62, wps=535979, ups=1.09, wpb=490216, bsz=16046.6, num_updates=28700, lr=0.000373327, gnorm=0.181, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=26553 epoch 018: 30 / 1689 loss=3.739, nll_loss=2.208, ppl=4.62, wps=535979, ups=1.09, wpb=490216, bsz=16046.6, num_updates=28700, lr=0.000373327, gnorm=0.181, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=26553 epoch 018: 30 / 1689 loss=3.739, nll_loss=2.208, ppl=4.62, wps=535979, ups=1.09, wpb=490216, bsz=16046.6, num_updates=28700, lr=0.000373327, gnorm=0.181, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=26553 epoch 018: 30 / 1689 loss=3.739, nll_loss=2.208, ppl=4.62, wps=535979, ups=1.09, wpb=490216, bsz=16046.6, num_updates=28700, lr=0.000373327, gnorm=0.181, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=26553 epoch 018: 30 / 1689 loss=3.739, nll_loss=2.208, ppl=4.62, wps=535979, ups=1.09, wpb=490216, bsz=16046.6, num_updates=28700, lr=0.000373327, gnorm=0.181, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=26553 epoch 018: 30 / 1689 loss=3.739, nll_loss=2.208, ppl=4.62, wps=535979, ups=1.09, wpb=490216, bsz=16046.6, num_updates=28700, lr=0.000373327, gnorm=0.181, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=26553 epoch 018: 30 / 1689 loss=3.739, nll_loss=2.208, ppl=4.62, wps=535979, ups=1.09, wpb=490216, bsz=16046.6, num_updates=28700, lr=0.000373327, gnorm=0.181, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=26553 epoch 018: 30 / 1689 loss=3.739, nll_loss=2.208, ppl=4.62, wps=535979, ups=1.09, wpb=490216, bsz=16046.6, num_updates=28700, lr=0.000373327, gnorm=0.181, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=26553 epoch 018: 30 / 1689 loss=3.739, nll_loss=2.208, ppl=4.62, wps=535979, ups=1.09, wpb=490216, bsz=16046.6, num_updates=28700, lr=0.000373327, gnorm=0.181, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=26553 epoch 018: 30 / 1689 loss=3.739, nll_loss=2.208, ppl=4.62, wps=535979, ups=1.09, wpb=490216, bsz=16046.6, num_updates=28700, lr=0.000373327, gnorm=0.181, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=26553 epoch 018: 130 / 1689 loss=3.726, nll_loss=2.193, ppl=4.57, wps=548924, ups=1.11, wpb=494831, bsz=16471.4, num_updates=28800, lr=0.000372678, gnorm=0.177, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=26643 epoch 018: 130 / 1689 loss=3.726, nll_loss=2.193, ppl=4.57, wps=548924, ups=1.11, wpb=494831, bsz=16471.4, num_updates=28800, lr=0.000372678, gnorm=0.177, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=26643 epoch 018: 130 / 1689 loss=3.726, nll_loss=2.193, ppl=4.57, wps=548924, ups=1.11, wpb=494831, bsz=16471.4, num_updates=28800, lr=0.000372678, gnorm=0.177, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=26643 epoch 018: 130 / 1689 loss=3.726, nll_loss=2.193, ppl=4.57, wps=548924, ups=1.11, wpb=494831, bsz=16471.4, num_updates=28800, lr=0.000372678, gnorm=0.177, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=26643 epoch 018: 130 / 1689 loss=3.726, nll_loss=2.193, ppl=4.57, wps=548924, ups=1.11, wpb=494831, bsz=16471.4, num_updates=28800, lr=0.000372678, gnorm=0.177, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=26643 epoch 018: 130 / 1689 loss=3.726, nll_loss=2.193, ppl=4.57, wps=548924, ups=1.11, wpb=494831, bsz=16471.4, num_updates=28800, lr=0.000372678, gnorm=0.177, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=26643 epoch 018: 130 / 1689 loss=3.726, nll_loss=2.193, ppl=4.57, wps=548924, ups=1.11, wpb=494831, bsz=16471.4, num_updates=28800, lr=0.000372678, gnorm=0.177, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=26643 epoch 018: 130 / 1689 loss=3.726, nll_loss=2.193, ppl=4.57, wps=548924, ups=1.11, wpb=494831, bsz=16471.4, num_updates=28800, lr=0.000372678, gnorm=0.177, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=26643 epoch 018: 130 / 1689 loss=3.726, nll_loss=2.193, ppl=4.57, wps=548924, ups=1.11, wpb=494831, bsz=16471.4, num_updates=28800, lr=0.000372678, gnorm=0.177, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=26643 epoch 018: 130 / 1689 loss=3.726, nll_loss=2.193, ppl=4.57, wps=548924, ups=1.11, wpb=494831, bsz=16471.4, num_updates=28800, lr=0.000372678, gnorm=0.177, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=26643 epoch 018: 130 / 1689 loss=3.726, nll_loss=2.193, ppl=4.57, wps=548924, ups=1.11, wpb=494831, bsz=16471.4, num_updates=28800, lr=0.000372678, gnorm=0.177, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=26643 epoch 018: 130 / 1689 loss=3.726, nll_loss=2.193, ppl=4.57, wps=548924, ups=1.11, wpb=494831, bsz=16471.4, num_updates=28800, lr=0.000372678, gnorm=0.177, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=26643 epoch 018: 130 / 1689 loss=3.726, nll_loss=2.193, ppl=4.57, wps=548924, ups=1.11, wpb=494831, bsz=16471.4, num_updates=28800, lr=0.000372678, gnorm=0.177, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=26643 epoch 018: 130 / 1689 loss=3.726, nll_loss=2.193, ppl=4.57, wps=548924, ups=1.11, wpb=494831, bsz=16471.4, num_updates=28800, lr=0.000372678, gnorm=0.177, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=26643 epoch 018: 130 / 1689 loss=3.726, nll_loss=2.193, ppl=4.57, wps=548924, ups=1.11, wpb=494831, bsz=16471.4, num_updates=28800, lr=0.000372678, gnorm=0.177, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=26643 epoch 018: 130 / 1689 loss=3.726, nll_loss=2.193, ppl=4.57, wps=548924, ups=1.11, wpb=494831, bsz=16471.4, num_updates=28800, lr=0.000372678, gnorm=0.177, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=26643 epoch 018: 130 / 1689 loss=3.726, nll_loss=2.193, ppl=4.57, wps=548924, ups=1.11, wpb=494831, bsz=16471.4, num_updates=28800, lr=0.000372678, gnorm=0.177, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=26643 epoch 018: 130 / 1689 loss=3.726, nll_loss=2.193, ppl=4.57, wps=548924, ups=1.11, wpb=494831, bsz=16471.4, num_updates=28800, lr=0.000372678, gnorm=0.177, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=26643 epoch 018: 230 / 1689 loss=3.732, nll_loss=2.201, ppl=4.6, wps=550402, ups=1.11, wpb=495926, bsz=16320.2, num_updates=28900, lr=0.000372033, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=26733 epoch 018: 230 / 1689 loss=3.732, nll_loss=2.201, ppl=4.6, wps=550402, ups=1.11, wpb=495926, bsz=16320.2, num_updates=28900, lr=0.000372033, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=26733 epoch 018: 230 / 1689 loss=3.732, nll_loss=2.201, ppl=4.6, wps=550402, ups=1.11, wpb=495926, bsz=16320.2, num_updates=28900, lr=0.000372033, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=26733 epoch 018: 230 / 1689 loss=3.732, nll_loss=2.201, ppl=4.6, wps=550402, ups=1.11, wpb=495926, bsz=16320.2, num_updates=28900, lr=0.000372033, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=26733 epoch 018: 230 / 1689 loss=3.732, nll_loss=2.201, ppl=4.6, wps=550402, ups=1.11, wpb=495926, bsz=16320.2, num_updates=28900, lr=0.000372033, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=26733 epoch 018: 230 / 1689 loss=3.732, nll_loss=2.201, ppl=4.6, wps=550402, ups=1.11, wpb=495926, bsz=16320.2, num_updates=28900, lr=0.000372033, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=26733 epoch 018: 230 / 1689 loss=3.732, nll_loss=2.201, ppl=4.6, wps=550402, ups=1.11, wpb=495926, bsz=16320.2, num_updates=28900, lr=0.000372033, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=26733 epoch 018: 230 / 1689 loss=3.732, nll_loss=2.201, ppl=4.6, wps=550402, ups=1.11, wpb=495926, bsz=16320.2, num_updates=28900, lr=0.000372033, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=26733 epoch 018: 230 / 1689 loss=3.732, nll_loss=2.201, ppl=4.6, wps=550402, ups=1.11, wpb=495926, bsz=16320.2, num_updates=28900, lr=0.000372033, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=26733 epoch 018: 230 / 1689 loss=3.732, nll_loss=2.201, ppl=4.6, wps=550402, ups=1.11, wpb=495926, bsz=16320.2, num_updates=28900, lr=0.000372033, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=26733 epoch 018: 230 / 1689 loss=3.732, nll_loss=2.201, ppl=4.6, wps=550402, ups=1.11, wpb=495926, bsz=16320.2, num_updates=28900, lr=0.000372033, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=26733 epoch 018: 230 / 1689 loss=3.732, nll_loss=2.201, ppl=4.6, wps=550402, ups=1.11, wpb=495926, bsz=16320.2, num_updates=28900, lr=0.000372033, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=26733 epoch 018: 230 / 1689 loss=3.732, nll_loss=2.201, ppl=4.6, wps=550402, ups=1.11, wpb=495926, bsz=16320.2, num_updates=28900, lr=0.000372033, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=26733 epoch 018: 230 / 1689 loss=3.732, nll_loss=2.201, ppl=4.6, wps=550402, ups=1.11, wpb=495926, bsz=16320.2, num_updates=28900, lr=0.000372033, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=26733 epoch 018: 230 / 1689 loss=3.732, nll_loss=2.201, ppl=4.6, wps=550402, ups=1.11, wpb=495926, bsz=16320.2, num_updates=28900, lr=0.000372033, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=26733 epoch 018: 230 / 1689 loss=3.732, nll_loss=2.201, ppl=4.6, wps=550402, ups=1.11, wpb=495926, bsz=16320.2, num_updates=28900, lr=0.000372033, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=26733 epoch 018: 230 / 1689 loss=3.732, nll_loss=2.201, ppl=4.6, wps=550402, ups=1.11, wpb=495926, bsz=16320.2, num_updates=28900, lr=0.000372033, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=26733 epoch 018: 230 / 1689 loss=3.732, nll_loss=2.201, ppl=4.6, wps=550402, ups=1.11, wpb=495926, bsz=16320.2, num_updates=28900, lr=0.000372033, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=26733 epoch 018: 331 / 1689 loss=3.731, nll_loss=2.199, ppl=4.59, wps=545450, ups=1.1, wpb=496100, bsz=16629.6, num_updates=29000, lr=0.000371391, gnorm=0.186, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=26824 epoch 018: 331 / 1689 loss=3.731, nll_loss=2.199, ppl=4.59, wps=545450, ups=1.1, wpb=496100, bsz=16629.6, num_updates=29000, lr=0.000371391, gnorm=0.186, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=26824 epoch 018: 331 / 1689 loss=3.731, nll_loss=2.199, ppl=4.59, wps=545450, ups=1.1, wpb=496100, bsz=16629.6, num_updates=29000, lr=0.000371391, gnorm=0.186, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=26824 epoch 018: 331 / 1689 loss=3.731, nll_loss=2.199, ppl=4.59, wps=545450, ups=1.1, wpb=496100, bsz=16629.6, num_updates=29000, lr=0.000371391, gnorm=0.186, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=26824 epoch 018: 331 / 1689 loss=3.731, nll_loss=2.199, ppl=4.59, wps=545450, ups=1.1, wpb=496100, bsz=16629.6, num_updates=29000, lr=0.000371391, gnorm=0.186, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=26824 epoch 018: 331 / 1689 loss=3.731, nll_loss=2.199, ppl=4.59, wps=545450, ups=1.1, wpb=496100, bsz=16629.6, num_updates=29000, lr=0.000371391, gnorm=0.186, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=26824 epoch 018: 331 / 1689 loss=3.731, nll_loss=2.199, ppl=4.59, wps=545450, ups=1.1, wpb=496100, bsz=16629.6, num_updates=29000, lr=0.000371391, gnorm=0.186, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=26824 epoch 018: 331 / 1689 loss=3.731, nll_loss=2.199, ppl=4.59, wps=545450, ups=1.1, wpb=496100, bsz=16629.6, num_updates=29000, lr=0.000371391, gnorm=0.186, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=26824 epoch 018: 331 / 1689 loss=3.731, nll_loss=2.199, ppl=4.59, wps=545450, ups=1.1, wpb=496100, bsz=16629.6, num_updates=29000, lr=0.000371391, gnorm=0.186, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=26824 epoch 018: 331 / 1689 loss=3.731, nll_loss=2.199, ppl=4.59, wps=545450, ups=1.1, wpb=496100, bsz=16629.6, num_updates=29000, lr=0.000371391, gnorm=0.186, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=26824 epoch 018: 331 / 1689 loss=3.731, nll_loss=2.199, ppl=4.59, wps=545450, ups=1.1, wpb=496100, bsz=16629.6, num_updates=29000, lr=0.000371391, gnorm=0.186, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=26824 epoch 018: 331 / 1689 loss=3.731, nll_loss=2.199, ppl=4.59, wps=545450, ups=1.1, wpb=496100, bsz=16629.6, num_updates=29000, lr=0.000371391, gnorm=0.186, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=26824 epoch 018: 331 / 1689 loss=3.731, nll_loss=2.199, ppl=4.59, wps=545450, ups=1.1, wpb=496100, bsz=16629.6, num_updates=29000, lr=0.000371391, gnorm=0.186, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=26824 epoch 018: 331 / 1689 loss=3.731, nll_loss=2.199, ppl=4.59, wps=545450, ups=1.1, wpb=496100, bsz=16629.6, num_updates=29000, lr=0.000371391, gnorm=0.186, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=26824 epoch 018: 331 / 1689 loss=3.731, nll_loss=2.199, ppl=4.59, wps=545450, ups=1.1, wpb=496100, bsz=16629.6, num_updates=29000, lr=0.000371391, gnorm=0.186, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=26824 epoch 018: 331 / 1689 loss=3.731, nll_loss=2.199, ppl=4.59, wps=545450, ups=1.1, wpb=496100, bsz=16629.6, num_updates=29000, lr=0.000371391, gnorm=0.186, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=26824 epoch 018: 331 / 1689 loss=3.731, nll_loss=2.199, ppl=4.59, wps=545450, ups=1.1, wpb=496100, bsz=16629.6, num_updates=29000, lr=0.000371391, gnorm=0.186, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=26824 epoch 018: 331 / 1689 loss=3.731, nll_loss=2.199, ppl=4.59, wps=545450, ups=1.1, wpb=496100, bsz=16629.6, num_updates=29000, lr=0.000371391, gnorm=0.186, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=26824 begin validation on "valid" subset epoch 018 | valid on 'valid' subset | loss 3.747 | nll_loss 2.179 | ppl 4.53 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.743 epoch 018 | valid on 'valid' subset | loss 3.747 | nll_loss 2.179 | ppl 4.53 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.743 epoch 018 | valid on 'valid' subset | loss 3.747 | nll_loss 2.179 | ppl 4.53 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.743 epoch 018 | valid on 'valid' subset | loss 3.747 | nll_loss 2.179 | ppl 4.53 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.743 epoch 018 | valid on 'valid' subset | loss 3.747 | nll_loss 2.179 | ppl 4.53 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.743 epoch 018 | valid on 'valid' subset | loss 3.747 | nll_loss 2.179 | ppl 4.53 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.743 epoch 018 | valid on 'valid' subset | loss 3.747 | nll_loss 2.179 | ppl 4.53 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.743 epoch 018 | valid on 'valid' subset | loss 3.747 | nll_loss 2.179 | ppl 4.53 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.743 epoch 018 | valid on 'valid' subset | loss 3.747 | nll_loss 2.179 | ppl 4.53 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.743 epoch 018 | valid on 'valid' subset | loss 3.747 | nll_loss 2.179 | ppl 4.53 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.743 epoch 018 | valid on 'valid' subset | loss 3.747 | nll_loss 2.179 | ppl 4.53 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.743 epoch 018 | valid on 'valid' subset | loss 3.747 | nll_loss 2.179 | ppl 4.53 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.743 epoch 018 | valid on 'valid' subset | loss 3.747 | nll_loss 2.179 | ppl 4.53 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.743 epoch 018 | valid on 'valid' subset | loss 3.747 | nll_loss 2.179 | ppl 4.53 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.743 epoch 018 | valid on 'valid' subset | loss 3.747 | nll_loss 2.179 | ppl 4.53 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.743 epoch 018 | valid on 'valid' subset | loss 3.747 | nll_loss 2.179 | ppl 4.53 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.743 epoch 018 | valid on 'valid' subset | loss 3.747 | nll_loss 2.179 | ppl 4.53 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.743 epoch 018 | valid on 'valid' subset | loss 3.747 | nll_loss 2.179 | ppl 4.53 | wps 0 | wpb 44526 | bsz 2008 | num_updates 29000 | best_loss 3.743 epoch 018: 431 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=486202, ups=0.98, wpb=498334, bsz=16633.6, num_updates=29100, lr=0.000370752, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=26926 epoch 018: 431 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=486202, ups=0.98, wpb=498334, bsz=16633.6, num_updates=29100, lr=0.000370752, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=26926 epoch 018: 431 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=486202, ups=0.98, wpb=498334, bsz=16633.6, num_updates=29100, lr=0.000370752, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=26926 epoch 018: 431 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=486202, ups=0.98, wpb=498334, bsz=16633.6, num_updates=29100, lr=0.000370752, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=26926 epoch 018: 431 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=486202, ups=0.98, wpb=498334, bsz=16633.6, num_updates=29100, lr=0.000370752, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=26926 epoch 018: 431 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=486202, ups=0.98, wpb=498334, bsz=16633.6, num_updates=29100, lr=0.000370752, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=26926 epoch 018: 431 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=486202, ups=0.98, wpb=498334, bsz=16633.6, num_updates=29100, lr=0.000370752, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=26926 epoch 018: 431 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=486202, ups=0.98, wpb=498334, bsz=16633.6, num_updates=29100, lr=0.000370752, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=26926 epoch 018: 431 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=486202, ups=0.98, wpb=498334, bsz=16633.6, num_updates=29100, lr=0.000370752, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=26926 epoch 018: 431 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=486202, ups=0.98, wpb=498334, bsz=16633.6, num_updates=29100, lr=0.000370752, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=26926 epoch 018: 431 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=486202, ups=0.98, wpb=498334, bsz=16633.6, num_updates=29100, lr=0.000370752, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=26926 epoch 018: 431 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=486202, ups=0.98, wpb=498334, bsz=16633.6, num_updates=29100, lr=0.000370752, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=26926 epoch 018: 431 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=486202, ups=0.98, wpb=498334, bsz=16633.6, num_updates=29100, lr=0.000370752, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=26926 epoch 018: 431 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=486202, ups=0.98, wpb=498334, bsz=16633.6, num_updates=29100, lr=0.000370752, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=26926 epoch 018: 431 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=486202, ups=0.98, wpb=498334, bsz=16633.6, num_updates=29100, lr=0.000370752, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=26926 epoch 018: 431 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=486202, ups=0.98, wpb=498334, bsz=16633.6, num_updates=29100, lr=0.000370752, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=26926 epoch 018: 431 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=486202, ups=0.98, wpb=498334, bsz=16633.6, num_updates=29100, lr=0.000370752, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=26926 epoch 018: 431 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=486202, ups=0.98, wpb=498334, bsz=16633.6, num_updates=29100, lr=0.000370752, gnorm=0.195, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=26926 epoch 018: 531 / 1689 loss=3.728, nll_loss=2.196, ppl=4.58, wps=552607, ups=1.12, wpb=495317, bsz=16388.5, num_updates=29200, lr=0.000370117, gnorm=0.182, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=27016 epoch 018: 531 / 1689 loss=3.728, nll_loss=2.196, ppl=4.58, wps=552607, ups=1.12, wpb=495317, bsz=16388.5, num_updates=29200, lr=0.000370117, gnorm=0.182, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=27016 epoch 018: 531 / 1689 loss=3.728, nll_loss=2.196, ppl=4.58, wps=552607, ups=1.12, wpb=495317, bsz=16388.5, num_updates=29200, lr=0.000370117, gnorm=0.182, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=27016 epoch 018: 531 / 1689 loss=3.728, nll_loss=2.196, ppl=4.58, wps=552607, ups=1.12, wpb=495317, bsz=16388.5, num_updates=29200, lr=0.000370117, gnorm=0.182, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=27016 epoch 018: 531 / 1689 loss=3.728, nll_loss=2.196, ppl=4.58, wps=552607, ups=1.12, wpb=495317, bsz=16388.5, num_updates=29200, lr=0.000370117, gnorm=0.182, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=27016 epoch 018: 531 / 1689 loss=3.728, nll_loss=2.196, ppl=4.58, wps=552607, ups=1.12, wpb=495317, bsz=16388.5, num_updates=29200, lr=0.000370117, gnorm=0.182, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=27016 epoch 018: 531 / 1689 loss=3.728, nll_loss=2.196, ppl=4.58, wps=552607, ups=1.12, wpb=495317, bsz=16388.5, num_updates=29200, lr=0.000370117, gnorm=0.182, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=27016 epoch 018: 531 / 1689 loss=3.728, nll_loss=2.196, ppl=4.58, wps=552607, ups=1.12, wpb=495317, bsz=16388.5, num_updates=29200, lr=0.000370117, gnorm=0.182, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=27016 epoch 018: 531 / 1689 loss=3.728, nll_loss=2.196, ppl=4.58, wps=552607, ups=1.12, wpb=495317, bsz=16388.5, num_updates=29200, lr=0.000370117, gnorm=0.182, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=27016 epoch 018: 531 / 1689 loss=3.728, nll_loss=2.196, ppl=4.58, wps=552607, ups=1.12, wpb=495317, bsz=16388.5, num_updates=29200, lr=0.000370117, gnorm=0.182, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=27016 epoch 018: 531 / 1689 loss=3.728, nll_loss=2.196, ppl=4.58, wps=552607, ups=1.12, wpb=495317, bsz=16388.5, num_updates=29200, lr=0.000370117, gnorm=0.182, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=27016 epoch 018: 531 / 1689 loss=3.728, nll_loss=2.196, ppl=4.58, wps=552607, ups=1.12, wpb=495317, bsz=16388.5, num_updates=29200, lr=0.000370117, gnorm=0.182, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=27016 epoch 018: 531 / 1689 loss=3.728, nll_loss=2.196, ppl=4.58, wps=552607, ups=1.12, wpb=495317, bsz=16388.5, num_updates=29200, lr=0.000370117, gnorm=0.182, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=27016 epoch 018: 531 / 1689 loss=3.728, nll_loss=2.196, ppl=4.58, wps=552607, ups=1.12, wpb=495317, bsz=16388.5, num_updates=29200, lr=0.000370117, gnorm=0.182, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=27016 epoch 018: 531 / 1689 loss=3.728, nll_loss=2.196, ppl=4.58, wps=552607, ups=1.12, wpb=495317, bsz=16388.5, num_updates=29200, lr=0.000370117, gnorm=0.182, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=27016 epoch 018: 531 / 1689 loss=3.728, nll_loss=2.196, ppl=4.58, wps=552607, ups=1.12, wpb=495317, bsz=16388.5, num_updates=29200, lr=0.000370117, gnorm=0.182, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=27016 epoch 018: 531 / 1689 loss=3.728, nll_loss=2.196, ppl=4.58, wps=552607, ups=1.12, wpb=495317, bsz=16388.5, num_updates=29200, lr=0.000370117, gnorm=0.182, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=27016 epoch 018: 531 / 1689 loss=3.728, nll_loss=2.196, ppl=4.58, wps=552607, ups=1.12, wpb=495317, bsz=16388.5, num_updates=29200, lr=0.000370117, gnorm=0.182, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=27016 epoch 018: 631 / 1689 loss=3.73, nll_loss=2.198, ppl=4.59, wps=559225, ups=1.13, wpb=496435, bsz=16328, num_updates=29300, lr=0.000369484, gnorm=0.192, clip=0, loss_scale=2, train_wall=87, gb_free=21.5, wall=27105 epoch 018: 631 / 1689 loss=3.73, nll_loss=2.198, ppl=4.59, wps=559225, ups=1.13, wpb=496435, bsz=16328, num_updates=29300, lr=0.000369484, gnorm=0.192, clip=0, loss_scale=2, train_wall=87, gb_free=21.5, wall=27105 epoch 018: 631 / 1689 loss=3.73, nll_loss=2.198, ppl=4.59, wps=559225, ups=1.13, wpb=496435, bsz=16328, num_updates=29300, lr=0.000369484, gnorm=0.192, clip=0, loss_scale=2, train_wall=87, gb_free=21.5, wall=27105 epoch 018: 631 / 1689 loss=3.73, nll_loss=2.198, ppl=4.59, wps=559225, ups=1.13, wpb=496435, bsz=16328, num_updates=29300, lr=0.000369484, gnorm=0.192, clip=0, loss_scale=2, train_wall=87, gb_free=21.5, wall=27105 epoch 018: 631 / 1689 loss=3.73, nll_loss=2.198, ppl=4.59, wps=559225, ups=1.13, wpb=496435, bsz=16328, num_updates=29300, lr=0.000369484, gnorm=0.192, clip=0, loss_scale=2, train_wall=87, gb_free=21.5, wall=27105 epoch 018: 631 / 1689 loss=3.73, nll_loss=2.198, ppl=4.59, wps=559225, ups=1.13, wpb=496435, bsz=16328, num_updates=29300, lr=0.000369484, gnorm=0.192, clip=0, loss_scale=2, train_wall=87, gb_free=21.5, wall=27105 epoch 018: 631 / 1689 loss=3.73, nll_loss=2.198, ppl=4.59, wps=559225, ups=1.13, wpb=496435, bsz=16328, num_updates=29300, lr=0.000369484, gnorm=0.192, clip=0, loss_scale=2, train_wall=87, gb_free=21.5, wall=27105 epoch 018: 631 / 1689 loss=3.73, nll_loss=2.198, ppl=4.59, wps=559225, ups=1.13, wpb=496435, bsz=16328, num_updates=29300, lr=0.000369484, gnorm=0.192, clip=0, loss_scale=2, train_wall=87, gb_free=21.5, wall=27105 epoch 018: 631 / 1689 loss=3.73, nll_loss=2.198, ppl=4.59, wps=559225, ups=1.13, wpb=496435, bsz=16328, num_updates=29300, lr=0.000369484, gnorm=0.192, clip=0, loss_scale=2, train_wall=87, gb_free=21.5, wall=27105 epoch 018: 631 / 1689 loss=3.73, nll_loss=2.198, ppl=4.59, wps=559225, ups=1.13, wpb=496435, bsz=16328, num_updates=29300, lr=0.000369484, gnorm=0.192, clip=0, loss_scale=2, train_wall=87, gb_free=21.5, wall=27105 epoch 018: 631 / 1689 loss=3.73, nll_loss=2.198, ppl=4.59, wps=559225, ups=1.13, wpb=496435, bsz=16328, num_updates=29300, lr=0.000369484, gnorm=0.192, clip=0, loss_scale=2, train_wall=87, gb_free=21.5, wall=27105 epoch 018: 631 / 1689 loss=3.73, nll_loss=2.198, ppl=4.59, wps=559225, ups=1.13, wpb=496435, bsz=16328, num_updates=29300, lr=0.000369484, gnorm=0.192, clip=0, loss_scale=2, train_wall=87, gb_free=21.5, wall=27105 epoch 018: 631 / 1689 loss=3.73, nll_loss=2.198, ppl=4.59, wps=559225, ups=1.13, wpb=496435, bsz=16328, num_updates=29300, lr=0.000369484, gnorm=0.192, clip=0, loss_scale=2, train_wall=87, gb_free=21.5, wall=27105 epoch 018: 631 / 1689 loss=3.73, nll_loss=2.198, ppl=4.59, wps=559225, ups=1.13, wpb=496435, bsz=16328, num_updates=29300, lr=0.000369484, gnorm=0.192, clip=0, loss_scale=2, train_wall=87, gb_free=21.5, wall=27105 epoch 018: 631 / 1689 loss=3.73, nll_loss=2.198, ppl=4.59, wps=559225, ups=1.13, wpb=496435, bsz=16328, num_updates=29300, lr=0.000369484, gnorm=0.192, clip=0, loss_scale=2, train_wall=87, gb_free=21.5, wall=27105 epoch 018: 631 / 1689 loss=3.73, nll_loss=2.198, ppl=4.59, wps=559225, ups=1.13, wpb=496435, bsz=16328, num_updates=29300, lr=0.000369484, gnorm=0.192, clip=0, loss_scale=2, train_wall=87, gb_free=21.5, wall=27105 epoch 018: 631 / 1689 loss=3.73, nll_loss=2.198, ppl=4.59, wps=559225, ups=1.13, wpb=496435, bsz=16328, num_updates=29300, lr=0.000369484, gnorm=0.192, clip=0, loss_scale=2, train_wall=87, gb_free=21.5, wall=27105 epoch 018: 631 / 1689 loss=3.73, nll_loss=2.198, ppl=4.59, wps=559225, ups=1.13, wpb=496435, bsz=16328, num_updates=29300, lr=0.000369484, gnorm=0.192, clip=0, loss_scale=2, train_wall=87, gb_free=21.5, wall=27105 epoch 018: 731 / 1689 loss=3.732, nll_loss=2.201, ppl=4.6, wps=556581, ups=1.12, wpb=495371, bsz=16537.9, num_updates=29400, lr=0.000368856, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=27194 epoch 018: 731 / 1689 loss=3.732, nll_loss=2.201, ppl=4.6, wps=556581, ups=1.12, wpb=495371, bsz=16537.9, num_updates=29400, lr=0.000368856, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=27194 epoch 018: 731 / 1689 loss=3.732, nll_loss=2.201, ppl=4.6, wps=556581, ups=1.12, wpb=495371, bsz=16537.9, num_updates=29400, lr=0.000368856, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=27194 epoch 018: 731 / 1689 loss=3.732, nll_loss=2.201, ppl=4.6, wps=556581, ups=1.12, wpb=495371, bsz=16537.9, num_updates=29400, lr=0.000368856, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=27194 epoch 018: 731 / 1689 loss=3.732, nll_loss=2.201, ppl=4.6, wps=556581, ups=1.12, wpb=495371, bsz=16537.9, num_updates=29400, lr=0.000368856, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=27194 epoch 018: 731 / 1689 loss=3.732, nll_loss=2.201, ppl=4.6, wps=556581, ups=1.12, wpb=495371, bsz=16537.9, num_updates=29400, lr=0.000368856, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=27194 epoch 018: 731 / 1689 loss=3.732, nll_loss=2.201, ppl=4.6, wps=556581, ups=1.12, wpb=495371, bsz=16537.9, num_updates=29400, lr=0.000368856, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=27194 epoch 018: 731 / 1689 loss=3.732, nll_loss=2.201, ppl=4.6, wps=556581, ups=1.12, wpb=495371, bsz=16537.9, num_updates=29400, lr=0.000368856, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=27194 epoch 018: 731 / 1689 loss=3.732, nll_loss=2.201, ppl=4.6, wps=556581, ups=1.12, wpb=495371, bsz=16537.9, num_updates=29400, lr=0.000368856, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=27194 epoch 018: 731 / 1689 loss=3.732, nll_loss=2.201, ppl=4.6, wps=556581, ups=1.12, wpb=495371, bsz=16537.9, num_updates=29400, lr=0.000368856, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=27194 epoch 018: 731 / 1689 loss=3.732, nll_loss=2.201, ppl=4.6, wps=556581, ups=1.12, wpb=495371, bsz=16537.9, num_updates=29400, lr=0.000368856, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=27194 epoch 018: 731 / 1689 loss=3.732, nll_loss=2.201, ppl=4.6, wps=556581, ups=1.12, wpb=495371, bsz=16537.9, num_updates=29400, lr=0.000368856, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=27194 epoch 018: 731 / 1689 loss=3.732, nll_loss=2.201, ppl=4.6, wps=556581, ups=1.12, wpb=495371, bsz=16537.9, num_updates=29400, lr=0.000368856, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=27194 epoch 018: 731 / 1689 loss=3.732, nll_loss=2.201, ppl=4.6, wps=556581, ups=1.12, wpb=495371, bsz=16537.9, num_updates=29400, lr=0.000368856, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=27194 epoch 018: 731 / 1689 loss=3.732, nll_loss=2.201, ppl=4.6, wps=556581, ups=1.12, wpb=495371, bsz=16537.9, num_updates=29400, lr=0.000368856, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=27194 epoch 018: 731 / 1689 loss=3.732, nll_loss=2.201, ppl=4.6, wps=556581, ups=1.12, wpb=495371, bsz=16537.9, num_updates=29400, lr=0.000368856, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=27194 epoch 018: 731 / 1689 loss=3.732, nll_loss=2.201, ppl=4.6, wps=556581, ups=1.12, wpb=495371, bsz=16537.9, num_updates=29400, lr=0.000368856, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=27194 epoch 018: 731 / 1689 loss=3.732, nll_loss=2.201, ppl=4.6, wps=556581, ups=1.12, wpb=495371, bsz=16537.9, num_updates=29400, lr=0.000368856, gnorm=0.187, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=27194 epoch 018: 831 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=550389, ups=1.11, wpb=494816, bsz=17003.6, num_updates=29500, lr=0.00036823, gnorm=0.177, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=27284 epoch 018: 831 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=550389, ups=1.11, wpb=494816, bsz=17003.6, num_updates=29500, lr=0.00036823, gnorm=0.177, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=27284 epoch 018: 831 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=550389, ups=1.11, wpb=494816, bsz=17003.6, num_updates=29500, lr=0.00036823, gnorm=0.177, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=27284 epoch 018: 831 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=550389, ups=1.11, wpb=494816, bsz=17003.6, num_updates=29500, lr=0.00036823, gnorm=0.177, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=27284 epoch 018: 831 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=550389, ups=1.11, wpb=494816, bsz=17003.6, num_updates=29500, lr=0.00036823, gnorm=0.177, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=27284 epoch 018: 831 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=550389, ups=1.11, wpb=494816, bsz=17003.6, num_updates=29500, lr=0.00036823, gnorm=0.177, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=27284 epoch 018: 831 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=550389, ups=1.11, wpb=494816, bsz=17003.6, num_updates=29500, lr=0.00036823, gnorm=0.177, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=27284 epoch 018: 831 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=550389, ups=1.11, wpb=494816, bsz=17003.6, num_updates=29500, lr=0.00036823, gnorm=0.177, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=27284 epoch 018: 831 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=550389, ups=1.11, wpb=494816, bsz=17003.6, num_updates=29500, lr=0.00036823, gnorm=0.177, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=27284 epoch 018: 831 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=550389, ups=1.11, wpb=494816, bsz=17003.6, num_updates=29500, lr=0.00036823, gnorm=0.177, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=27284 epoch 018: 831 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=550389, ups=1.11, wpb=494816, bsz=17003.6, num_updates=29500, lr=0.00036823, gnorm=0.177, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=27284 epoch 018: 831 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=550389, ups=1.11, wpb=494816, bsz=17003.6, num_updates=29500, lr=0.00036823, gnorm=0.177, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=27284 epoch 018: 831 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=550389, ups=1.11, wpb=494816, bsz=17003.6, num_updates=29500, lr=0.00036823, gnorm=0.177, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=27284 epoch 018: 831 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=550389, ups=1.11, wpb=494816, bsz=17003.6, num_updates=29500, lr=0.00036823, gnorm=0.177, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=27284 epoch 018: 831 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=550389, ups=1.11, wpb=494816, bsz=17003.6, num_updates=29500, lr=0.00036823, gnorm=0.177, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=27284 epoch 018: 831 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=550389, ups=1.11, wpb=494816, bsz=17003.6, num_updates=29500, lr=0.00036823, gnorm=0.177, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=27284 epoch 018: 831 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=550389, ups=1.11, wpb=494816, bsz=17003.6, num_updates=29500, lr=0.00036823, gnorm=0.177, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=27284 epoch 018: 831 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=550389, ups=1.11, wpb=494816, bsz=17003.6, num_updates=29500, lr=0.00036823, gnorm=0.177, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=27284 epoch 018: 931 / 1689 loss=3.734, nll_loss=2.203, ppl=4.61, wps=552098, ups=1.11, wpb=495782, bsz=16322.7, num_updates=29600, lr=0.000367607, gnorm=0.177, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=27373 epoch 018: 931 / 1689 loss=3.734, nll_loss=2.203, ppl=4.61, wps=552098, ups=1.11, wpb=495782, bsz=16322.7, num_updates=29600, lr=0.000367607, gnorm=0.177, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=27373 epoch 018: 931 / 1689 loss=3.734, nll_loss=2.203, ppl=4.61, wps=552098, ups=1.11, wpb=495782, bsz=16322.7, num_updates=29600, lr=0.000367607, gnorm=0.177, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=27373 epoch 018: 931 / 1689 loss=3.734, nll_loss=2.203, ppl=4.61, wps=552098, ups=1.11, wpb=495782, bsz=16322.7, num_updates=29600, lr=0.000367607, gnorm=0.177, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=27373 epoch 018: 931 / 1689 loss=3.734, nll_loss=2.203, ppl=4.61, wps=552098, ups=1.11, wpb=495782, bsz=16322.7, num_updates=29600, lr=0.000367607, gnorm=0.177, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=27373 epoch 018: 931 / 1689 loss=3.734, nll_loss=2.203, ppl=4.61, wps=552098, ups=1.11, wpb=495782, bsz=16322.7, num_updates=29600, lr=0.000367607, gnorm=0.177, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=27373 epoch 018: 931 / 1689 loss=3.734, nll_loss=2.203, ppl=4.61, wps=552098, ups=1.11, wpb=495782, bsz=16322.7, num_updates=29600, lr=0.000367607, gnorm=0.177, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=27373 epoch 018: 931 / 1689 loss=3.734, nll_loss=2.203, ppl=4.61, wps=552098, ups=1.11, wpb=495782, bsz=16322.7, num_updates=29600, lr=0.000367607, gnorm=0.177, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=27373 epoch 018: 931 / 1689 loss=3.734, nll_loss=2.203, ppl=4.61, wps=552098, ups=1.11, wpb=495782, bsz=16322.7, num_updates=29600, lr=0.000367607, gnorm=0.177, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=27373 epoch 018: 931 / 1689 loss=3.734, nll_loss=2.203, ppl=4.61, wps=552098, ups=1.11, wpb=495782, bsz=16322.7, num_updates=29600, lr=0.000367607, gnorm=0.177, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=27373 epoch 018: 931 / 1689 loss=3.734, nll_loss=2.203, ppl=4.61, wps=552098, ups=1.11, wpb=495782, bsz=16322.7, num_updates=29600, lr=0.000367607, gnorm=0.177, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=27373 epoch 018: 931 / 1689 loss=3.734, nll_loss=2.203, ppl=4.61, wps=552098, ups=1.11, wpb=495782, bsz=16322.7, num_updates=29600, lr=0.000367607, gnorm=0.177, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=27373 epoch 018: 931 / 1689 loss=3.734, nll_loss=2.203, ppl=4.61, wps=552098, ups=1.11, wpb=495782, bsz=16322.7, num_updates=29600, lr=0.000367607, gnorm=0.177, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=27373 epoch 018: 931 / 1689 loss=3.734, nll_loss=2.203, ppl=4.61, wps=552098, ups=1.11, wpb=495782, bsz=16322.7, num_updates=29600, lr=0.000367607, gnorm=0.177, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=27373 epoch 018: 931 / 1689 loss=3.734, nll_loss=2.203, ppl=4.61, wps=552098, ups=1.11, wpb=495782, bsz=16322.7, num_updates=29600, lr=0.000367607, gnorm=0.177, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=27373 epoch 018: 931 / 1689 loss=3.734, nll_loss=2.203, ppl=4.61, wps=552098, ups=1.11, wpb=495782, bsz=16322.7, num_updates=29600, lr=0.000367607, gnorm=0.177, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=27373 epoch 018: 931 / 1689 loss=3.734, nll_loss=2.203, ppl=4.61, wps=552098, ups=1.11, wpb=495782, bsz=16322.7, num_updates=29600, lr=0.000367607, gnorm=0.177, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=27373 epoch 018: 931 / 1689 loss=3.734, nll_loss=2.203, ppl=4.61, wps=552098, ups=1.11, wpb=495782, bsz=16322.7, num_updates=29600, lr=0.000367607, gnorm=0.177, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=27373 epoch 018: 1031 / 1689 loss=3.735, nll_loss=2.203, ppl=4.61, wps=556208, ups=1.12, wpb=494726, bsz=16369.9, num_updates=29700, lr=0.000366988, gnorm=0.178, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=27462 epoch 018: 1031 / 1689 loss=3.735, nll_loss=2.203, ppl=4.61, wps=556208, ups=1.12, wpb=494726, bsz=16369.9, num_updates=29700, lr=0.000366988, gnorm=0.178, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=27462 epoch 018: 1031 / 1689 loss=3.735, nll_loss=2.203, ppl=4.61, wps=556208, ups=1.12, wpb=494726, bsz=16369.9, num_updates=29700, lr=0.000366988, gnorm=0.178, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=27462 epoch 018: 1031 / 1689 loss=3.735, nll_loss=2.203, ppl=4.61, wps=556208, ups=1.12, wpb=494726, bsz=16369.9, num_updates=29700, lr=0.000366988, gnorm=0.178, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=27462 epoch 018: 1031 / 1689 loss=3.735, nll_loss=2.203, ppl=4.61, wps=556208, ups=1.12, wpb=494726, bsz=16369.9, num_updates=29700, lr=0.000366988, gnorm=0.178, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=27462 epoch 018: 1031 / 1689 loss=3.735, nll_loss=2.203, ppl=4.61, wps=556208, ups=1.12, wpb=494726, bsz=16369.9, num_updates=29700, lr=0.000366988, gnorm=0.178, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=27462 epoch 018: 1031 / 1689 loss=3.735, nll_loss=2.203, ppl=4.61, wps=556208, ups=1.12, wpb=494726, bsz=16369.9, num_updates=29700, lr=0.000366988, gnorm=0.178, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=27462 epoch 018: 1031 / 1689 loss=3.735, nll_loss=2.203, ppl=4.61, wps=556208, ups=1.12, wpb=494726, bsz=16369.9, num_updates=29700, lr=0.000366988, gnorm=0.178, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=27462 epoch 018: 1031 / 1689 loss=3.735, nll_loss=2.203, ppl=4.61, wps=556208, ups=1.12, wpb=494726, bsz=16369.9, num_updates=29700, lr=0.000366988, gnorm=0.178, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=27462 epoch 018: 1031 / 1689 loss=3.735, nll_loss=2.203, ppl=4.61, wps=556208, ups=1.12, wpb=494726, bsz=16369.9, num_updates=29700, lr=0.000366988, gnorm=0.178, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=27462 epoch 018: 1031 / 1689 loss=3.735, nll_loss=2.203, ppl=4.61, wps=556208, ups=1.12, wpb=494726, bsz=16369.9, num_updates=29700, lr=0.000366988, gnorm=0.178, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=27462 epoch 018: 1031 / 1689 loss=3.735, nll_loss=2.203, ppl=4.61, wps=556208, ups=1.12, wpb=494726, bsz=16369.9, num_updates=29700, lr=0.000366988, gnorm=0.178, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=27462 epoch 018: 1031 / 1689 loss=3.735, nll_loss=2.203, ppl=4.61, wps=556208, ups=1.12, wpb=494726, bsz=16369.9, num_updates=29700, lr=0.000366988, gnorm=0.178, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=27462 epoch 018: 1031 / 1689 loss=3.735, nll_loss=2.203, ppl=4.61, wps=556208, ups=1.12, wpb=494726, bsz=16369.9, num_updates=29700, lr=0.000366988, gnorm=0.178, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=27462 epoch 018: 1031 / 1689 loss=3.735, nll_loss=2.203, ppl=4.61, wps=556208, ups=1.12, wpb=494726, bsz=16369.9, num_updates=29700, lr=0.000366988, gnorm=0.178, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=27462 epoch 018: 1031 / 1689 loss=3.735, nll_loss=2.203, ppl=4.61, wps=556208, ups=1.12, wpb=494726, bsz=16369.9, num_updates=29700, lr=0.000366988, gnorm=0.178, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=27462 epoch 018: 1031 / 1689 loss=3.735, nll_loss=2.203, ppl=4.61, wps=556208, ups=1.12, wpb=494726, bsz=16369.9, num_updates=29700, lr=0.000366988, gnorm=0.178, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=27462 epoch 018: 1031 / 1689 loss=3.735, nll_loss=2.203, ppl=4.61, wps=556208, ups=1.12, wpb=494726, bsz=16369.9, num_updates=29700, lr=0.000366988, gnorm=0.178, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=27462 epoch 018: 1131 / 1689 loss=3.733, nll_loss=2.202, ppl=4.6, wps=557045, ups=1.12, wpb=495639, bsz=16614.8, num_updates=29800, lr=0.000366372, gnorm=0.183, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=27551 epoch 018: 1131 / 1689 loss=3.733, nll_loss=2.202, ppl=4.6, wps=557045, ups=1.12, wpb=495639, bsz=16614.8, num_updates=29800, lr=0.000366372, gnorm=0.183, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=27551 epoch 018: 1131 / 1689 loss=3.733, nll_loss=2.202, ppl=4.6, wps=557045, ups=1.12, wpb=495639, bsz=16614.8, num_updates=29800, lr=0.000366372, gnorm=0.183, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=27551 epoch 018: 1131 / 1689 loss=3.733, nll_loss=2.202, ppl=4.6, wps=557045, ups=1.12, wpb=495639, bsz=16614.8, num_updates=29800, lr=0.000366372, gnorm=0.183, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=27551 epoch 018: 1131 / 1689 loss=3.733, nll_loss=2.202, ppl=4.6, wps=557045, ups=1.12, wpb=495639, bsz=16614.8, num_updates=29800, lr=0.000366372, gnorm=0.183, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=27551 epoch 018: 1131 / 1689 loss=3.733, nll_loss=2.202, ppl=4.6, wps=557045, ups=1.12, wpb=495639, bsz=16614.8, num_updates=29800, lr=0.000366372, gnorm=0.183, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=27551 epoch 018: 1131 / 1689 loss=3.733, nll_loss=2.202, ppl=4.6, wps=557045, ups=1.12, wpb=495639, bsz=16614.8, num_updates=29800, lr=0.000366372, gnorm=0.183, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=27551 epoch 018: 1131 / 1689 loss=3.733, nll_loss=2.202, ppl=4.6, wps=557045, ups=1.12, wpb=495639, bsz=16614.8, num_updates=29800, lr=0.000366372, gnorm=0.183, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=27551 epoch 018: 1131 / 1689 loss=3.733, nll_loss=2.202, ppl=4.6, wps=557045, ups=1.12, wpb=495639, bsz=16614.8, num_updates=29800, lr=0.000366372, gnorm=0.183, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=27551 epoch 018: 1131 / 1689 loss=3.733, nll_loss=2.202, ppl=4.6, wps=557045, ups=1.12, wpb=495639, bsz=16614.8, num_updates=29800, lr=0.000366372, gnorm=0.183, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=27551 epoch 018: 1131 / 1689 loss=3.733, nll_loss=2.202, ppl=4.6, wps=557045, ups=1.12, wpb=495639, bsz=16614.8, num_updates=29800, lr=0.000366372, gnorm=0.183, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=27551 epoch 018: 1131 / 1689 loss=3.733, nll_loss=2.202, ppl=4.6, wps=557045, ups=1.12, wpb=495639, bsz=16614.8, num_updates=29800, lr=0.000366372, gnorm=0.183, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=27551 epoch 018: 1131 / 1689 loss=3.733, nll_loss=2.202, ppl=4.6, wps=557045, ups=1.12, wpb=495639, bsz=16614.8, num_updates=29800, lr=0.000366372, gnorm=0.183, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=27551 epoch 018: 1131 / 1689 loss=3.733, nll_loss=2.202, ppl=4.6, wps=557045, ups=1.12, wpb=495639, bsz=16614.8, num_updates=29800, lr=0.000366372, gnorm=0.183, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=27551 epoch 018: 1131 / 1689 loss=3.733, nll_loss=2.202, ppl=4.6, wps=557045, ups=1.12, wpb=495639, bsz=16614.8, num_updates=29800, lr=0.000366372, gnorm=0.183, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=27551 epoch 018: 1131 / 1689 loss=3.733, nll_loss=2.202, ppl=4.6, wps=557045, ups=1.12, wpb=495639, bsz=16614.8, num_updates=29800, lr=0.000366372, gnorm=0.183, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=27551 epoch 018: 1131 / 1689 loss=3.733, nll_loss=2.202, ppl=4.6, wps=557045, ups=1.12, wpb=495639, bsz=16614.8, num_updates=29800, lr=0.000366372, gnorm=0.183, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=27551 epoch 018: 1131 / 1689 loss=3.733, nll_loss=2.202, ppl=4.6, wps=557045, ups=1.12, wpb=495639, bsz=16614.8, num_updates=29800, lr=0.000366372, gnorm=0.183, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=27551 epoch 018: 1231 / 1689 loss=3.731, nll_loss=2.199, ppl=4.59, wps=552438, ups=1.11, wpb=495534, bsz=16670.7, num_updates=29900, lr=0.000365758, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=27641 epoch 018: 1231 / 1689 loss=3.731, nll_loss=2.199, ppl=4.59, wps=552438, ups=1.11, wpb=495534, bsz=16670.7, num_updates=29900, lr=0.000365758, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=27641 epoch 018: 1231 / 1689 loss=3.731, nll_loss=2.199, ppl=4.59, wps=552438, ups=1.11, wpb=495534, bsz=16670.7, num_updates=29900, lr=0.000365758, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=27641 epoch 018: 1231 / 1689 loss=3.731, nll_loss=2.199, ppl=4.59, wps=552438, ups=1.11, wpb=495534, bsz=16670.7, num_updates=29900, lr=0.000365758, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=27641 epoch 018: 1231 / 1689 loss=3.731, nll_loss=2.199, ppl=4.59, wps=552438, ups=1.11, wpb=495534, bsz=16670.7, num_updates=29900, lr=0.000365758, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=27641 epoch 018: 1231 / 1689 loss=3.731, nll_loss=2.199, ppl=4.59, wps=552438, ups=1.11, wpb=495534, bsz=16670.7, num_updates=29900, lr=0.000365758, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=27641 epoch 018: 1231 / 1689 loss=3.731, nll_loss=2.199, ppl=4.59, wps=552438, ups=1.11, wpb=495534, bsz=16670.7, num_updates=29900, lr=0.000365758, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=27641 epoch 018: 1231 / 1689 loss=3.731, nll_loss=2.199, ppl=4.59, wps=552438, ups=1.11, wpb=495534, bsz=16670.7, num_updates=29900, lr=0.000365758, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=27641 epoch 018: 1231 / 1689 loss=3.731, nll_loss=2.199, ppl=4.59, wps=552438, ups=1.11, wpb=495534, bsz=16670.7, num_updates=29900, lr=0.000365758, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=27641 epoch 018: 1231 / 1689 loss=3.731, nll_loss=2.199, ppl=4.59, wps=552438, ups=1.11, wpb=495534, bsz=16670.7, num_updates=29900, lr=0.000365758, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=27641 epoch 018: 1231 / 1689 loss=3.731, nll_loss=2.199, ppl=4.59, wps=552438, ups=1.11, wpb=495534, bsz=16670.7, num_updates=29900, lr=0.000365758, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=27641 epoch 018: 1231 / 1689 loss=3.731, nll_loss=2.199, ppl=4.59, wps=552438, ups=1.11, wpb=495534, bsz=16670.7, num_updates=29900, lr=0.000365758, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=27641 epoch 018: 1231 / 1689 loss=3.731, nll_loss=2.199, ppl=4.59, wps=552438, ups=1.11, wpb=495534, bsz=16670.7, num_updates=29900, lr=0.000365758, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=27641 epoch 018: 1231 / 1689 loss=3.731, nll_loss=2.199, ppl=4.59, wps=552438, ups=1.11, wpb=495534, bsz=16670.7, num_updates=29900, lr=0.000365758, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=27641 epoch 018: 1231 / 1689 loss=3.731, nll_loss=2.199, ppl=4.59, wps=552438, ups=1.11, wpb=495534, bsz=16670.7, num_updates=29900, lr=0.000365758, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=27641 epoch 018: 1231 / 1689 loss=3.731, nll_loss=2.199, ppl=4.59, wps=552438, ups=1.11, wpb=495534, bsz=16670.7, num_updates=29900, lr=0.000365758, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=27641 epoch 018: 1231 / 1689 loss=3.731, nll_loss=2.199, ppl=4.59, wps=552438, ups=1.11, wpb=495534, bsz=16670.7, num_updates=29900, lr=0.000365758, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=27641 epoch 018: 1231 / 1689 loss=3.731, nll_loss=2.199, ppl=4.59, wps=552438, ups=1.11, wpb=495534, bsz=16670.7, num_updates=29900, lr=0.000365758, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=27641 epoch 018: 1333 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=542799, ups=1.1, wpb=495564, bsz=16315.1, num_updates=30000, lr=0.000365148, gnorm=0.179, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=27732 epoch 018: 1333 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=542799, ups=1.1, wpb=495564, bsz=16315.1, num_updates=30000, lr=0.000365148, gnorm=0.179, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=27732 epoch 018: 1333 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=542799, ups=1.1, wpb=495564, bsz=16315.1, num_updates=30000, lr=0.000365148, gnorm=0.179, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=27732 epoch 018: 1333 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=542799, ups=1.1, wpb=495564, bsz=16315.1, num_updates=30000, lr=0.000365148, gnorm=0.179, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=27732 epoch 018: 1333 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=542799, ups=1.1, wpb=495564, bsz=16315.1, num_updates=30000, lr=0.000365148, gnorm=0.179, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=27732 epoch 018: 1333 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=542799, ups=1.1, wpb=495564, bsz=16315.1, num_updates=30000, lr=0.000365148, gnorm=0.179, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=27732 epoch 018: 1333 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=542799, ups=1.1, wpb=495564, bsz=16315.1, num_updates=30000, lr=0.000365148, gnorm=0.179, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=27732 epoch 018: 1333 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=542799, ups=1.1, wpb=495564, bsz=16315.1, num_updates=30000, lr=0.000365148, gnorm=0.179, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=27732 epoch 018: 1333 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=542799, ups=1.1, wpb=495564, bsz=16315.1, num_updates=30000, lr=0.000365148, gnorm=0.179, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=27732 epoch 018: 1333 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=542799, ups=1.1, wpb=495564, bsz=16315.1, num_updates=30000, lr=0.000365148, gnorm=0.179, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=27732 epoch 018: 1333 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=542799, ups=1.1, wpb=495564, bsz=16315.1, num_updates=30000, lr=0.000365148, gnorm=0.179, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=27732 epoch 018: 1333 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=542799, ups=1.1, wpb=495564, bsz=16315.1, num_updates=30000, lr=0.000365148, gnorm=0.179, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=27732 epoch 018: 1333 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=542799, ups=1.1, wpb=495564, bsz=16315.1, num_updates=30000, lr=0.000365148, gnorm=0.179, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=27732 epoch 018: 1333 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=542799, ups=1.1, wpb=495564, bsz=16315.1, num_updates=30000, lr=0.000365148, gnorm=0.179, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=27732 epoch 018: 1333 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=542799, ups=1.1, wpb=495564, bsz=16315.1, num_updates=30000, lr=0.000365148, gnorm=0.179, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=27732 epoch 018: 1333 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=542799, ups=1.1, wpb=495564, bsz=16315.1, num_updates=30000, lr=0.000365148, gnorm=0.179, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=27732 epoch 018: 1333 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=542799, ups=1.1, wpb=495564, bsz=16315.1, num_updates=30000, lr=0.000365148, gnorm=0.179, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=27732 epoch 018: 1333 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=542799, ups=1.1, wpb=495564, bsz=16315.1, num_updates=30000, lr=0.000365148, gnorm=0.179, clip=0, loss_scale=2, train_wall=90, gb_free=21.5, wall=27732 begin validation on "valid" subset epoch 018 | valid on 'valid' subset | loss 3.73 | nll_loss 2.166 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.73 epoch 018 | valid on 'valid' subset | loss 3.73 | nll_loss 2.166 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.73 epoch 018 | valid on 'valid' subset | loss 3.73 | nll_loss 2.166 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.73 epoch 018 | valid on 'valid' subset | loss 3.73 | nll_loss 2.166 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.73 epoch 018 | valid on 'valid' subset | loss 3.73 | nll_loss 2.166 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.73 epoch 018 | valid on 'valid' subset | loss 3.73 | nll_loss 2.166 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.73 epoch 018 | valid on 'valid' subset | loss 3.73 | nll_loss 2.166 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.73 epoch 018 | valid on 'valid' subset | loss 3.73 | nll_loss 2.166 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.73 epoch 018 | valid on 'valid' subset | loss 3.73 | nll_loss 2.166 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.73 epoch 018 | valid on 'valid' subset | loss 3.73 | nll_loss 2.166 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.73 epoch 018 | valid on 'valid' subset | loss 3.73 | nll_loss 2.166 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.73 epoch 018 | valid on 'valid' subset | loss 3.73 | nll_loss 2.166 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.73 epoch 018 | valid on 'valid' subset | loss 3.73 | nll_loss 2.166 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.73 epoch 018 | valid on 'valid' subset | loss 3.73 | nll_loss 2.166 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.73 epoch 018 | valid on 'valid' subset | loss 3.73 | nll_loss 2.166 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.73 epoch 018 | valid on 'valid' subset | loss 3.73 | nll_loss 2.166 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.73 epoch 018 | valid on 'valid' subset | loss 3.73 | nll_loss 2.166 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.73 epoch 018 | valid on 'valid' subset | loss 3.73 | nll_loss 2.166 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 30000 | best_loss 3.73 epoch 018: 1433 / 1689 loss=3.74, nll_loss=2.209, ppl=4.62, wps=387387, ups=0.78, wpb=494782, bsz=16377.5, num_updates=30100, lr=0.000364541, gnorm=0.187, clip=0, loss_scale=2, train_wall=103, gb_free=21.3, wall=27860 epoch 018: 1433 / 1689 loss=3.74, nll_loss=2.209, ppl=4.62, wps=387387, ups=0.78, wpb=494782, bsz=16377.5, num_updates=30100, lr=0.000364541, gnorm=0.187, clip=0, loss_scale=2, train_wall=103, gb_free=21.3, wall=27860 epoch 018: 1433 / 1689 loss=3.74, nll_loss=2.209, ppl=4.62, wps=387387, ups=0.78, wpb=494782, bsz=16377.5, num_updates=30100, lr=0.000364541, gnorm=0.187, clip=0, loss_scale=2, train_wall=103, gb_free=21.3, wall=27860 epoch 018: 1433 / 1689 loss=3.74, nll_loss=2.209, ppl=4.62, wps=387387, ups=0.78, wpb=494782, bsz=16377.5, num_updates=30100, lr=0.000364541, gnorm=0.187, clip=0, loss_scale=2, train_wall=103, gb_free=21.3, wall=27860 epoch 018: 1433 / 1689 loss=3.74, nll_loss=2.209, ppl=4.62, wps=387387, ups=0.78, wpb=494782, bsz=16377.5, num_updates=30100, lr=0.000364541, gnorm=0.187, clip=0, loss_scale=2, train_wall=103, gb_free=21.3, wall=27860 epoch 018: 1433 / 1689 loss=3.74, nll_loss=2.209, ppl=4.62, wps=387387, ups=0.78, wpb=494782, bsz=16377.5, num_updates=30100, lr=0.000364541, gnorm=0.187, clip=0, loss_scale=2, train_wall=103, gb_free=21.3, wall=27860 epoch 018: 1433 / 1689 loss=3.74, nll_loss=2.209, ppl=4.62, wps=387387, ups=0.78, wpb=494782, bsz=16377.5, num_updates=30100, lr=0.000364541, gnorm=0.187, clip=0, loss_scale=2, train_wall=103, gb_free=21.3, wall=27860 epoch 018: 1433 / 1689 loss=3.74, nll_loss=2.209, ppl=4.62, wps=387387, ups=0.78, wpb=494782, bsz=16377.5, num_updates=30100, lr=0.000364541, gnorm=0.187, clip=0, loss_scale=2, train_wall=103, gb_free=21.3, wall=27860 epoch 018: 1433 / 1689 loss=3.74, nll_loss=2.209, ppl=4.62, wps=387387, ups=0.78, wpb=494782, bsz=16377.5, num_updates=30100, lr=0.000364541, gnorm=0.187, clip=0, loss_scale=2, train_wall=103, gb_free=21.3, wall=27860 epoch 018: 1433 / 1689 loss=3.74, nll_loss=2.209, ppl=4.62, wps=387387, ups=0.78, wpb=494782, bsz=16377.5, num_updates=30100, lr=0.000364541, gnorm=0.187, clip=0, loss_scale=2, train_wall=103, gb_free=21.3, wall=27860 epoch 018: 1433 / 1689 loss=3.74, nll_loss=2.209, ppl=4.62, wps=387387, ups=0.78, wpb=494782, bsz=16377.5, num_updates=30100, lr=0.000364541, gnorm=0.187, clip=0, loss_scale=2, train_wall=103, gb_free=21.3, wall=27860 epoch 018: 1433 / 1689 loss=3.74, nll_loss=2.209, ppl=4.62, wps=387387, ups=0.78, wpb=494782, bsz=16377.5, num_updates=30100, lr=0.000364541, gnorm=0.187, clip=0, loss_scale=2, train_wall=103, gb_free=21.3, wall=27860 epoch 018: 1433 / 1689 loss=3.74, nll_loss=2.209, ppl=4.62, wps=387387, ups=0.78, wpb=494782, bsz=16377.5, num_updates=30100, lr=0.000364541, gnorm=0.187, clip=0, loss_scale=2, train_wall=103, gb_free=21.3, wall=27860 epoch 018: 1433 / 1689 loss=3.74, nll_loss=2.209, ppl=4.62, wps=387387, ups=0.78, wpb=494782, bsz=16377.5, num_updates=30100, lr=0.000364541, gnorm=0.187, clip=0, loss_scale=2, train_wall=103, gb_free=21.3, wall=27860 epoch 018: 1433 / 1689 loss=3.74, nll_loss=2.209, ppl=4.62, wps=387387, ups=0.78, wpb=494782, bsz=16377.5, num_updates=30100, lr=0.000364541, gnorm=0.187, clip=0, loss_scale=2, train_wall=103, gb_free=21.3, wall=27860 epoch 018: 1433 / 1689 loss=3.74, nll_loss=2.209, ppl=4.62, wps=387387, ups=0.78, wpb=494782, bsz=16377.5, num_updates=30100, lr=0.000364541, gnorm=0.187, clip=0, loss_scale=2, train_wall=103, gb_free=21.3, wall=27860 epoch 018: 1433 / 1689 loss=3.74, nll_loss=2.209, ppl=4.62, wps=387387, ups=0.78, wpb=494782, bsz=16377.5, num_updates=30100, lr=0.000364541, gnorm=0.187, clip=0, loss_scale=2, train_wall=103, gb_free=21.3, wall=27860 epoch 018: 1433 / 1689 loss=3.74, nll_loss=2.209, ppl=4.62, wps=387387, ups=0.78, wpb=494782, bsz=16377.5, num_updates=30100, lr=0.000364541, gnorm=0.187, clip=0, loss_scale=2, train_wall=103, gb_free=21.3, wall=27860 epoch 018: 1533 / 1689 loss=3.738, nll_loss=2.208, ppl=4.62, wps=556077, ups=1.12, wpb=495189, bsz=16338, num_updates=30200, lr=0.000363937, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=27949 epoch 018: 1533 / 1689 loss=3.738, nll_loss=2.208, ppl=4.62, wps=556077, ups=1.12, wpb=495189, bsz=16338, num_updates=30200, lr=0.000363937, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=27949 epoch 018: 1533 / 1689 loss=3.738, nll_loss=2.208, ppl=4.62, wps=556077, ups=1.12, wpb=495189, bsz=16338, num_updates=30200, lr=0.000363937, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=27949 epoch 018: 1533 / 1689 loss=3.738, nll_loss=2.208, ppl=4.62, wps=556077, ups=1.12, wpb=495189, bsz=16338, num_updates=30200, lr=0.000363937, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=27949 epoch 018: 1533 / 1689 loss=3.738, nll_loss=2.208, ppl=4.62, wps=556077, ups=1.12, wpb=495189, bsz=16338, num_updates=30200, lr=0.000363937, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=27949 epoch 018: 1533 / 1689 loss=3.738, nll_loss=2.208, ppl=4.62, wps=556077, ups=1.12, wpb=495189, bsz=16338, num_updates=30200, lr=0.000363937, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=27949 epoch 018: 1533 / 1689 loss=3.738, nll_loss=2.208, ppl=4.62, wps=556077, ups=1.12, wpb=495189, bsz=16338, num_updates=30200, lr=0.000363937, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=27949 epoch 018: 1533 / 1689 loss=3.738, nll_loss=2.208, ppl=4.62, wps=556077, ups=1.12, wpb=495189, bsz=16338, num_updates=30200, lr=0.000363937, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=27949 epoch 018: 1533 / 1689 loss=3.738, nll_loss=2.208, ppl=4.62, wps=556077, ups=1.12, wpb=495189, bsz=16338, num_updates=30200, lr=0.000363937, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=27949 epoch 018: 1533 / 1689 loss=3.738, nll_loss=2.208, ppl=4.62, wps=556077, ups=1.12, wpb=495189, bsz=16338, num_updates=30200, lr=0.000363937, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=27949 epoch 018: 1533 / 1689 loss=3.738, nll_loss=2.208, ppl=4.62, wps=556077, ups=1.12, wpb=495189, bsz=16338, num_updates=30200, lr=0.000363937, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=27949 epoch 018: 1533 / 1689 loss=3.738, nll_loss=2.208, ppl=4.62, wps=556077, ups=1.12, wpb=495189, bsz=16338, num_updates=30200, lr=0.000363937, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=27949 epoch 018: 1533 / 1689 loss=3.738, nll_loss=2.208, ppl=4.62, wps=556077, ups=1.12, wpb=495189, bsz=16338, num_updates=30200, lr=0.000363937, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=27949 epoch 018: 1533 / 1689 loss=3.738, nll_loss=2.208, ppl=4.62, wps=556077, ups=1.12, wpb=495189, bsz=16338, num_updates=30200, lr=0.000363937, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=27949 epoch 018: 1533 / 1689 loss=3.738, nll_loss=2.208, ppl=4.62, wps=556077, ups=1.12, wpb=495189, bsz=16338, num_updates=30200, lr=0.000363937, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=27949 epoch 018: 1533 / 1689 loss=3.738, nll_loss=2.208, ppl=4.62, wps=556077, ups=1.12, wpb=495189, bsz=16338, num_updates=30200, lr=0.000363937, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=27949 epoch 018: 1533 / 1689 loss=3.738, nll_loss=2.208, ppl=4.62, wps=556077, ups=1.12, wpb=495189, bsz=16338, num_updates=30200, lr=0.000363937, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=27949 epoch 018: 1533 / 1689 loss=3.738, nll_loss=2.208, ppl=4.62, wps=556077, ups=1.12, wpb=495189, bsz=16338, num_updates=30200, lr=0.000363937, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=27949 epoch 018: 1633 / 1689 loss=3.748, nll_loss=2.219, ppl=4.66, wps=548308, ups=1.11, wpb=492283, bsz=16791.2, num_updates=30300, lr=0.000363336, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=28039 epoch 018: 1633 / 1689 loss=3.748, nll_loss=2.219, ppl=4.66, wps=548308, ups=1.11, wpb=492283, bsz=16791.2, num_updates=30300, lr=0.000363336, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=28039 epoch 018: 1633 / 1689 loss=3.748, nll_loss=2.219, ppl=4.66, wps=548308, ups=1.11, wpb=492283, bsz=16791.2, num_updates=30300, lr=0.000363336, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=28039 epoch 018: 1633 / 1689 loss=3.748, nll_loss=2.219, ppl=4.66, wps=548308, ups=1.11, wpb=492283, bsz=16791.2, num_updates=30300, lr=0.000363336, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=28039 epoch 018: 1633 / 1689 loss=3.748, nll_loss=2.219, ppl=4.66, wps=548308, ups=1.11, wpb=492283, bsz=16791.2, num_updates=30300, lr=0.000363336, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=28039 epoch 018: 1633 / 1689 loss=3.748, nll_loss=2.219, ppl=4.66, wps=548308, ups=1.11, wpb=492283, bsz=16791.2, num_updates=30300, lr=0.000363336, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=28039 epoch 018: 1633 / 1689 loss=3.748, nll_loss=2.219, ppl=4.66, wps=548308, ups=1.11, wpb=492283, bsz=16791.2, num_updates=30300, lr=0.000363336, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=28039 epoch 018: 1633 / 1689 loss=3.748, nll_loss=2.219, ppl=4.66, wps=548308, ups=1.11, wpb=492283, bsz=16791.2, num_updates=30300, lr=0.000363336, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=28039 epoch 018: 1633 / 1689 loss=3.748, nll_loss=2.219, ppl=4.66, wps=548308, ups=1.11, wpb=492283, bsz=16791.2, num_updates=30300, lr=0.000363336, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=28039 epoch 018: 1633 / 1689 loss=3.748, nll_loss=2.219, ppl=4.66, wps=548308, ups=1.11, wpb=492283, bsz=16791.2, num_updates=30300, lr=0.000363336, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=28039 epoch 018: 1633 / 1689 loss=3.748, nll_loss=2.219, ppl=4.66, wps=548308, ups=1.11, wpb=492283, bsz=16791.2, num_updates=30300, lr=0.000363336, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=28039 epoch 018: 1633 / 1689 loss=3.748, nll_loss=2.219, ppl=4.66, wps=548308, ups=1.11, wpb=492283, bsz=16791.2, num_updates=30300, lr=0.000363336, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=28039 epoch 018: 1633 / 1689 loss=3.748, nll_loss=2.219, ppl=4.66, wps=548308, ups=1.11, wpb=492283, bsz=16791.2, num_updates=30300, lr=0.000363336, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=28039 epoch 018: 1633 / 1689 loss=3.748, nll_loss=2.219, ppl=4.66, wps=548308, ups=1.11, wpb=492283, bsz=16791.2, num_updates=30300, lr=0.000363336, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=28039 epoch 018: 1633 / 1689 loss=3.748, nll_loss=2.219, ppl=4.66, wps=548308, ups=1.11, wpb=492283, bsz=16791.2, num_updates=30300, lr=0.000363336, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=28039 epoch 018: 1633 / 1689 loss=3.748, nll_loss=2.219, ppl=4.66, wps=548308, ups=1.11, wpb=492283, bsz=16791.2, num_updates=30300, lr=0.000363336, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=28039 epoch 018: 1633 / 1689 loss=3.748, nll_loss=2.219, ppl=4.66, wps=548308, ups=1.11, wpb=492283, bsz=16791.2, num_updates=30300, lr=0.000363336, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=28039 epoch 018: 1633 / 1689 loss=3.748, nll_loss=2.219, ppl=4.66, wps=548308, ups=1.11, wpb=492283, bsz=16791.2, num_updates=30300, lr=0.000363336, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=28039 end of epoch 18 (average epoch stats below) epoch 018 | loss 3.733 | nll_loss 2.202 | ppl 4.6 | wps 533374 | ups 1.08 | wpb 495132 | bsz 16509.4 | num_updates 30356 | lr 0.000363001 | gnorm 0.182 | clip 0 | loss_scale 2 | train_wall 1505 | gb_free 22.6 | wall 28088 epoch 018 | loss 3.733 | nll_loss 2.202 | ppl 4.6 | wps 533374 | ups 1.08 | wpb 495132 | bsz 16509.4 | num_updates 30356 | lr 0.000363001 | gnorm 0.182 | clip 0 | loss_scale 2 | train_wall 1505 | gb_free 22.6 | wall 28088 epoch 018 | loss 3.733 | nll_loss 2.202 | ppl 4.6 | wps 533374 | ups 1.08 | wpb 495132 | bsz 16509.4 | num_updates 30356 | lr 0.000363001 | gnorm 0.182 | clip 0 | loss_scale 2 | train_wall 1505 | gb_free 22.6 | wall 28088 epoch 018 | loss 3.733 | nll_loss 2.202 | ppl 4.6 | wps 533374 | ups 1.08 | wpb 495132 | bsz 16509.4 | num_updates 30356 | lr 0.000363001 | gnorm 0.182 | clip 0 | loss_scale 2 | train_wall 1505 | gb_free 22.6 | wall 28088 epoch 018 | loss 3.733 | nll_loss 2.202 | ppl 4.6 | wps 533374 | ups 1.08 | wpb 495132 | bsz 16509.4 | num_updates 30356 | lr 0.000363001 | gnorm 0.182 | clip 0 | loss_scale 2 | train_wall 1505 | gb_free 22.6 | wall 28088 epoch 018 | loss 3.733 | nll_loss 2.202 | ppl 4.6 | wps 533374 | ups 1.08 | wpb 495132 | bsz 16509.4 | num_updates 30356 | lr 0.000363001 | gnorm 0.182 | clip 0 | loss_scale 2 | train_wall 1505 | gb_free 22.6 | wall 28088 epoch 018 | loss 3.733 | nll_loss 2.202 | ppl 4.6 | wps 533374 | ups 1.08 | wpb 495132 | bsz 16509.4 | num_updates 30356 | lr 0.000363001 | gnorm 0.182 | clip 0 | loss_scale 2 | train_wall 1505 | gb_free 22.6 | wall 28088 epoch 018 | loss 3.733 | nll_loss 2.202 | ppl 4.6 | wps 533374 | ups 1.08 | wpb 495132 | bsz 16509.4 | num_updates 30356 | lr 0.000363001 | gnorm 0.182 | clip 0 | loss_scale 2 | train_wall 1505 | gb_free 22.6 | wall 28088 epoch 018 | loss 3.733 | nll_loss 2.202 | ppl 4.6 | wps 533374 | ups 1.08 | wpb 495132 | bsz 16509.4 | num_updates 30356 | lr 0.000363001 | gnorm 0.182 | clip 0 | loss_scale 2 | train_wall 1505 | gb_free 22.6 | wall 28088 epoch 018 | loss 3.733 | nll_loss 2.202 | ppl 4.6 | wps 533374 | ups 1.08 | wpb 495132 | bsz 16509.4 | num_updates 30356 | lr 0.000363001 | gnorm 0.182 | clip 0 | loss_scale 2 | train_wall 1505 | gb_free 22.6 | wall 28088 epoch 018 | loss 3.733 | nll_loss 2.202 | ppl 4.6 | wps 533374 | ups 1.08 | wpb 495132 | bsz 16509.4 | num_updates 30356 | lr 0.000363001 | gnorm 0.182 | clip 0 | loss_scale 2 | train_wall 1505 | gb_free 22.6 | wall 28088 epoch 018 | loss 3.733 | nll_loss 2.202 | ppl 4.6 | wps 533374 | ups 1.08 | wpb 495132 | bsz 16509.4 | num_updates 30356 | lr 0.000363001 | gnorm 0.182 | clip 0 | loss_scale 2 | train_wall 1505 | gb_free 22.6 | wall 28088 epoch 018 | loss 3.733 | nll_loss 2.202 | ppl 4.6 | wps 533374 | ups 1.08 | wpb 495132 | bsz 16509.4 | num_updates 30356 | lr 0.000363001 | gnorm 0.182 | clip 0 | loss_scale 2 | train_wall 1505 | gb_free 22.6 | wall 28088 epoch 018 | loss 3.733 | nll_loss 2.202 | ppl 4.6 | wps 533374 | ups 1.08 | wpb 495132 | bsz 16509.4 | num_updates 30356 | lr 0.000363001 | gnorm 0.182 | clip 0 | loss_scale 2 | train_wall 1505 | gb_free 22.6 | wall 28088 epoch 018 | loss 3.733 | nll_loss 2.202 | ppl 4.6 | wps 533374 | ups 1.08 | wpb 495132 | bsz 16509.4 | num_updates 30356 | lr 0.000363001 | gnorm 0.182 | clip 0 | loss_scale 2 | train_wall 1505 | gb_free 22.6 | wall 28088 epoch 018 | loss 3.733 | nll_loss 2.202 | ppl 4.6 | wps 533374 | ups 1.08 | wpb 495132 | bsz 16509.4 | num_updates 30356 | lr 0.000363001 | gnorm 0.182 | clip 0 | loss_scale 2 | train_wall 1505 | gb_free 22.6 | wall 28088 epoch 018 | loss 3.733 | nll_loss 2.202 | ppl 4.6 | wps 533374 | ups 1.08 | wpb 495132 | bsz 16509.4 | num_updates 30356 | lr 0.000363001 | gnorm 0.182 | clip 0 | loss_scale 2 | train_wall 1505 | gb_free 22.6 | wall 28088 epoch 018 | loss 3.733 | nll_loss 2.202 | ppl 4.6 | wps 533374 | ups 1.08 | wpb 495132 | bsz 16509.4 | num_updates 30356 | lr 0.000363001 | gnorm 0.182 | clip 0 | loss_scale 2 | train_wall 1505 | gb_free 22.6 | wall 28088 Start iterating over samples epoch 019: 44 / 1689 loss=3.731, nll_loss=2.2, ppl=4.59, wps=547381, ups=1.11, wpb=491706, bsz=16593.7, num_updates=30400, lr=0.000362738, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=28129 epoch 019: 44 / 1689 loss=3.731, nll_loss=2.2, ppl=4.59, wps=547381, ups=1.11, wpb=491706, bsz=16593.7, num_updates=30400, lr=0.000362738, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=28129 epoch 019: 44 / 1689 loss=3.731, nll_loss=2.2, ppl=4.59, wps=547381, ups=1.11, wpb=491706, bsz=16593.7, num_updates=30400, lr=0.000362738, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=28129 epoch 019: 44 / 1689 loss=3.731, nll_loss=2.2, ppl=4.59, wps=547381, ups=1.11, wpb=491706, bsz=16593.7, num_updates=30400, lr=0.000362738, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=28129 epoch 019: 44 / 1689 loss=3.731, nll_loss=2.2, ppl=4.59, wps=547381, ups=1.11, wpb=491706, bsz=16593.7, num_updates=30400, lr=0.000362738, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=28129 epoch 019: 44 / 1689 loss=3.731, nll_loss=2.2, ppl=4.59, wps=547381, ups=1.11, wpb=491706, bsz=16593.7, num_updates=30400, lr=0.000362738, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=28129 epoch 019: 44 / 1689 loss=3.731, nll_loss=2.2, ppl=4.59, wps=547381, ups=1.11, wpb=491706, bsz=16593.7, num_updates=30400, lr=0.000362738, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=28129 epoch 019: 44 / 1689 loss=3.731, nll_loss=2.2, ppl=4.59, wps=547381, ups=1.11, wpb=491706, bsz=16593.7, num_updates=30400, lr=0.000362738, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=28129 epoch 019: 44 / 1689 loss=3.731, nll_loss=2.2, ppl=4.59, wps=547381, ups=1.11, wpb=491706, bsz=16593.7, num_updates=30400, lr=0.000362738, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=28129 epoch 019: 44 / 1689 loss=3.731, nll_loss=2.2, ppl=4.59, wps=547381, ups=1.11, wpb=491706, bsz=16593.7, num_updates=30400, lr=0.000362738, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=28129 epoch 019: 44 / 1689 loss=3.731, nll_loss=2.2, ppl=4.59, wps=547381, ups=1.11, wpb=491706, bsz=16593.7, num_updates=30400, lr=0.000362738, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=28129 epoch 019: 44 / 1689 loss=3.731, nll_loss=2.2, ppl=4.59, wps=547381, ups=1.11, wpb=491706, bsz=16593.7, num_updates=30400, lr=0.000362738, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=28129 epoch 019: 44 / 1689 loss=3.731, nll_loss=2.2, ppl=4.59, wps=547381, ups=1.11, wpb=491706, bsz=16593.7, num_updates=30400, lr=0.000362738, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=28129 epoch 019: 44 / 1689 loss=3.731, nll_loss=2.2, ppl=4.59, wps=547381, ups=1.11, wpb=491706, bsz=16593.7, num_updates=30400, lr=0.000362738, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=28129 epoch 019: 44 / 1689 loss=3.731, nll_loss=2.2, ppl=4.59, wps=547381, ups=1.11, wpb=491706, bsz=16593.7, num_updates=30400, lr=0.000362738, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=28129 epoch 019: 44 / 1689 loss=3.731, nll_loss=2.2, ppl=4.59, wps=547381, ups=1.11, wpb=491706, bsz=16593.7, num_updates=30400, lr=0.000362738, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=28129 epoch 019: 44 / 1689 loss=3.731, nll_loss=2.2, ppl=4.59, wps=547381, ups=1.11, wpb=491706, bsz=16593.7, num_updates=30400, lr=0.000362738, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=28129 epoch 019: 44 / 1689 loss=3.731, nll_loss=2.2, ppl=4.59, wps=547381, ups=1.11, wpb=491706, bsz=16593.7, num_updates=30400, lr=0.000362738, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=28129 epoch 019: 44 / 1689 loss=3.731, nll_loss=2.2, ppl=4.59, wps=547381, ups=1.11, wpb=491706, bsz=16593.7, num_updates=30400, lr=0.000362738, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=28129 epoch 019: 144 / 1689 loss=3.714, nll_loss=2.18, ppl=4.53, wps=547948, ups=1.1, wpb=496635, bsz=16862, num_updates=30500, lr=0.000362143, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=28219 epoch 019: 144 / 1689 loss=3.714, nll_loss=2.18, ppl=4.53, wps=547948, ups=1.1, wpb=496635, bsz=16862, num_updates=30500, lr=0.000362143, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=28219 epoch 019: 144 / 1689 loss=3.714, nll_loss=2.18, ppl=4.53, wps=547948, ups=1.1, wpb=496635, bsz=16862, num_updates=30500, lr=0.000362143, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=28219 epoch 019: 144 / 1689 loss=3.714, nll_loss=2.18, ppl=4.53, wps=547948, ups=1.1, wpb=496635, bsz=16862, num_updates=30500, lr=0.000362143, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=28219 epoch 019: 144 / 1689 loss=3.714, nll_loss=2.18, ppl=4.53, wps=547948, ups=1.1, wpb=496635, bsz=16862, num_updates=30500, lr=0.000362143, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=28219 epoch 019: 144 / 1689 loss=3.714, nll_loss=2.18, ppl=4.53, wps=547948, ups=1.1, wpb=496635, bsz=16862, num_updates=30500, lr=0.000362143, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=28219 epoch 019: 144 / 1689 loss=3.714, nll_loss=2.18, ppl=4.53, wps=547948, ups=1.1, wpb=496635, bsz=16862, num_updates=30500, lr=0.000362143, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=28219 epoch 019: 144 / 1689 loss=3.714, nll_loss=2.18, ppl=4.53, wps=547948, ups=1.1, wpb=496635, bsz=16862, num_updates=30500, lr=0.000362143, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=28219 epoch 019: 144 / 1689 loss=3.714, nll_loss=2.18, ppl=4.53, wps=547948, ups=1.1, wpb=496635, bsz=16862, num_updates=30500, lr=0.000362143, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=28219 epoch 019: 144 / 1689 loss=3.714, nll_loss=2.18, ppl=4.53, wps=547948, ups=1.1, wpb=496635, bsz=16862, num_updates=30500, lr=0.000362143, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=28219 epoch 019: 144 / 1689 loss=3.714, nll_loss=2.18, ppl=4.53, wps=547948, ups=1.1, wpb=496635, bsz=16862, num_updates=30500, lr=0.000362143, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=28219 epoch 019: 144 / 1689 loss=3.714, nll_loss=2.18, ppl=4.53, wps=547948, ups=1.1, wpb=496635, bsz=16862, num_updates=30500, lr=0.000362143, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=28219 epoch 019: 144 / 1689 loss=3.714, nll_loss=2.18, ppl=4.53, wps=547948, ups=1.1, wpb=496635, bsz=16862, num_updates=30500, lr=0.000362143, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=28219 epoch 019: 144 / 1689 loss=3.714, nll_loss=2.18, ppl=4.53, wps=547948, ups=1.1, wpb=496635, bsz=16862, num_updates=30500, lr=0.000362143, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=28219 epoch 019: 144 / 1689 loss=3.714, nll_loss=2.18, ppl=4.53, wps=547948, ups=1.1, wpb=496635, bsz=16862, num_updates=30500, lr=0.000362143, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=28219 epoch 019: 144 / 1689 loss=3.714, nll_loss=2.18, ppl=4.53, wps=547948, ups=1.1, wpb=496635, bsz=16862, num_updates=30500, lr=0.000362143, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=28219 epoch 019: 144 / 1689 loss=3.714, nll_loss=2.18, ppl=4.53, wps=547948, ups=1.1, wpb=496635, bsz=16862, num_updates=30500, lr=0.000362143, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=28219 epoch 019: 144 / 1689 loss=3.714, nll_loss=2.18, ppl=4.53, wps=547948, ups=1.1, wpb=496635, bsz=16862, num_updates=30500, lr=0.000362143, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=28219 epoch 019: 144 / 1689 loss=3.714, nll_loss=2.18, ppl=4.53, wps=547948, ups=1.1, wpb=496635, bsz=16862, num_updates=30500, lr=0.000362143, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=28219 epoch 019: 245 / 1689 loss=3.721, nll_loss=2.188, ppl=4.56, wps=546522, ups=1.1, wpb=496713, bsz=16532.8, num_updates=30600, lr=0.000361551, gnorm=0.19, clip=0, loss_scale=2, train_wall=90, gb_free=22.5, wall=28310 epoch 019: 245 / 1689 loss=3.721, nll_loss=2.188, ppl=4.56, wps=546522, ups=1.1, wpb=496713, bsz=16532.8, num_updates=30600, lr=0.000361551, gnorm=0.19, clip=0, loss_scale=2, train_wall=90, gb_free=22.5, wall=28310 epoch 019: 245 / 1689 loss=3.721, nll_loss=2.188, ppl=4.56, wps=546522, ups=1.1, wpb=496713, bsz=16532.8, num_updates=30600, lr=0.000361551, gnorm=0.19, clip=0, loss_scale=2, train_wall=90, gb_free=22.5, wall=28310 epoch 019: 245 / 1689 loss=3.721, nll_loss=2.188, ppl=4.56, wps=546522, ups=1.1, wpb=496713, bsz=16532.8, num_updates=30600, lr=0.000361551, gnorm=0.19, clip=0, loss_scale=2, train_wall=90, gb_free=22.5, wall=28310 epoch 019: 245 / 1689 loss=3.721, nll_loss=2.188, ppl=4.56, wps=546522, ups=1.1, wpb=496713, bsz=16532.8, num_updates=30600, lr=0.000361551, gnorm=0.19, clip=0, loss_scale=2, train_wall=90, gb_free=22.5, wall=28310 epoch 019: 245 / 1689 loss=3.721, nll_loss=2.188, ppl=4.56, wps=546522, ups=1.1, wpb=496713, bsz=16532.8, num_updates=30600, lr=0.000361551, gnorm=0.19, clip=0, loss_scale=2, train_wall=90, gb_free=22.5, wall=28310 epoch 019: 245 / 1689 loss=3.721, nll_loss=2.188, ppl=4.56, wps=546522, ups=1.1, wpb=496713, bsz=16532.8, num_updates=30600, lr=0.000361551, gnorm=0.19, clip=0, loss_scale=2, train_wall=90, gb_free=22.5, wall=28310 epoch 019: 245 / 1689 loss=3.721, nll_loss=2.188, ppl=4.56, wps=546522, ups=1.1, wpb=496713, bsz=16532.8, num_updates=30600, lr=0.000361551, gnorm=0.19, clip=0, loss_scale=2, train_wall=90, gb_free=22.5, wall=28310 epoch 019: 245 / 1689 loss=3.721, nll_loss=2.188, ppl=4.56, wps=546522, ups=1.1, wpb=496713, bsz=16532.8, num_updates=30600, lr=0.000361551, gnorm=0.19, clip=0, loss_scale=2, train_wall=90, gb_free=22.5, wall=28310 epoch 019: 245 / 1689 loss=3.721, nll_loss=2.188, ppl=4.56, wps=546522, ups=1.1, wpb=496713, bsz=16532.8, num_updates=30600, lr=0.000361551, gnorm=0.19, clip=0, loss_scale=2, train_wall=90, gb_free=22.5, wall=28310 epoch 019: 245 / 1689 loss=3.721, nll_loss=2.188, ppl=4.56, wps=546522, ups=1.1, wpb=496713, bsz=16532.8, num_updates=30600, lr=0.000361551, gnorm=0.19, clip=0, loss_scale=2, train_wall=90, gb_free=22.5, wall=28310 epoch 019: 245 / 1689 loss=3.721, nll_loss=2.188, ppl=4.56, wps=546522, ups=1.1, wpb=496713, bsz=16532.8, num_updates=30600, lr=0.000361551, gnorm=0.19, clip=0, loss_scale=2, train_wall=90, gb_free=22.5, wall=28310 epoch 019: 245 / 1689 loss=3.721, nll_loss=2.188, ppl=4.56, wps=546522, ups=1.1, wpb=496713, bsz=16532.8, num_updates=30600, lr=0.000361551, gnorm=0.19, clip=0, loss_scale=2, train_wall=90, gb_free=22.5, wall=28310 epoch 019: 245 / 1689 loss=3.721, nll_loss=2.188, ppl=4.56, wps=546522, ups=1.1, wpb=496713, bsz=16532.8, num_updates=30600, lr=0.000361551, gnorm=0.19, clip=0, loss_scale=2, train_wall=90, gb_free=22.5, wall=28310 epoch 019: 245 / 1689 loss=3.721, nll_loss=2.188, ppl=4.56, wps=546522, ups=1.1, wpb=496713, bsz=16532.8, num_updates=30600, lr=0.000361551, gnorm=0.19, clip=0, loss_scale=2, train_wall=90, gb_free=22.5, wall=28310 epoch 019: 245 / 1689 loss=3.721, nll_loss=2.188, ppl=4.56, wps=546522, ups=1.1, wpb=496713, bsz=16532.8, num_updates=30600, lr=0.000361551, gnorm=0.19, clip=0, loss_scale=2, train_wall=90, gb_free=22.5, wall=28310 epoch 019: 245 / 1689 loss=3.721, nll_loss=2.188, ppl=4.56, wps=546522, ups=1.1, wpb=496713, bsz=16532.8, num_updates=30600, lr=0.000361551, gnorm=0.19, clip=0, loss_scale=2, train_wall=90, gb_free=22.5, wall=28310 epoch 019: 245 / 1689 loss=3.721, nll_loss=2.188, ppl=4.56, wps=546522, ups=1.1, wpb=496713, bsz=16532.8, num_updates=30600, lr=0.000361551, gnorm=0.19, clip=0, loss_scale=2, train_wall=90, gb_free=22.5, wall=28310 epoch 019: 245 / 1689 loss=3.721, nll_loss=2.188, ppl=4.56, wps=546522, ups=1.1, wpb=496713, bsz=16532.8, num_updates=30600, lr=0.000361551, gnorm=0.19, clip=0, loss_scale=2, train_wall=90, gb_free=22.5, wall=28310 epoch 019: 345 / 1689 loss=3.72, nll_loss=2.188, ppl=4.56, wps=550653, ups=1.11, wpb=495694, bsz=16667.9, num_updates=30700, lr=0.000360961, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=28400 epoch 019: 345 / 1689 loss=3.72, nll_loss=2.188, ppl=4.56, wps=550653, ups=1.11, wpb=495694, bsz=16667.9, num_updates=30700, lr=0.000360961, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=28400 epoch 019: 345 / 1689 loss=3.72, nll_loss=2.188, ppl=4.56, wps=550653, ups=1.11, wpb=495694, bsz=16667.9, num_updates=30700, lr=0.000360961, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=28400 epoch 019: 345 / 1689 loss=3.72, nll_loss=2.188, ppl=4.56, wps=550653, ups=1.11, wpb=495694, bsz=16667.9, num_updates=30700, lr=0.000360961, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=28400 epoch 019: 345 / 1689 loss=3.72, nll_loss=2.188, ppl=4.56, wps=550653, ups=1.11, wpb=495694, bsz=16667.9, num_updates=30700, lr=0.000360961, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=28400 epoch 019: 345 / 1689 loss=3.72, nll_loss=2.188, ppl=4.56, wps=550653, ups=1.11, wpb=495694, bsz=16667.9, num_updates=30700, lr=0.000360961, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=28400 epoch 019: 345 / 1689 loss=3.72, nll_loss=2.188, ppl=4.56, wps=550653, ups=1.11, wpb=495694, bsz=16667.9, num_updates=30700, lr=0.000360961, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=28400 epoch 019: 345 / 1689 loss=3.72, nll_loss=2.188, ppl=4.56, wps=550653, ups=1.11, wpb=495694, bsz=16667.9, num_updates=30700, lr=0.000360961, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=28400 epoch 019: 345 / 1689 loss=3.72, nll_loss=2.188, ppl=4.56, wps=550653, ups=1.11, wpb=495694, bsz=16667.9, num_updates=30700, lr=0.000360961, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=28400 epoch 019: 345 / 1689 loss=3.72, nll_loss=2.188, ppl=4.56, wps=550653, ups=1.11, wpb=495694, bsz=16667.9, num_updates=30700, lr=0.000360961, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=28400 epoch 019: 345 / 1689 loss=3.72, nll_loss=2.188, ppl=4.56, wps=550653, ups=1.11, wpb=495694, bsz=16667.9, num_updates=30700, lr=0.000360961, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=28400 epoch 019: 345 / 1689 loss=3.72, nll_loss=2.188, ppl=4.56, wps=550653, ups=1.11, wpb=495694, bsz=16667.9, num_updates=30700, lr=0.000360961, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=28400 epoch 019: 345 / 1689 loss=3.72, nll_loss=2.188, ppl=4.56, wps=550653, ups=1.11, wpb=495694, bsz=16667.9, num_updates=30700, lr=0.000360961, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=28400 epoch 019: 345 / 1689 loss=3.72, nll_loss=2.188, ppl=4.56, wps=550653, ups=1.11, wpb=495694, bsz=16667.9, num_updates=30700, lr=0.000360961, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=28400 epoch 019: 345 / 1689 loss=3.72, nll_loss=2.188, ppl=4.56, wps=550653, ups=1.11, wpb=495694, bsz=16667.9, num_updates=30700, lr=0.000360961, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=28400 epoch 019: 345 / 1689 loss=3.72, nll_loss=2.188, ppl=4.56, wps=550653, ups=1.11, wpb=495694, bsz=16667.9, num_updates=30700, lr=0.000360961, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=28400 epoch 019: 345 / 1689 loss=3.72, nll_loss=2.188, ppl=4.56, wps=550653, ups=1.11, wpb=495694, bsz=16667.9, num_updates=30700, lr=0.000360961, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=28400 epoch 019: 345 / 1689 loss=3.72, nll_loss=2.188, ppl=4.56, wps=550653, ups=1.11, wpb=495694, bsz=16667.9, num_updates=30700, lr=0.000360961, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=28400 epoch 019: 345 / 1689 loss=3.72, nll_loss=2.188, ppl=4.56, wps=550653, ups=1.11, wpb=495694, bsz=16667.9, num_updates=30700, lr=0.000360961, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=28400 epoch 019: 445 / 1689 loss=3.723, nll_loss=2.19, ppl=4.56, wps=551947, ups=1.11, wpb=495982, bsz=16107.2, num_updates=30800, lr=0.000360375, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=28490 epoch 019: 445 / 1689 loss=3.723, nll_loss=2.19, ppl=4.56, wps=551947, ups=1.11, wpb=495982, bsz=16107.2, num_updates=30800, lr=0.000360375, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=28490 epoch 019: 445 / 1689 loss=3.723, nll_loss=2.19, ppl=4.56, wps=551947, ups=1.11, wpb=495982, bsz=16107.2, num_updates=30800, lr=0.000360375, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=28490 epoch 019: 445 / 1689 loss=3.723, nll_loss=2.19, ppl=4.56, wps=551947, ups=1.11, wpb=495982, bsz=16107.2, num_updates=30800, lr=0.000360375, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=28490 epoch 019: 445 / 1689 loss=3.723, nll_loss=2.19, ppl=4.56, wps=551947, ups=1.11, wpb=495982, bsz=16107.2, num_updates=30800, lr=0.000360375, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=28490 epoch 019: 445 / 1689 loss=3.723, nll_loss=2.19, ppl=4.56, wps=551947, ups=1.11, wpb=495982, bsz=16107.2, num_updates=30800, lr=0.000360375, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=28490 epoch 019: 445 / 1689 loss=3.723, nll_loss=2.19, ppl=4.56, wps=551947, ups=1.11, wpb=495982, bsz=16107.2, num_updates=30800, lr=0.000360375, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=28490 epoch 019: 445 / 1689 loss=3.723, nll_loss=2.19, ppl=4.56, wps=551947, ups=1.11, wpb=495982, bsz=16107.2, num_updates=30800, lr=0.000360375, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=28490 epoch 019: 445 / 1689 loss=3.723, nll_loss=2.19, ppl=4.56, wps=551947, ups=1.11, wpb=495982, bsz=16107.2, num_updates=30800, lr=0.000360375, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=28490 epoch 019: 445 / 1689 loss=3.723, nll_loss=2.19, ppl=4.56, wps=551947, ups=1.11, wpb=495982, bsz=16107.2, num_updates=30800, lr=0.000360375, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=28490 epoch 019: 445 / 1689 loss=3.723, nll_loss=2.19, ppl=4.56, wps=551947, ups=1.11, wpb=495982, bsz=16107.2, num_updates=30800, lr=0.000360375, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=28490 epoch 019: 445 / 1689 loss=3.723, nll_loss=2.19, ppl=4.56, wps=551947, ups=1.11, wpb=495982, bsz=16107.2, num_updates=30800, lr=0.000360375, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=28490 epoch 019: 445 / 1689 loss=3.723, nll_loss=2.19, ppl=4.56, wps=551947, ups=1.11, wpb=495982, bsz=16107.2, num_updates=30800, lr=0.000360375, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=28490 epoch 019: 445 / 1689 loss=3.723, nll_loss=2.19, ppl=4.56, wps=551947, ups=1.11, wpb=495982, bsz=16107.2, num_updates=30800, lr=0.000360375, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=28490 epoch 019: 445 / 1689 loss=3.723, nll_loss=2.19, ppl=4.56, wps=551947, ups=1.11, wpb=495982, bsz=16107.2, num_updates=30800, lr=0.000360375, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=28490 epoch 019: 445 / 1689 loss=3.723, nll_loss=2.19, ppl=4.56, wps=551947, ups=1.11, wpb=495982, bsz=16107.2, num_updates=30800, lr=0.000360375, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=28490 epoch 019: 445 / 1689 loss=3.723, nll_loss=2.19, ppl=4.56, wps=551947, ups=1.11, wpb=495982, bsz=16107.2, num_updates=30800, lr=0.000360375, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=28490 epoch 019: 445 / 1689 loss=3.723, nll_loss=2.19, ppl=4.56, wps=551947, ups=1.11, wpb=495982, bsz=16107.2, num_updates=30800, lr=0.000360375, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=28490 epoch 019: 445 / 1689 loss=3.723, nll_loss=2.19, ppl=4.56, wps=551947, ups=1.11, wpb=495982, bsz=16107.2, num_updates=30800, lr=0.000360375, gnorm=0.175, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=28490 epoch 019: 545 / 1689 loss=3.729, nll_loss=2.197, ppl=4.59, wps=554682, ups=1.12, wpb=496574, bsz=16587.4, num_updates=30900, lr=0.000359791, gnorm=0.176, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=28580 epoch 019: 545 / 1689 loss=3.729, nll_loss=2.197, ppl=4.59, wps=554682, ups=1.12, wpb=496574, bsz=16587.4, num_updates=30900, lr=0.000359791, gnorm=0.176, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=28580 epoch 019: 545 / 1689 loss=3.729, nll_loss=2.197, ppl=4.59, wps=554682, ups=1.12, wpb=496574, bsz=16587.4, num_updates=30900, lr=0.000359791, gnorm=0.176, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=28580 epoch 019: 545 / 1689 loss=3.729, nll_loss=2.197, ppl=4.59, wps=554682, ups=1.12, wpb=496574, bsz=16587.4, num_updates=30900, lr=0.000359791, gnorm=0.176, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=28580 epoch 019: 545 / 1689 loss=3.729, nll_loss=2.197, ppl=4.59, wps=554682, ups=1.12, wpb=496574, bsz=16587.4, num_updates=30900, lr=0.000359791, gnorm=0.176, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=28580 epoch 019: 545 / 1689 loss=3.729, nll_loss=2.197, ppl=4.59, wps=554682, ups=1.12, wpb=496574, bsz=16587.4, num_updates=30900, lr=0.000359791, gnorm=0.176, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=28580 epoch 019: 545 / 1689 loss=3.729, nll_loss=2.197, ppl=4.59, wps=554682, ups=1.12, wpb=496574, bsz=16587.4, num_updates=30900, lr=0.000359791, gnorm=0.176, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=28580 epoch 019: 545 / 1689 loss=3.729, nll_loss=2.197, ppl=4.59, wps=554682, ups=1.12, wpb=496574, bsz=16587.4, num_updates=30900, lr=0.000359791, gnorm=0.176, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=28580 epoch 019: 545 / 1689 loss=3.729, nll_loss=2.197, ppl=4.59, wps=554682, ups=1.12, wpb=496574, bsz=16587.4, num_updates=30900, lr=0.000359791, gnorm=0.176, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=28580 epoch 019: 545 / 1689 loss=3.729, nll_loss=2.197, ppl=4.59, wps=554682, ups=1.12, wpb=496574, bsz=16587.4, num_updates=30900, lr=0.000359791, gnorm=0.176, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=28580 epoch 019: 545 / 1689 loss=3.729, nll_loss=2.197, ppl=4.59, wps=554682, ups=1.12, wpb=496574, bsz=16587.4, num_updates=30900, lr=0.000359791, gnorm=0.176, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=28580 epoch 019: 545 / 1689 loss=3.729, nll_loss=2.197, ppl=4.59, wps=554682, ups=1.12, wpb=496574, bsz=16587.4, num_updates=30900, lr=0.000359791, gnorm=0.176, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=28580 epoch 019: 545 / 1689 loss=3.729, nll_loss=2.197, ppl=4.59, wps=554682, ups=1.12, wpb=496574, bsz=16587.4, num_updates=30900, lr=0.000359791, gnorm=0.176, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=28580 epoch 019: 545 / 1689 loss=3.729, nll_loss=2.197, ppl=4.59, wps=554682, ups=1.12, wpb=496574, bsz=16587.4, num_updates=30900, lr=0.000359791, gnorm=0.176, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=28580 epoch 019: 545 / 1689 loss=3.729, nll_loss=2.197, ppl=4.59, wps=554682, ups=1.12, wpb=496574, bsz=16587.4, num_updates=30900, lr=0.000359791, gnorm=0.176, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=28580 epoch 019: 545 / 1689 loss=3.729, nll_loss=2.197, ppl=4.59, wps=554682, ups=1.12, wpb=496574, bsz=16587.4, num_updates=30900, lr=0.000359791, gnorm=0.176, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=28580 epoch 019: 545 / 1689 loss=3.729, nll_loss=2.197, ppl=4.59, wps=554682, ups=1.12, wpb=496574, bsz=16587.4, num_updates=30900, lr=0.000359791, gnorm=0.176, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=28580 epoch 019: 545 / 1689 loss=3.729, nll_loss=2.197, ppl=4.59, wps=554682, ups=1.12, wpb=496574, bsz=16587.4, num_updates=30900, lr=0.000359791, gnorm=0.176, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=28580 epoch 019: 545 / 1689 loss=3.729, nll_loss=2.197, ppl=4.59, wps=554682, ups=1.12, wpb=496574, bsz=16587.4, num_updates=30900, lr=0.000359791, gnorm=0.176, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=28580 epoch 019: 645 / 1689 loss=3.728, nll_loss=2.196, ppl=4.58, wps=550018, ups=1.11, wpb=495725, bsz=16849.9, num_updates=31000, lr=0.000359211, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=28670 epoch 019: 645 / 1689 loss=3.728, nll_loss=2.196, ppl=4.58, wps=550018, ups=1.11, wpb=495725, bsz=16849.9, num_updates=31000, lr=0.000359211, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=28670 epoch 019: 645 / 1689 loss=3.728, nll_loss=2.196, ppl=4.58, wps=550018, ups=1.11, wpb=495725, bsz=16849.9, num_updates=31000, lr=0.000359211, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=28670 epoch 019: 645 / 1689 loss=3.728, nll_loss=2.196, ppl=4.58, wps=550018, ups=1.11, wpb=495725, bsz=16849.9, num_updates=31000, lr=0.000359211, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=28670 epoch 019: 645 / 1689 loss=3.728, nll_loss=2.196, ppl=4.58, wps=550018, ups=1.11, wpb=495725, bsz=16849.9, num_updates=31000, lr=0.000359211, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=28670 epoch 019: 645 / 1689 loss=3.728, nll_loss=2.196, ppl=4.58, wps=550018, ups=1.11, wpb=495725, bsz=16849.9, num_updates=31000, lr=0.000359211, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=28670 epoch 019: 645 / 1689 loss=3.728, nll_loss=2.196, ppl=4.58, wps=550018, ups=1.11, wpb=495725, bsz=16849.9, num_updates=31000, lr=0.000359211, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=28670 epoch 019: 645 / 1689 loss=3.728, nll_loss=2.196, ppl=4.58, wps=550018, ups=1.11, wpb=495725, bsz=16849.9, num_updates=31000, lr=0.000359211, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=28670 epoch 019: 645 / 1689 loss=3.728, nll_loss=2.196, ppl=4.58, wps=550018, ups=1.11, wpb=495725, bsz=16849.9, num_updates=31000, lr=0.000359211, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=28670 epoch 019: 645 / 1689 loss=3.728, nll_loss=2.196, ppl=4.58, wps=550018, ups=1.11, wpb=495725, bsz=16849.9, num_updates=31000, lr=0.000359211, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=28670 epoch 019: 645 / 1689 loss=3.728, nll_loss=2.196, ppl=4.58, wps=550018, ups=1.11, wpb=495725, bsz=16849.9, num_updates=31000, lr=0.000359211, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=28670 epoch 019: 645 / 1689 loss=3.728, nll_loss=2.196, ppl=4.58, wps=550018, ups=1.11, wpb=495725, bsz=16849.9, num_updates=31000, lr=0.000359211, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=28670 epoch 019: 645 / 1689 loss=3.728, nll_loss=2.196, ppl=4.58, wps=550018, ups=1.11, wpb=495725, bsz=16849.9, num_updates=31000, lr=0.000359211, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=28670 epoch 019: 645 / 1689 loss=3.728, nll_loss=2.196, ppl=4.58, wps=550018, ups=1.11, wpb=495725, bsz=16849.9, num_updates=31000, lr=0.000359211, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=28670 epoch 019: 645 / 1689 loss=3.728, nll_loss=2.196, ppl=4.58, wps=550018, ups=1.11, wpb=495725, bsz=16849.9, num_updates=31000, lr=0.000359211, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=28670 epoch 019: 645 / 1689 loss=3.728, nll_loss=2.196, ppl=4.58, wps=550018, ups=1.11, wpb=495725, bsz=16849.9, num_updates=31000, lr=0.000359211, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=28670 epoch 019: 645 / 1689 loss=3.728, nll_loss=2.196, ppl=4.58, wps=550018, ups=1.11, wpb=495725, bsz=16849.9, num_updates=31000, lr=0.000359211, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=28670 epoch 019: 645 / 1689 loss=3.728, nll_loss=2.196, ppl=4.58, wps=550018, ups=1.11, wpb=495725, bsz=16849.9, num_updates=31000, lr=0.000359211, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=28670 epoch 019: 645 / 1689 loss=3.728, nll_loss=2.196, ppl=4.58, wps=550018, ups=1.11, wpb=495725, bsz=16849.9, num_updates=31000, lr=0.000359211, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=28670 begin validation on "valid" subset epoch 019 | valid on 'valid' subset | loss 3.746 | nll_loss 2.181 | ppl 4.53 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.73 epoch 019 | valid on 'valid' subset | loss 3.746 | nll_loss 2.181 | ppl 4.53 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.73 epoch 019 | valid on 'valid' subset | loss 3.746 | nll_loss 2.181 | ppl 4.53 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.73 epoch 019 | valid on 'valid' subset | loss 3.746 | nll_loss 2.181 | ppl 4.53 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.73 epoch 019 | valid on 'valid' subset | loss 3.746 | nll_loss 2.181 | ppl 4.53 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.73 epoch 019 | valid on 'valid' subset | loss 3.746 | nll_loss 2.181 | ppl 4.53 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.73 epoch 019 | valid on 'valid' subset | loss 3.746 | nll_loss 2.181 | ppl 4.53 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.73 epoch 019 | valid on 'valid' subset | loss 3.746 | nll_loss 2.181 | ppl 4.53 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.73 epoch 019 | valid on 'valid' subset | loss 3.746 | nll_loss 2.181 | ppl 4.53 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.73 epoch 019 | valid on 'valid' subset | loss 3.746 | nll_loss 2.181 | ppl 4.53 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.73 epoch 019 | valid on 'valid' subset | loss 3.746 | nll_loss 2.181 | ppl 4.53 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.73 epoch 019 | valid on 'valid' subset | loss 3.746 | nll_loss 2.181 | ppl 4.53 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.73 epoch 019 | valid on 'valid' subset | loss 3.746 | nll_loss 2.181 | ppl 4.53 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.73 epoch 019 | valid on 'valid' subset | loss 3.746 | nll_loss 2.181 | ppl 4.53 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.73 epoch 019 | valid on 'valid' subset | loss 3.746 | nll_loss 2.181 | ppl 4.53 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.73 epoch 019 | valid on 'valid' subset | loss 3.746 | nll_loss 2.181 | ppl 4.53 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.73 epoch 019 | valid on 'valid' subset | loss 3.746 | nll_loss 2.181 | ppl 4.53 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.73 epoch 019 | valid on 'valid' subset | loss 3.746 | nll_loss 2.181 | ppl 4.53 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.73 epoch 019 | valid on 'valid' subset | loss 3.746 | nll_loss 2.181 | ppl 4.53 | wps 0 | wpb 44526 | bsz 2008 | num_updates 31000 | best_loss 3.73 epoch 019: 745 / 1689 loss=3.726, nll_loss=2.194, ppl=4.58, wps=376286, ups=0.76, wpb=495182, bsz=16708.2, num_updates=31100, lr=0.000358633, gnorm=0.174, clip=0, loss_scale=4, train_wall=92, gb_free=22.3, wall=28801 epoch 019: 745 / 1689 loss=3.726, nll_loss=2.194, ppl=4.58, wps=376286, ups=0.76, wpb=495182, bsz=16708.2, num_updates=31100, lr=0.000358633, gnorm=0.174, clip=0, loss_scale=4, train_wall=92, gb_free=22.3, wall=28801 epoch 019: 745 / 1689 loss=3.726, nll_loss=2.194, ppl=4.58, wps=376286, ups=0.76, wpb=495182, bsz=16708.2, num_updates=31100, lr=0.000358633, gnorm=0.174, clip=0, loss_scale=4, train_wall=92, gb_free=22.3, wall=28801 epoch 019: 745 / 1689 loss=3.726, nll_loss=2.194, ppl=4.58, wps=376286, ups=0.76, wpb=495182, bsz=16708.2, num_updates=31100, lr=0.000358633, gnorm=0.174, clip=0, loss_scale=4, train_wall=92, gb_free=22.3, wall=28801 epoch 019: 745 / 1689 loss=3.726, nll_loss=2.194, ppl=4.58, wps=376286, ups=0.76, wpb=495182, bsz=16708.2, num_updates=31100, lr=0.000358633, gnorm=0.174, clip=0, loss_scale=4, train_wall=92, gb_free=22.3, wall=28801 epoch 019: 745 / 1689 loss=3.726, nll_loss=2.194, ppl=4.58, wps=376286, ups=0.76, wpb=495182, bsz=16708.2, num_updates=31100, lr=0.000358633, gnorm=0.174, clip=0, loss_scale=4, train_wall=92, gb_free=22.3, wall=28801 epoch 019: 745 / 1689 loss=3.726, nll_loss=2.194, ppl=4.58, wps=376286, ups=0.76, wpb=495182, bsz=16708.2, num_updates=31100, lr=0.000358633, gnorm=0.174, clip=0, loss_scale=4, train_wall=92, gb_free=22.3, wall=28801 epoch 019: 745 / 1689 loss=3.726, nll_loss=2.194, ppl=4.58, wps=376286, ups=0.76, wpb=495182, bsz=16708.2, num_updates=31100, lr=0.000358633, gnorm=0.174, clip=0, loss_scale=4, train_wall=92, gb_free=22.3, wall=28801 epoch 019: 745 / 1689 loss=3.726, nll_loss=2.194, ppl=4.58, wps=376286, ups=0.76, wpb=495182, bsz=16708.2, num_updates=31100, lr=0.000358633, gnorm=0.174, clip=0, loss_scale=4, train_wall=92, gb_free=22.3, wall=28801 epoch 019: 745 / 1689 loss=3.726, nll_loss=2.194, ppl=4.58, wps=376286, ups=0.76, wpb=495182, bsz=16708.2, num_updates=31100, lr=0.000358633, gnorm=0.174, clip=0, loss_scale=4, train_wall=92, gb_free=22.3, wall=28801 epoch 019: 745 / 1689 loss=3.726, nll_loss=2.194, ppl=4.58, wps=376286, ups=0.76, wpb=495182, bsz=16708.2, num_updates=31100, lr=0.000358633, gnorm=0.174, clip=0, loss_scale=4, train_wall=92, gb_free=22.3, wall=28801 epoch 019: 745 / 1689 loss=3.726, nll_loss=2.194, ppl=4.58, wps=376286, ups=0.76, wpb=495182, bsz=16708.2, num_updates=31100, lr=0.000358633, gnorm=0.174, clip=0, loss_scale=4, train_wall=92, gb_free=22.3, wall=28801 epoch 019: 745 / 1689 loss=3.726, nll_loss=2.194, ppl=4.58, wps=376286, ups=0.76, wpb=495182, bsz=16708.2, num_updates=31100, lr=0.000358633, gnorm=0.174, clip=0, loss_scale=4, train_wall=92, gb_free=22.3, wall=28801 epoch 019: 745 / 1689 loss=3.726, nll_loss=2.194, ppl=4.58, wps=376286, ups=0.76, wpb=495182, bsz=16708.2, num_updates=31100, lr=0.000358633, gnorm=0.174, clip=0, loss_scale=4, train_wall=92, gb_free=22.3, wall=28801 epoch 019: 745 / 1689 loss=3.726, nll_loss=2.194, ppl=4.58, wps=376286, ups=0.76, wpb=495182, bsz=16708.2, num_updates=31100, lr=0.000358633, gnorm=0.174, clip=0, loss_scale=4, train_wall=92, gb_free=22.3, wall=28801 epoch 019: 745 / 1689 loss=3.726, nll_loss=2.194, ppl=4.58, wps=376286, ups=0.76, wpb=495182, bsz=16708.2, num_updates=31100, lr=0.000358633, gnorm=0.174, clip=0, loss_scale=4, train_wall=92, gb_free=22.3, wall=28801 epoch 019: 745 / 1689 loss=3.726, nll_loss=2.194, ppl=4.58, wps=376286, ups=0.76, wpb=495182, bsz=16708.2, num_updates=31100, lr=0.000358633, gnorm=0.174, clip=0, loss_scale=4, train_wall=92, gb_free=22.3, wall=28801 epoch 019: 745 / 1689 loss=3.726, nll_loss=2.194, ppl=4.58, wps=376286, ups=0.76, wpb=495182, bsz=16708.2, num_updates=31100, lr=0.000358633, gnorm=0.174, clip=0, loss_scale=4, train_wall=92, gb_free=22.3, wall=28801 epoch 019: 745 / 1689 loss=3.726, nll_loss=2.194, ppl=4.58, wps=376286, ups=0.76, wpb=495182, bsz=16708.2, num_updates=31100, lr=0.000358633, gnorm=0.174, clip=0, loss_scale=4, train_wall=92, gb_free=22.3, wall=28801 epoch 019: 845 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=561159, ups=1.13, wpb=494695, bsz=16404.5, num_updates=31200, lr=0.000358057, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=20.4, wall=28890 epoch 019: 845 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=561159, ups=1.13, wpb=494695, bsz=16404.5, num_updates=31200, lr=0.000358057, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=20.4, wall=28890 epoch 019: 845 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=561159, ups=1.13, wpb=494695, bsz=16404.5, num_updates=31200, lr=0.000358057, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=20.4, wall=28890 epoch 019: 845 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=561159, ups=1.13, wpb=494695, bsz=16404.5, num_updates=31200, lr=0.000358057, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=20.4, wall=28890 epoch 019: 845 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=561159, ups=1.13, wpb=494695, bsz=16404.5, num_updates=31200, lr=0.000358057, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=20.4, wall=28890 epoch 019: 845 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=561159, ups=1.13, wpb=494695, bsz=16404.5, num_updates=31200, lr=0.000358057, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=20.4, wall=28890 epoch 019: 845 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=561159, ups=1.13, wpb=494695, bsz=16404.5, num_updates=31200, lr=0.000358057, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=20.4, wall=28890 epoch 019: 845 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=561159, ups=1.13, wpb=494695, bsz=16404.5, num_updates=31200, lr=0.000358057, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=20.4, wall=28890 epoch 019: 845 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=561159, ups=1.13, wpb=494695, bsz=16404.5, num_updates=31200, lr=0.000358057, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=20.4, wall=28890 epoch 019: 845 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=561159, ups=1.13, wpb=494695, bsz=16404.5, num_updates=31200, lr=0.000358057, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=20.4, wall=28890 epoch 019: 845 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=561159, ups=1.13, wpb=494695, bsz=16404.5, num_updates=31200, lr=0.000358057, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=20.4, wall=28890 epoch 019: 845 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=561159, ups=1.13, wpb=494695, bsz=16404.5, num_updates=31200, lr=0.000358057, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=20.4, wall=28890 epoch 019: 845 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=561159, ups=1.13, wpb=494695, bsz=16404.5, num_updates=31200, lr=0.000358057, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=20.4, wall=28890 epoch 019: 845 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=561159, ups=1.13, wpb=494695, bsz=16404.5, num_updates=31200, lr=0.000358057, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=20.4, wall=28890 epoch 019: 845 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=561159, ups=1.13, wpb=494695, bsz=16404.5, num_updates=31200, lr=0.000358057, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=20.4, wall=28890 epoch 019: 845 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=561159, ups=1.13, wpb=494695, bsz=16404.5, num_updates=31200, lr=0.000358057, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=20.4, wall=28890 epoch 019: 845 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=561159, ups=1.13, wpb=494695, bsz=16404.5, num_updates=31200, lr=0.000358057, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=20.4, wall=28890 epoch 019: 845 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=561159, ups=1.13, wpb=494695, bsz=16404.5, num_updates=31200, lr=0.000358057, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=20.4, wall=28890 epoch 019: 845 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=561159, ups=1.13, wpb=494695, bsz=16404.5, num_updates=31200, lr=0.000358057, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=20.4, wall=28890 epoch 019: 945 / 1689 loss=3.726, nll_loss=2.194, ppl=4.58, wps=556819, ups=1.12, wpb=496369, bsz=16257.6, num_updates=31300, lr=0.000357485, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=28979 epoch 019: 945 / 1689 loss=3.726, nll_loss=2.194, ppl=4.58, wps=556819, ups=1.12, wpb=496369, bsz=16257.6, num_updates=31300, lr=0.000357485, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=28979 epoch 019: 945 / 1689 loss=3.726, nll_loss=2.194, ppl=4.58, wps=556819, ups=1.12, wpb=496369, bsz=16257.6, num_updates=31300, lr=0.000357485, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=28979 epoch 019: 945 / 1689 loss=3.726, nll_loss=2.194, ppl=4.58, wps=556819, ups=1.12, wpb=496369, bsz=16257.6, num_updates=31300, lr=0.000357485, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=28979 epoch 019: 945 / 1689 loss=3.726, nll_loss=2.194, ppl=4.58, wps=556819, ups=1.12, wpb=496369, bsz=16257.6, num_updates=31300, lr=0.000357485, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=28979 epoch 019: 945 / 1689 loss=3.726, nll_loss=2.194, ppl=4.58, wps=556819, ups=1.12, wpb=496369, bsz=16257.6, num_updates=31300, lr=0.000357485, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=28979 epoch 019: 945 / 1689 loss=3.726, nll_loss=2.194, ppl=4.58, wps=556819, ups=1.12, wpb=496369, bsz=16257.6, num_updates=31300, lr=0.000357485, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=28979 epoch 019: 945 / 1689 loss=3.726, nll_loss=2.194, ppl=4.58, wps=556819, ups=1.12, wpb=496369, bsz=16257.6, num_updates=31300, lr=0.000357485, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=28979 epoch 019: 945 / 1689 loss=3.726, nll_loss=2.194, ppl=4.58, wps=556819, ups=1.12, wpb=496369, bsz=16257.6, num_updates=31300, lr=0.000357485, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=28979 epoch 019: 945 / 1689 loss=3.726, nll_loss=2.194, ppl=4.58, wps=556819, ups=1.12, wpb=496369, bsz=16257.6, num_updates=31300, lr=0.000357485, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=28979 epoch 019: 945 / 1689 loss=3.726, nll_loss=2.194, ppl=4.58, wps=556819, ups=1.12, wpb=496369, bsz=16257.6, num_updates=31300, lr=0.000357485, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=28979 epoch 019: 945 / 1689 loss=3.726, nll_loss=2.194, ppl=4.58, wps=556819, ups=1.12, wpb=496369, bsz=16257.6, num_updates=31300, lr=0.000357485, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=28979 epoch 019: 945 / 1689 loss=3.726, nll_loss=2.194, ppl=4.58, wps=556819, ups=1.12, wpb=496369, bsz=16257.6, num_updates=31300, lr=0.000357485, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=28979 epoch 019: 945 / 1689 loss=3.726, nll_loss=2.194, ppl=4.58, wps=556819, ups=1.12, wpb=496369, bsz=16257.6, num_updates=31300, lr=0.000357485, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=28979 epoch 019: 945 / 1689 loss=3.726, nll_loss=2.194, ppl=4.58, wps=556819, ups=1.12, wpb=496369, bsz=16257.6, num_updates=31300, lr=0.000357485, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=28979 epoch 019: 945 / 1689 loss=3.726, nll_loss=2.194, ppl=4.58, wps=556819, ups=1.12, wpb=496369, bsz=16257.6, num_updates=31300, lr=0.000357485, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=28979 epoch 019: 945 / 1689 loss=3.726, nll_loss=2.194, ppl=4.58, wps=556819, ups=1.12, wpb=496369, bsz=16257.6, num_updates=31300, lr=0.000357485, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=28979 epoch 019: 945 / 1689 loss=3.726, nll_loss=2.194, ppl=4.58, wps=556819, ups=1.12, wpb=496369, bsz=16257.6, num_updates=31300, lr=0.000357485, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=28979 epoch 019: 945 / 1689 loss=3.726, nll_loss=2.194, ppl=4.58, wps=556819, ups=1.12, wpb=496369, bsz=16257.6, num_updates=31300, lr=0.000357485, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=28979 epoch 019: 1046 / 1689 loss=3.729, nll_loss=2.198, ppl=4.59, wps=545062, ups=1.1, wpb=494279, bsz=16375.4, num_updates=31400, lr=0.000356915, gnorm=0.189, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=29069 epoch 019: 1046 / 1689 loss=3.729, nll_loss=2.198, ppl=4.59, wps=545062, ups=1.1, wpb=494279, bsz=16375.4, num_updates=31400, lr=0.000356915, gnorm=0.189, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=29069 epoch 019: 1046 / 1689 loss=3.729, nll_loss=2.198, ppl=4.59, wps=545062, ups=1.1, wpb=494279, bsz=16375.4, num_updates=31400, lr=0.000356915, gnorm=0.189, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=29069 epoch 019: 1046 / 1689 loss=3.729, nll_loss=2.198, ppl=4.59, wps=545062, ups=1.1, wpb=494279, bsz=16375.4, num_updates=31400, lr=0.000356915, gnorm=0.189, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=29069 epoch 019: 1046 / 1689 loss=3.729, nll_loss=2.198, ppl=4.59, wps=545062, ups=1.1, wpb=494279, bsz=16375.4, num_updates=31400, lr=0.000356915, gnorm=0.189, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=29069 epoch 019: 1046 / 1689 loss=3.729, nll_loss=2.198, ppl=4.59, wps=545062, ups=1.1, wpb=494279, bsz=16375.4, num_updates=31400, lr=0.000356915, gnorm=0.189, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=29069 epoch 019: 1046 / 1689 loss=3.729, nll_loss=2.198, ppl=4.59, wps=545062, ups=1.1, wpb=494279, bsz=16375.4, num_updates=31400, lr=0.000356915, gnorm=0.189, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=29069 epoch 019: 1046 / 1689 loss=3.729, nll_loss=2.198, ppl=4.59, wps=545062, ups=1.1, wpb=494279, bsz=16375.4, num_updates=31400, lr=0.000356915, gnorm=0.189, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=29069 epoch 019: 1046 / 1689 loss=3.729, nll_loss=2.198, ppl=4.59, wps=545062, ups=1.1, wpb=494279, bsz=16375.4, num_updates=31400, lr=0.000356915, gnorm=0.189, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=29069 epoch 019: 1046 / 1689 loss=3.729, nll_loss=2.198, ppl=4.59, wps=545062, ups=1.1, wpb=494279, bsz=16375.4, num_updates=31400, lr=0.000356915, gnorm=0.189, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=29069 epoch 019: 1046 / 1689 loss=3.729, nll_loss=2.198, ppl=4.59, wps=545062, ups=1.1, wpb=494279, bsz=16375.4, num_updates=31400, lr=0.000356915, gnorm=0.189, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=29069 epoch 019: 1046 / 1689 loss=3.729, nll_loss=2.198, ppl=4.59, wps=545062, ups=1.1, wpb=494279, bsz=16375.4, num_updates=31400, lr=0.000356915, gnorm=0.189, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=29069 epoch 019: 1046 / 1689 loss=3.729, nll_loss=2.198, ppl=4.59, wps=545062, ups=1.1, wpb=494279, bsz=16375.4, num_updates=31400, lr=0.000356915, gnorm=0.189, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=29069 epoch 019: 1046 / 1689 loss=3.729, nll_loss=2.198, ppl=4.59, wps=545062, ups=1.1, wpb=494279, bsz=16375.4, num_updates=31400, lr=0.000356915, gnorm=0.189, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=29069 epoch 019: 1046 / 1689 loss=3.729, nll_loss=2.198, ppl=4.59, wps=545062, ups=1.1, wpb=494279, bsz=16375.4, num_updates=31400, lr=0.000356915, gnorm=0.189, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=29069 epoch 019: 1046 / 1689 loss=3.729, nll_loss=2.198, ppl=4.59, wps=545062, ups=1.1, wpb=494279, bsz=16375.4, num_updates=31400, lr=0.000356915, gnorm=0.189, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=29069 epoch 019: 1046 / 1689 loss=3.729, nll_loss=2.198, ppl=4.59, wps=545062, ups=1.1, wpb=494279, bsz=16375.4, num_updates=31400, lr=0.000356915, gnorm=0.189, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=29069 epoch 019: 1046 / 1689 loss=3.729, nll_loss=2.198, ppl=4.59, wps=545062, ups=1.1, wpb=494279, bsz=16375.4, num_updates=31400, lr=0.000356915, gnorm=0.189, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=29069 epoch 019: 1046 / 1689 loss=3.729, nll_loss=2.198, ppl=4.59, wps=545062, ups=1.1, wpb=494279, bsz=16375.4, num_updates=31400, lr=0.000356915, gnorm=0.189, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=29069 epoch 019: 1146 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=552623, ups=1.12, wpb=493574, bsz=16341.2, num_updates=31500, lr=0.000356348, gnorm=0.184, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=29159 epoch 019: 1146 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=552623, ups=1.12, wpb=493574, bsz=16341.2, num_updates=31500, lr=0.000356348, gnorm=0.184, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=29159 epoch 019: 1146 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=552623, ups=1.12, wpb=493574, bsz=16341.2, num_updates=31500, lr=0.000356348, gnorm=0.184, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=29159 epoch 019: 1146 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=552623, ups=1.12, wpb=493574, bsz=16341.2, num_updates=31500, lr=0.000356348, gnorm=0.184, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=29159 epoch 019: 1146 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=552623, ups=1.12, wpb=493574, bsz=16341.2, num_updates=31500, lr=0.000356348, gnorm=0.184, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=29159 epoch 019: 1146 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=552623, ups=1.12, wpb=493574, bsz=16341.2, num_updates=31500, lr=0.000356348, gnorm=0.184, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=29159 epoch 019: 1146 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=552623, ups=1.12, wpb=493574, bsz=16341.2, num_updates=31500, lr=0.000356348, gnorm=0.184, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=29159 epoch 019: 1146 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=552623, ups=1.12, wpb=493574, bsz=16341.2, num_updates=31500, lr=0.000356348, gnorm=0.184, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=29159 epoch 019: 1146 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=552623, ups=1.12, wpb=493574, bsz=16341.2, num_updates=31500, lr=0.000356348, gnorm=0.184, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=29159 epoch 019: 1146 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=552623, ups=1.12, wpb=493574, bsz=16341.2, num_updates=31500, lr=0.000356348, gnorm=0.184, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=29159 epoch 019: 1146 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=552623, ups=1.12, wpb=493574, bsz=16341.2, num_updates=31500, lr=0.000356348, gnorm=0.184, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=29159 epoch 019: 1146 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=552623, ups=1.12, wpb=493574, bsz=16341.2, num_updates=31500, lr=0.000356348, gnorm=0.184, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=29159 epoch 019: 1146 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=552623, ups=1.12, wpb=493574, bsz=16341.2, num_updates=31500, lr=0.000356348, gnorm=0.184, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=29159 epoch 019: 1146 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=552623, ups=1.12, wpb=493574, bsz=16341.2, num_updates=31500, lr=0.000356348, gnorm=0.184, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=29159 epoch 019: 1146 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=552623, ups=1.12, wpb=493574, bsz=16341.2, num_updates=31500, lr=0.000356348, gnorm=0.184, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=29159 epoch 019: 1146 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=552623, ups=1.12, wpb=493574, bsz=16341.2, num_updates=31500, lr=0.000356348, gnorm=0.184, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=29159 epoch 019: 1146 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=552623, ups=1.12, wpb=493574, bsz=16341.2, num_updates=31500, lr=0.000356348, gnorm=0.184, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=29159 epoch 019: 1146 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=552623, ups=1.12, wpb=493574, bsz=16341.2, num_updates=31500, lr=0.000356348, gnorm=0.184, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=29159 epoch 019: 1146 / 1689 loss=3.734, nll_loss=2.203, ppl=4.6, wps=552623, ups=1.12, wpb=493574, bsz=16341.2, num_updates=31500, lr=0.000356348, gnorm=0.184, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=29159 epoch 019: 1246 / 1689 loss=3.737, nll_loss=2.206, ppl=4.61, wps=548046, ups=1.11, wpb=494182, bsz=16823.3, num_updates=31600, lr=0.000355784, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=29249 epoch 019: 1246 / 1689 loss=3.737, nll_loss=2.206, ppl=4.61, wps=548046, ups=1.11, wpb=494182, bsz=16823.3, num_updates=31600, lr=0.000355784, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=29249 epoch 019: 1246 / 1689 loss=3.737, nll_loss=2.206, ppl=4.61, wps=548046, ups=1.11, wpb=494182, bsz=16823.3, num_updates=31600, lr=0.000355784, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=29249 epoch 019: 1246 / 1689 loss=3.737, nll_loss=2.206, ppl=4.61, wps=548046, ups=1.11, wpb=494182, bsz=16823.3, num_updates=31600, lr=0.000355784, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=29249 epoch 019: 1246 / 1689 loss=3.737, nll_loss=2.206, ppl=4.61, wps=548046, ups=1.11, wpb=494182, bsz=16823.3, num_updates=31600, lr=0.000355784, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=29249 epoch 019: 1246 / 1689 loss=3.737, nll_loss=2.206, ppl=4.61, wps=548046, ups=1.11, wpb=494182, bsz=16823.3, num_updates=31600, lr=0.000355784, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=29249 epoch 019: 1246 / 1689 loss=3.737, nll_loss=2.206, ppl=4.61, wps=548046, ups=1.11, wpb=494182, bsz=16823.3, num_updates=31600, lr=0.000355784, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=29249 epoch 019: 1246 / 1689 loss=3.737, nll_loss=2.206, ppl=4.61, wps=548046, ups=1.11, wpb=494182, bsz=16823.3, num_updates=31600, lr=0.000355784, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=29249 epoch 019: 1246 / 1689 loss=3.737, nll_loss=2.206, ppl=4.61, wps=548046, ups=1.11, wpb=494182, bsz=16823.3, num_updates=31600, lr=0.000355784, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=29249 epoch 019: 1246 / 1689 loss=3.737, nll_loss=2.206, ppl=4.61, wps=548046, ups=1.11, wpb=494182, bsz=16823.3, num_updates=31600, lr=0.000355784, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=29249 epoch 019: 1246 / 1689 loss=3.737, nll_loss=2.206, ppl=4.61, wps=548046, ups=1.11, wpb=494182, bsz=16823.3, num_updates=31600, lr=0.000355784, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=29249 epoch 019: 1246 / 1689 loss=3.737, nll_loss=2.206, ppl=4.61, wps=548046, ups=1.11, wpb=494182, bsz=16823.3, num_updates=31600, lr=0.000355784, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=29249 epoch 019: 1246 / 1689 loss=3.737, nll_loss=2.206, ppl=4.61, wps=548046, ups=1.11, wpb=494182, bsz=16823.3, num_updates=31600, lr=0.000355784, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=29249 epoch 019: 1246 / 1689 loss=3.737, nll_loss=2.206, ppl=4.61, wps=548046, ups=1.11, wpb=494182, bsz=16823.3, num_updates=31600, lr=0.000355784, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=29249 epoch 019: 1246 / 1689 loss=3.737, nll_loss=2.206, ppl=4.61, wps=548046, ups=1.11, wpb=494182, bsz=16823.3, num_updates=31600, lr=0.000355784, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=29249 epoch 019: 1246 / 1689 loss=3.737, nll_loss=2.206, ppl=4.61, wps=548046, ups=1.11, wpb=494182, bsz=16823.3, num_updates=31600, lr=0.000355784, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=29249 epoch 019: 1246 / 1689 loss=3.737, nll_loss=2.206, ppl=4.61, wps=548046, ups=1.11, wpb=494182, bsz=16823.3, num_updates=31600, lr=0.000355784, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=29249 epoch 019: 1246 / 1689 loss=3.737, nll_loss=2.206, ppl=4.61, wps=548046, ups=1.11, wpb=494182, bsz=16823.3, num_updates=31600, lr=0.000355784, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=29249 epoch 019: 1246 / 1689 loss=3.737, nll_loss=2.206, ppl=4.61, wps=548046, ups=1.11, wpb=494182, bsz=16823.3, num_updates=31600, lr=0.000355784, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=29249 epoch 019: 1346 / 1689 loss=3.727, nll_loss=2.196, ppl=4.58, wps=553252, ups=1.12, wpb=495349, bsz=16707.6, num_updates=31700, lr=0.000355222, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=29338 epoch 019: 1346 / 1689 loss=3.727, nll_loss=2.196, ppl=4.58, wps=553252, ups=1.12, wpb=495349, bsz=16707.6, num_updates=31700, lr=0.000355222, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=29338 epoch 019: 1346 / 1689 loss=3.727, nll_loss=2.196, ppl=4.58, wps=553252, ups=1.12, wpb=495349, bsz=16707.6, num_updates=31700, lr=0.000355222, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=29338 epoch 019: 1346 / 1689 loss=3.727, nll_loss=2.196, ppl=4.58, wps=553252, ups=1.12, wpb=495349, bsz=16707.6, num_updates=31700, lr=0.000355222, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=29338 epoch 019: 1346 / 1689 loss=3.727, nll_loss=2.196, ppl=4.58, wps=553252, ups=1.12, wpb=495349, bsz=16707.6, num_updates=31700, lr=0.000355222, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=29338 epoch 019: 1346 / 1689 loss=3.727, nll_loss=2.196, ppl=4.58, wps=553252, ups=1.12, wpb=495349, bsz=16707.6, num_updates=31700, lr=0.000355222, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=29338 epoch 019: 1346 / 1689 loss=3.727, nll_loss=2.196, ppl=4.58, wps=553252, ups=1.12, wpb=495349, bsz=16707.6, num_updates=31700, lr=0.000355222, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=29338 epoch 019: 1346 / 1689 loss=3.727, nll_loss=2.196, ppl=4.58, wps=553252, ups=1.12, wpb=495349, bsz=16707.6, num_updates=31700, lr=0.000355222, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=29338 epoch 019: 1346 / 1689 loss=3.727, nll_loss=2.196, ppl=4.58, wps=553252, ups=1.12, wpb=495349, bsz=16707.6, num_updates=31700, lr=0.000355222, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=29338 epoch 019: 1346 / 1689 loss=3.727, nll_loss=2.196, ppl=4.58, wps=553252, ups=1.12, wpb=495349, bsz=16707.6, num_updates=31700, lr=0.000355222, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=29338 epoch 019: 1346 / 1689 loss=3.727, nll_loss=2.196, ppl=4.58, wps=553252, ups=1.12, wpb=495349, bsz=16707.6, num_updates=31700, lr=0.000355222, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=29338 epoch 019: 1346 / 1689 loss=3.727, nll_loss=2.196, ppl=4.58, wps=553252, ups=1.12, wpb=495349, bsz=16707.6, num_updates=31700, lr=0.000355222, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=29338 epoch 019: 1346 / 1689 loss=3.727, nll_loss=2.196, ppl=4.58, wps=553252, ups=1.12, wpb=495349, bsz=16707.6, num_updates=31700, lr=0.000355222, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=29338 epoch 019: 1346 / 1689 loss=3.727, nll_loss=2.196, ppl=4.58, wps=553252, ups=1.12, wpb=495349, bsz=16707.6, num_updates=31700, lr=0.000355222, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=29338 epoch 019: 1346 / 1689 loss=3.727, nll_loss=2.196, ppl=4.58, wps=553252, ups=1.12, wpb=495349, bsz=16707.6, num_updates=31700, lr=0.000355222, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=29338 epoch 019: 1346 / 1689 loss=3.727, nll_loss=2.196, ppl=4.58, wps=553252, ups=1.12, wpb=495349, bsz=16707.6, num_updates=31700, lr=0.000355222, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=29338 epoch 019: 1346 / 1689 loss=3.727, nll_loss=2.196, ppl=4.58, wps=553252, ups=1.12, wpb=495349, bsz=16707.6, num_updates=31700, lr=0.000355222, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=29338 epoch 019: 1346 / 1689 loss=3.727, nll_loss=2.196, ppl=4.58, wps=553252, ups=1.12, wpb=495349, bsz=16707.6, num_updates=31700, lr=0.000355222, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=29338 epoch 019: 1346 / 1689 loss=3.727, nll_loss=2.196, ppl=4.58, wps=553252, ups=1.12, wpb=495349, bsz=16707.6, num_updates=31700, lr=0.000355222, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=29338 epoch 019: 1446 / 1689 loss=3.727, nll_loss=2.195, ppl=4.58, wps=550799, ups=1.11, wpb=495088, bsz=16298.3, num_updates=31800, lr=0.000354663, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=22.7, wall=29428 epoch 019: 1446 / 1689 loss=3.727, nll_loss=2.195, ppl=4.58, wps=550799, ups=1.11, wpb=495088, bsz=16298.3, num_updates=31800, lr=0.000354663, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=22.7, wall=29428 epoch 019: 1446 / 1689 loss=3.727, nll_loss=2.195, ppl=4.58, wps=550799, ups=1.11, wpb=495088, bsz=16298.3, num_updates=31800, lr=0.000354663, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=22.7, wall=29428 epoch 019: 1446 / 1689 loss=3.727, nll_loss=2.195, ppl=4.58, wps=550799, ups=1.11, wpb=495088, bsz=16298.3, num_updates=31800, lr=0.000354663, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=22.7, wall=29428 epoch 019: 1446 / 1689 loss=3.727, nll_loss=2.195, ppl=4.58, wps=550799, ups=1.11, wpb=495088, bsz=16298.3, num_updates=31800, lr=0.000354663, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=22.7, wall=29428 epoch 019: 1446 / 1689 loss=3.727, nll_loss=2.195, ppl=4.58, wps=550799, ups=1.11, wpb=495088, bsz=16298.3, num_updates=31800, lr=0.000354663, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=22.7, wall=29428 epoch 019: 1446 / 1689 loss=3.727, nll_loss=2.195, ppl=4.58, wps=550799, ups=1.11, wpb=495088, bsz=16298.3, num_updates=31800, lr=0.000354663, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=22.7, wall=29428 epoch 019: 1446 / 1689 loss=3.727, nll_loss=2.195, ppl=4.58, wps=550799, ups=1.11, wpb=495088, bsz=16298.3, num_updates=31800, lr=0.000354663, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=22.7, wall=29428 epoch 019: 1446 / 1689 loss=3.727, nll_loss=2.195, ppl=4.58, wps=550799, ups=1.11, wpb=495088, bsz=16298.3, num_updates=31800, lr=0.000354663, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=22.7, wall=29428 epoch 019: 1446 / 1689 loss=3.727, nll_loss=2.195, ppl=4.58, wps=550799, ups=1.11, wpb=495088, bsz=16298.3, num_updates=31800, lr=0.000354663, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=22.7, wall=29428 epoch 019: 1446 / 1689 loss=3.727, nll_loss=2.195, ppl=4.58, wps=550799, ups=1.11, wpb=495088, bsz=16298.3, num_updates=31800, lr=0.000354663, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=22.7, wall=29428 epoch 019: 1446 / 1689 loss=3.727, nll_loss=2.195, ppl=4.58, wps=550799, ups=1.11, wpb=495088, bsz=16298.3, num_updates=31800, lr=0.000354663, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=22.7, wall=29428 epoch 019: 1446 / 1689 loss=3.727, nll_loss=2.195, ppl=4.58, wps=550799, ups=1.11, wpb=495088, bsz=16298.3, num_updates=31800, lr=0.000354663, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=22.7, wall=29428 epoch 019: 1446 / 1689 loss=3.727, nll_loss=2.195, ppl=4.58, wps=550799, ups=1.11, wpb=495088, bsz=16298.3, num_updates=31800, lr=0.000354663, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=22.7, wall=29428 epoch 019: 1446 / 1689 loss=3.727, nll_loss=2.195, ppl=4.58, wps=550799, ups=1.11, wpb=495088, bsz=16298.3, num_updates=31800, lr=0.000354663, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=22.7, wall=29428 epoch 019: 1446 / 1689 loss=3.727, nll_loss=2.195, ppl=4.58, wps=550799, ups=1.11, wpb=495088, bsz=16298.3, num_updates=31800, lr=0.000354663, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=22.7, wall=29428 epoch 019: 1446 / 1689 loss=3.727, nll_loss=2.195, ppl=4.58, wps=550799, ups=1.11, wpb=495088, bsz=16298.3, num_updates=31800, lr=0.000354663, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=22.7, wall=29428 epoch 019: 1446 / 1689 loss=3.727, nll_loss=2.195, ppl=4.58, wps=550799, ups=1.11, wpb=495088, bsz=16298.3, num_updates=31800, lr=0.000354663, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=22.7, wall=29428 epoch 019: 1446 / 1689 loss=3.727, nll_loss=2.195, ppl=4.58, wps=550799, ups=1.11, wpb=495088, bsz=16298.3, num_updates=31800, lr=0.000354663, gnorm=0.181, clip=0, loss_scale=2, train_wall=88, gb_free=22.7, wall=29428 epoch 019: 1546 / 1689 loss=3.726, nll_loss=2.194, ppl=4.58, wps=547103, ups=1.1, wpb=495414, bsz=16188.5, num_updates=31900, lr=0.000354107, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=29519 epoch 019: 1546 / 1689 loss=3.726, nll_loss=2.194, ppl=4.58, wps=547103, ups=1.1, wpb=495414, bsz=16188.5, num_updates=31900, lr=0.000354107, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=29519 epoch 019: 1546 / 1689 loss=3.726, nll_loss=2.194, ppl=4.58, wps=547103, ups=1.1, wpb=495414, bsz=16188.5, num_updates=31900, lr=0.000354107, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=29519 epoch 019: 1546 / 1689 loss=3.726, nll_loss=2.194, ppl=4.58, wps=547103, ups=1.1, wpb=495414, bsz=16188.5, num_updates=31900, lr=0.000354107, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=29519 epoch 019: 1546 / 1689 loss=3.726, nll_loss=2.194, ppl=4.58, wps=547103, ups=1.1, wpb=495414, bsz=16188.5, num_updates=31900, lr=0.000354107, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=29519 epoch 019: 1546 / 1689 loss=3.726, nll_loss=2.194, ppl=4.58, wps=547103, ups=1.1, wpb=495414, bsz=16188.5, num_updates=31900, lr=0.000354107, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=29519 epoch 019: 1546 / 1689 loss=3.726, nll_loss=2.194, ppl=4.58, wps=547103, ups=1.1, wpb=495414, bsz=16188.5, num_updates=31900, lr=0.000354107, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=29519 epoch 019: 1546 / 1689 loss=3.726, nll_loss=2.194, ppl=4.58, wps=547103, ups=1.1, wpb=495414, bsz=16188.5, num_updates=31900, lr=0.000354107, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=29519 epoch 019: 1546 / 1689 loss=3.726, nll_loss=2.194, ppl=4.58, wps=547103, ups=1.1, wpb=495414, bsz=16188.5, num_updates=31900, lr=0.000354107, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=29519 epoch 019: 1546 / 1689 loss=3.726, nll_loss=2.194, ppl=4.58, wps=547103, ups=1.1, wpb=495414, bsz=16188.5, num_updates=31900, lr=0.000354107, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=29519 epoch 019: 1546 / 1689 loss=3.726, nll_loss=2.194, ppl=4.58, wps=547103, ups=1.1, wpb=495414, bsz=16188.5, num_updates=31900, lr=0.000354107, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=29519 epoch 019: 1546 / 1689 loss=3.726, nll_loss=2.194, ppl=4.58, wps=547103, ups=1.1, wpb=495414, bsz=16188.5, num_updates=31900, lr=0.000354107, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=29519 epoch 019: 1546 / 1689 loss=3.726, nll_loss=2.194, ppl=4.58, wps=547103, ups=1.1, wpb=495414, bsz=16188.5, num_updates=31900, lr=0.000354107, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=29519 epoch 019: 1546 / 1689 loss=3.726, nll_loss=2.194, ppl=4.58, wps=547103, ups=1.1, wpb=495414, bsz=16188.5, num_updates=31900, lr=0.000354107, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=29519 epoch 019: 1546 / 1689 loss=3.726, nll_loss=2.194, ppl=4.58, wps=547103, ups=1.1, wpb=495414, bsz=16188.5, num_updates=31900, lr=0.000354107, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=29519 epoch 019: 1546 / 1689 loss=3.726, nll_loss=2.194, ppl=4.58, wps=547103, ups=1.1, wpb=495414, bsz=16188.5, num_updates=31900, lr=0.000354107, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=29519 epoch 019: 1546 / 1689 loss=3.726, nll_loss=2.194, ppl=4.58, wps=547103, ups=1.1, wpb=495414, bsz=16188.5, num_updates=31900, lr=0.000354107, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=29519 epoch 019: 1546 / 1689 loss=3.726, nll_loss=2.194, ppl=4.58, wps=547103, ups=1.1, wpb=495414, bsz=16188.5, num_updates=31900, lr=0.000354107, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=29519 epoch 019: 1546 / 1689 loss=3.726, nll_loss=2.194, ppl=4.58, wps=547103, ups=1.1, wpb=495414, bsz=16188.5, num_updates=31900, lr=0.000354107, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=29519 epoch 019: 1646 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=547554, ups=1.11, wpb=495197, bsz=16572.2, num_updates=32000, lr=0.000353553, gnorm=0.18, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=29609 epoch 019: 1646 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=547554, ups=1.11, wpb=495197, bsz=16572.2, num_updates=32000, lr=0.000353553, gnorm=0.18, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=29609 epoch 019: 1646 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=547554, ups=1.11, wpb=495197, bsz=16572.2, num_updates=32000, lr=0.000353553, gnorm=0.18, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=29609 epoch 019: 1646 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=547554, ups=1.11, wpb=495197, bsz=16572.2, num_updates=32000, lr=0.000353553, gnorm=0.18, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=29609 epoch 019: 1646 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=547554, ups=1.11, wpb=495197, bsz=16572.2, num_updates=32000, lr=0.000353553, gnorm=0.18, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=29609 epoch 019: 1646 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=547554, ups=1.11, wpb=495197, bsz=16572.2, num_updates=32000, lr=0.000353553, gnorm=0.18, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=29609 epoch 019: 1646 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=547554, ups=1.11, wpb=495197, bsz=16572.2, num_updates=32000, lr=0.000353553, gnorm=0.18, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=29609 epoch 019: 1646 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=547554, ups=1.11, wpb=495197, bsz=16572.2, num_updates=32000, lr=0.000353553, gnorm=0.18, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=29609 epoch 019: 1646 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=547554, ups=1.11, wpb=495197, bsz=16572.2, num_updates=32000, lr=0.000353553, gnorm=0.18, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=29609 epoch 019: 1646 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=547554, ups=1.11, wpb=495197, bsz=16572.2, num_updates=32000, lr=0.000353553, gnorm=0.18, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=29609 epoch 019: 1646 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=547554, ups=1.11, wpb=495197, bsz=16572.2, num_updates=32000, lr=0.000353553, gnorm=0.18, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=29609 epoch 019: 1646 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=547554, ups=1.11, wpb=495197, bsz=16572.2, num_updates=32000, lr=0.000353553, gnorm=0.18, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=29609 epoch 019: 1646 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=547554, ups=1.11, wpb=495197, bsz=16572.2, num_updates=32000, lr=0.000353553, gnorm=0.18, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=29609 epoch 019: 1646 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=547554, ups=1.11, wpb=495197, bsz=16572.2, num_updates=32000, lr=0.000353553, gnorm=0.18, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=29609 epoch 019: 1646 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=547554, ups=1.11, wpb=495197, bsz=16572.2, num_updates=32000, lr=0.000353553, gnorm=0.18, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=29609 epoch 019: 1646 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=547554, ups=1.11, wpb=495197, bsz=16572.2, num_updates=32000, lr=0.000353553, gnorm=0.18, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=29609 epoch 019: 1646 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=547554, ups=1.11, wpb=495197, bsz=16572.2, num_updates=32000, lr=0.000353553, gnorm=0.18, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=29609 epoch 019: 1646 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=547554, ups=1.11, wpb=495197, bsz=16572.2, num_updates=32000, lr=0.000353553, gnorm=0.18, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=29609 epoch 019: 1646 / 1689 loss=3.73, nll_loss=2.199, ppl=4.59, wps=547554, ups=1.11, wpb=495197, bsz=16572.2, num_updates=32000, lr=0.000353553, gnorm=0.18, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=29609 begin validation on "valid" subset epoch 019 | valid on 'valid' subset | loss 3.73 | nll_loss 2.165 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.73 epoch 019 | valid on 'valid' subset | loss 3.73 | nll_loss 2.165 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.73 epoch 019 | valid on 'valid' subset | loss 3.73 | nll_loss 2.165 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.73 epoch 019 | valid on 'valid' subset | loss 3.73 | nll_loss 2.165 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.73 epoch 019 | valid on 'valid' subset | loss 3.73 | nll_loss 2.165 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.73 epoch 019 | valid on 'valid' subset | loss 3.73 | nll_loss 2.165 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.73 epoch 019 | valid on 'valid' subset | loss 3.73 | nll_loss 2.165 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.73 epoch 019 | valid on 'valid' subset | loss 3.73 | nll_loss 2.165 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.73 epoch 019 | valid on 'valid' subset | loss 3.73 | nll_loss 2.165 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.73 epoch 019 | valid on 'valid' subset | loss 3.73 | nll_loss 2.165 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.73 epoch 019 | valid on 'valid' subset | loss 3.73 | nll_loss 2.165 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.73 epoch 019 | valid on 'valid' subset | loss 3.73 | nll_loss 2.165 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.73 epoch 019 | valid on 'valid' subset | loss 3.73 | nll_loss 2.165 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.73 epoch 019 | valid on 'valid' subset | loss 3.73 | nll_loss 2.165 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.73 epoch 019 | valid on 'valid' subset | loss 3.73 | nll_loss 2.165 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.73 epoch 019 | valid on 'valid' subset | loss 3.73 | nll_loss 2.165 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.73 epoch 019 | valid on 'valid' subset | loss 3.73 | nll_loss 2.165 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.73 epoch 019 | valid on 'valid' subset | loss 3.73 | nll_loss 2.165 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.73 epoch 019 | valid on 'valid' subset | loss 3.73 | nll_loss 2.165 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 32000 | best_loss 3.73 end of epoch 19 (average epoch stats below) epoch 019 | loss 3.726 | nll_loss 2.194 | ppl 4.58 | wps 529308 | ups 1.07 | wpb 495122 | bsz 16507 | num_updates 32043 | lr 0.000353316 | gnorm 0.179 | clip 0 | loss_scale 4 | train_wall 1498 | gb_free 23.2 | wall 29666 epoch 019 | loss 3.726 | nll_loss 2.194 | ppl 4.58 | wps 529308 | ups 1.07 | wpb 495122 | bsz 16507 | num_updates 32043 | lr 0.000353316 | gnorm 0.179 | clip 0 | loss_scale 4 | train_wall 1498 | gb_free 23.2 | wall 29666 epoch 019 | loss 3.726 | nll_loss 2.194 | ppl 4.58 | wps 529308 | ups 1.07 | wpb 495122 | bsz 16507 | num_updates 32043 | lr 0.000353316 | gnorm 0.179 | clip 0 | loss_scale 4 | train_wall 1498 | gb_free 23.2 | wall 29666 epoch 019 | loss 3.726 | nll_loss 2.194 | ppl 4.58 | wps 529308 | ups 1.07 | wpb 495122 | bsz 16507 | num_updates 32043 | lr 0.000353316 | gnorm 0.179 | clip 0 | loss_scale 4 | train_wall 1498 | gb_free 23.2 | wall 29666 epoch 019 | loss 3.726 | nll_loss 2.194 | ppl 4.58 | wps 529308 | ups 1.07 | wpb 495122 | bsz 16507 | num_updates 32043 | lr 0.000353316 | gnorm 0.179 | clip 0 | loss_scale 4 | train_wall 1498 | gb_free 23.2 | wall 29666 epoch 019 | loss 3.726 | nll_loss 2.194 | ppl 4.58 | wps 529308 | ups 1.07 | wpb 495122 | bsz 16507 | num_updates 32043 | lr 0.000353316 | gnorm 0.179 | clip 0 | loss_scale 4 | train_wall 1498 | gb_free 23.2 | wall 29666 epoch 019 | loss 3.726 | nll_loss 2.194 | ppl 4.58 | wps 529308 | ups 1.07 | wpb 495122 | bsz 16507 | num_updates 32043 | lr 0.000353316 | gnorm 0.179 | clip 0 | loss_scale 4 | train_wall 1498 | gb_free 23.2 | wall 29666 epoch 019 | loss 3.726 | nll_loss 2.194 | ppl 4.58 | wps 529308 | ups 1.07 | wpb 495122 | bsz 16507 | num_updates 32043 | lr 0.000353316 | gnorm 0.179 | clip 0 | loss_scale 4 | train_wall 1498 | gb_free 23.2 | wall 29666 epoch 019 | loss 3.726 | nll_loss 2.194 | ppl 4.58 | wps 529308 | ups 1.07 | wpb 495122 | bsz 16507 | num_updates 32043 | lr 0.000353316 | gnorm 0.179 | clip 0 | loss_scale 4 | train_wall 1498 | gb_free 23.2 | wall 29666 epoch 019 | loss 3.726 | nll_loss 2.194 | ppl 4.58 | wps 529308 | ups 1.07 | wpb 495122 | bsz 16507 | num_updates 32043 | lr 0.000353316 | gnorm 0.179 | clip 0 | loss_scale 4 | train_wall 1498 | gb_free 23.2 | wall 29666 epoch 019 | loss 3.726 | nll_loss 2.194 | ppl 4.58 | wps 529308 | ups 1.07 | wpb 495122 | bsz 16507 | num_updates 32043 | lr 0.000353316 | gnorm 0.179 | clip 0 | loss_scale 4 | train_wall 1498 | gb_free 23.2 | wall 29666 epoch 019 | loss 3.726 | nll_loss 2.194 | ppl 4.58 | wps 529308 | ups 1.07 | wpb 495122 | bsz 16507 | num_updates 32043 | lr 0.000353316 | gnorm 0.179 | clip 0 | loss_scale 4 | train_wall 1498 | gb_free 23.2 | wall 29666 epoch 019 | loss 3.726 | nll_loss 2.194 | ppl 4.58 | wps 529308 | ups 1.07 | wpb 495122 | bsz 16507 | num_updates 32043 | lr 0.000353316 | gnorm 0.179 | clip 0 | loss_scale 4 | train_wall 1498 | gb_free 23.2 | wall 29666 epoch 019 | loss 3.726 | nll_loss 2.194 | ppl 4.58 | wps 529308 | ups 1.07 | wpb 495122 | bsz 16507 | num_updates 32043 | lr 0.000353316 | gnorm 0.179 | clip 0 | loss_scale 4 | train_wall 1498 | gb_free 23.2 | wall 29666 epoch 019 | loss 3.726 | nll_loss 2.194 | ppl 4.58 | wps 529308 | ups 1.07 | wpb 495122 | bsz 16507 | num_updates 32043 | lr 0.000353316 | gnorm 0.179 | clip 0 | loss_scale 4 | train_wall 1498 | gb_free 23.2 | wall 29666 epoch 019 | loss 3.726 | nll_loss 2.194 | ppl 4.58 | wps 529308 | ups 1.07 | wpb 495122 | bsz 16507 | num_updates 32043 | lr 0.000353316 | gnorm 0.179 | clip 0 | loss_scale 4 | train_wall 1498 | gb_free 23.2 | wall 29666 epoch 019 | loss 3.726 | nll_loss 2.194 | ppl 4.58 | wps 529308 | ups 1.07 | wpb 495122 | bsz 16507 | num_updates 32043 | lr 0.000353316 | gnorm 0.179 | clip 0 | loss_scale 4 | train_wall 1498 | gb_free 23.2 | wall 29666 epoch 019 | loss 3.726 | nll_loss 2.194 | ppl 4.58 | wps 529308 | ups 1.07 | wpb 495122 | bsz 16507 | num_updates 32043 | lr 0.000353316 | gnorm 0.179 | clip 0 | loss_scale 4 | train_wall 1498 | gb_free 23.2 | wall 29666 epoch 019 | loss 3.726 | nll_loss 2.194 | ppl 4.58 | wps 529308 | ups 1.07 | wpb 495122 | bsz 16507 | num_updates 32043 | lr 0.000353316 | gnorm 0.179 | clip 0 | loss_scale 4 | train_wall 1498 | gb_free 23.2 | wall 29666 Start iterating over samples epoch 020: 57 / 1689 loss=3.711, nll_loss=2.177, ppl=4.52, wps=448591, ups=0.91, wpb=490830, bsz=16342.8, num_updates=32100, lr=0.000353002, gnorm=0.188, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=29719 epoch 020: 57 / 1689 loss=3.711, nll_loss=2.177, ppl=4.52, wps=448591, ups=0.91, wpb=490830, bsz=16342.8, num_updates=32100, lr=0.000353002, gnorm=0.188, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=29719 epoch 020: 57 / 1689 loss=3.711, nll_loss=2.177, ppl=4.52, wps=448591, ups=0.91, wpb=490830, bsz=16342.8, num_updates=32100, lr=0.000353002, gnorm=0.188, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=29719 epoch 020: 57 / 1689 loss=3.711, nll_loss=2.177, ppl=4.52, wps=448591, ups=0.91, wpb=490830, bsz=16342.8, num_updates=32100, lr=0.000353002, gnorm=0.188, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=29719 epoch 020: 57 / 1689 loss=3.711, nll_loss=2.177, ppl=4.52, wps=448591, ups=0.91, wpb=490830, bsz=16342.8, num_updates=32100, lr=0.000353002, gnorm=0.188, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=29719 epoch 020: 57 / 1689 loss=3.711, nll_loss=2.177, ppl=4.52, wps=448591, ups=0.91, wpb=490830, bsz=16342.8, num_updates=32100, lr=0.000353002, gnorm=0.188, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=29719 epoch 020: 57 / 1689 loss=3.711, nll_loss=2.177, ppl=4.52, wps=448591, ups=0.91, wpb=490830, bsz=16342.8, num_updates=32100, lr=0.000353002, gnorm=0.188, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=29719 epoch 020: 57 / 1689 loss=3.711, nll_loss=2.177, ppl=4.52, wps=448591, ups=0.91, wpb=490830, bsz=16342.8, num_updates=32100, lr=0.000353002, gnorm=0.188, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=29719 epoch 020: 57 / 1689 loss=3.711, nll_loss=2.177, ppl=4.52, wps=448591, ups=0.91, wpb=490830, bsz=16342.8, num_updates=32100, lr=0.000353002, gnorm=0.188, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=29719 epoch 020: 57 / 1689 loss=3.711, nll_loss=2.177, ppl=4.52, wps=448591, ups=0.91, wpb=490830, bsz=16342.8, num_updates=32100, lr=0.000353002, gnorm=0.188, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=29719 epoch 020: 57 / 1689 loss=3.711, nll_loss=2.177, ppl=4.52, wps=448591, ups=0.91, wpb=490830, bsz=16342.8, num_updates=32100, lr=0.000353002, gnorm=0.188, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=29719 epoch 020: 57 / 1689 loss=3.711, nll_loss=2.177, ppl=4.52, wps=448591, ups=0.91, wpb=490830, bsz=16342.8, num_updates=32100, lr=0.000353002, gnorm=0.188, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=29719 epoch 020: 57 / 1689 loss=3.711, nll_loss=2.177, ppl=4.52, wps=448591, ups=0.91, wpb=490830, bsz=16342.8, num_updates=32100, lr=0.000353002, gnorm=0.188, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=29719 epoch 020: 57 / 1689 loss=3.711, nll_loss=2.177, ppl=4.52, wps=448591, ups=0.91, wpb=490830, bsz=16342.8, num_updates=32100, lr=0.000353002, gnorm=0.188, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=29719 epoch 020: 57 / 1689 loss=3.711, nll_loss=2.177, ppl=4.52, wps=448591, ups=0.91, wpb=490830, bsz=16342.8, num_updates=32100, lr=0.000353002, gnorm=0.188, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=29719 epoch 020: 57 / 1689 loss=3.711, nll_loss=2.177, ppl=4.52, wps=448591, ups=0.91, wpb=490830, bsz=16342.8, num_updates=32100, lr=0.000353002, gnorm=0.188, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=29719 epoch 020: 57 / 1689 loss=3.711, nll_loss=2.177, ppl=4.52, wps=448591, ups=0.91, wpb=490830, bsz=16342.8, num_updates=32100, lr=0.000353002, gnorm=0.188, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=29719 epoch 020: 57 / 1689 loss=3.711, nll_loss=2.177, ppl=4.52, wps=448591, ups=0.91, wpb=490830, bsz=16342.8, num_updates=32100, lr=0.000353002, gnorm=0.188, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=29719 epoch 020: 57 / 1689 loss=3.711, nll_loss=2.177, ppl=4.52, wps=448591, ups=0.91, wpb=490830, bsz=16342.8, num_updates=32100, lr=0.000353002, gnorm=0.188, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=29719 epoch 020: 57 / 1689 loss=3.711, nll_loss=2.177, ppl=4.52, wps=448591, ups=0.91, wpb=490830, bsz=16342.8, num_updates=32100, lr=0.000353002, gnorm=0.188, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=29719 epoch 020: 158 / 1689 loss=3.713, nll_loss=2.179, ppl=4.53, wps=547508, ups=1.1, wpb=495927, bsz=16135.8, num_updates=32200, lr=0.000352454, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=20.6, wall=29809 epoch 020: 158 / 1689 loss=3.713, nll_loss=2.179, ppl=4.53, wps=547508, ups=1.1, wpb=495927, bsz=16135.8, num_updates=32200, lr=0.000352454, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=20.6, wall=29809 epoch 020: 158 / 1689 loss=3.713, nll_loss=2.179, ppl=4.53, wps=547508, ups=1.1, wpb=495927, bsz=16135.8, num_updates=32200, lr=0.000352454, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=20.6, wall=29809 epoch 020: 158 / 1689 loss=3.713, nll_loss=2.179, ppl=4.53, wps=547508, ups=1.1, wpb=495927, bsz=16135.8, num_updates=32200, lr=0.000352454, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=20.6, wall=29809 epoch 020: 158 / 1689 loss=3.713, nll_loss=2.179, ppl=4.53, wps=547508, ups=1.1, wpb=495927, bsz=16135.8, num_updates=32200, lr=0.000352454, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=20.6, wall=29809 epoch 020: 158 / 1689 loss=3.713, nll_loss=2.179, ppl=4.53, wps=547508, ups=1.1, wpb=495927, bsz=16135.8, num_updates=32200, lr=0.000352454, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=20.6, wall=29809 epoch 020: 158 / 1689 loss=3.713, nll_loss=2.179, ppl=4.53, wps=547508, ups=1.1, wpb=495927, bsz=16135.8, num_updates=32200, lr=0.000352454, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=20.6, wall=29809 epoch 020: 158 / 1689 loss=3.713, nll_loss=2.179, ppl=4.53, wps=547508, ups=1.1, wpb=495927, bsz=16135.8, num_updates=32200, lr=0.000352454, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=20.6, wall=29809 epoch 020: 158 / 1689 loss=3.713, nll_loss=2.179, ppl=4.53, wps=547508, ups=1.1, wpb=495927, bsz=16135.8, num_updates=32200, lr=0.000352454, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=20.6, wall=29809 epoch 020: 158 / 1689 loss=3.713, nll_loss=2.179, ppl=4.53, wps=547508, ups=1.1, wpb=495927, bsz=16135.8, num_updates=32200, lr=0.000352454, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=20.6, wall=29809 epoch 020: 158 / 1689 loss=3.713, nll_loss=2.179, ppl=4.53, wps=547508, ups=1.1, wpb=495927, bsz=16135.8, num_updates=32200, lr=0.000352454, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=20.6, wall=29809 epoch 020: 158 / 1689 loss=3.713, nll_loss=2.179, ppl=4.53, wps=547508, ups=1.1, wpb=495927, bsz=16135.8, num_updates=32200, lr=0.000352454, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=20.6, wall=29809 epoch 020: 158 / 1689 loss=3.713, nll_loss=2.179, ppl=4.53, wps=547508, ups=1.1, wpb=495927, bsz=16135.8, num_updates=32200, lr=0.000352454, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=20.6, wall=29809 epoch 020: 158 / 1689 loss=3.713, nll_loss=2.179, ppl=4.53, wps=547508, ups=1.1, wpb=495927, bsz=16135.8, num_updates=32200, lr=0.000352454, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=20.6, wall=29809 epoch 020: 158 / 1689 loss=3.713, nll_loss=2.179, ppl=4.53, wps=547508, ups=1.1, wpb=495927, bsz=16135.8, num_updates=32200, lr=0.000352454, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=20.6, wall=29809 epoch 020: 158 / 1689 loss=3.713, nll_loss=2.179, ppl=4.53, wps=547508, ups=1.1, wpb=495927, bsz=16135.8, num_updates=32200, lr=0.000352454, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=20.6, wall=29809 epoch 020: 158 / 1689 loss=3.713, nll_loss=2.179, ppl=4.53, wps=547508, ups=1.1, wpb=495927, bsz=16135.8, num_updates=32200, lr=0.000352454, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=20.6, wall=29809 epoch 020: 158 / 1689 loss=3.713, nll_loss=2.179, ppl=4.53, wps=547508, ups=1.1, wpb=495927, bsz=16135.8, num_updates=32200, lr=0.000352454, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=20.6, wall=29809 epoch 020: 158 / 1689 loss=3.713, nll_loss=2.179, ppl=4.53, wps=547508, ups=1.1, wpb=495927, bsz=16135.8, num_updates=32200, lr=0.000352454, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=20.6, wall=29809 epoch 020: 158 / 1689 loss=3.713, nll_loss=2.179, ppl=4.53, wps=547508, ups=1.1, wpb=495927, bsz=16135.8, num_updates=32200, lr=0.000352454, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=20.6, wall=29809 epoch 020: 258 / 1689 loss=3.715, nll_loss=2.181, ppl=4.53, wps=554763, ups=1.12, wpb=495025, bsz=16457.7, num_updates=32300, lr=0.000351908, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=29899 epoch 020: 258 / 1689 loss=3.715, nll_loss=2.181, ppl=4.53, wps=554763, ups=1.12, wpb=495025, bsz=16457.7, num_updates=32300, lr=0.000351908, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=29899 epoch 020: 258 / 1689 loss=3.715, nll_loss=2.181, ppl=4.53, wps=554763, ups=1.12, wpb=495025, bsz=16457.7, num_updates=32300, lr=0.000351908, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=29899 epoch 020: 258 / 1689 loss=3.715, nll_loss=2.181, ppl=4.53, wps=554763, ups=1.12, wpb=495025, bsz=16457.7, num_updates=32300, lr=0.000351908, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=29899 epoch 020: 258 / 1689 loss=3.715, nll_loss=2.181, ppl=4.53, wps=554763, ups=1.12, wpb=495025, bsz=16457.7, num_updates=32300, lr=0.000351908, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=29899 epoch 020: 258 / 1689 loss=3.715, nll_loss=2.181, ppl=4.53, wps=554763, ups=1.12, wpb=495025, bsz=16457.7, num_updates=32300, lr=0.000351908, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=29899 epoch 020: 258 / 1689 loss=3.715, nll_loss=2.181, ppl=4.53, wps=554763, ups=1.12, wpb=495025, bsz=16457.7, num_updates=32300, lr=0.000351908, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=29899 epoch 020: 258 / 1689 loss=3.715, nll_loss=2.181, ppl=4.53, wps=554763, ups=1.12, wpb=495025, bsz=16457.7, num_updates=32300, lr=0.000351908, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=29899 epoch 020: 258 / 1689 loss=3.715, nll_loss=2.181, ppl=4.53, wps=554763, ups=1.12, wpb=495025, bsz=16457.7, num_updates=32300, lr=0.000351908, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=29899 epoch 020: 258 / 1689 loss=3.715, nll_loss=2.181, ppl=4.53, wps=554763, ups=1.12, wpb=495025, bsz=16457.7, num_updates=32300, lr=0.000351908, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=29899 epoch 020: 258 / 1689 loss=3.715, nll_loss=2.181, ppl=4.53, wps=554763, ups=1.12, wpb=495025, bsz=16457.7, num_updates=32300, lr=0.000351908, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=29899 epoch 020: 258 / 1689 loss=3.715, nll_loss=2.181, ppl=4.53, wps=554763, ups=1.12, wpb=495025, bsz=16457.7, num_updates=32300, lr=0.000351908, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=29899 epoch 020: 258 / 1689 loss=3.715, nll_loss=2.181, ppl=4.53, wps=554763, ups=1.12, wpb=495025, bsz=16457.7, num_updates=32300, lr=0.000351908, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=29899 epoch 020: 258 / 1689 loss=3.715, nll_loss=2.181, ppl=4.53, wps=554763, ups=1.12, wpb=495025, bsz=16457.7, num_updates=32300, lr=0.000351908, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=29899 epoch 020: 258 / 1689 loss=3.715, nll_loss=2.181, ppl=4.53, wps=554763, ups=1.12, wpb=495025, bsz=16457.7, num_updates=32300, lr=0.000351908, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=29899 epoch 020: 258 / 1689 loss=3.715, nll_loss=2.181, ppl=4.53, wps=554763, ups=1.12, wpb=495025, bsz=16457.7, num_updates=32300, lr=0.000351908, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=29899 epoch 020: 258 / 1689 loss=3.715, nll_loss=2.181, ppl=4.53, wps=554763, ups=1.12, wpb=495025, bsz=16457.7, num_updates=32300, lr=0.000351908, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=29899 epoch 020: 258 / 1689 loss=3.715, nll_loss=2.181, ppl=4.53, wps=554763, ups=1.12, wpb=495025, bsz=16457.7, num_updates=32300, lr=0.000351908, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=29899 epoch 020: 258 / 1689 loss=3.715, nll_loss=2.181, ppl=4.53, wps=554763, ups=1.12, wpb=495025, bsz=16457.7, num_updates=32300, lr=0.000351908, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=29899 epoch 020: 258 / 1689 loss=3.715, nll_loss=2.181, ppl=4.53, wps=554763, ups=1.12, wpb=495025, bsz=16457.7, num_updates=32300, lr=0.000351908, gnorm=0.189, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=29899 epoch 020: 358 / 1689 loss=3.708, nll_loss=2.174, ppl=4.51, wps=555598, ups=1.12, wpb=495064, bsz=16625.7, num_updates=32400, lr=0.000351364, gnorm=0.173, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=29988 epoch 020: 358 / 1689 loss=3.708, nll_loss=2.174, ppl=4.51, wps=555598, ups=1.12, wpb=495064, bsz=16625.7, num_updates=32400, lr=0.000351364, gnorm=0.173, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=29988 epoch 020: 358 / 1689 loss=3.708, nll_loss=2.174, ppl=4.51, wps=555598, ups=1.12, wpb=495064, bsz=16625.7, num_updates=32400, lr=0.000351364, gnorm=0.173, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=29988 epoch 020: 358 / 1689 loss=3.708, nll_loss=2.174, ppl=4.51, wps=555598, ups=1.12, wpb=495064, bsz=16625.7, num_updates=32400, lr=0.000351364, gnorm=0.173, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=29988 epoch 020: 358 / 1689 loss=3.708, nll_loss=2.174, ppl=4.51, wps=555598, ups=1.12, wpb=495064, bsz=16625.7, num_updates=32400, lr=0.000351364, gnorm=0.173, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=29988 epoch 020: 358 / 1689 loss=3.708, nll_loss=2.174, ppl=4.51, wps=555598, ups=1.12, wpb=495064, bsz=16625.7, num_updates=32400, lr=0.000351364, gnorm=0.173, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=29988 epoch 020: 358 / 1689 loss=3.708, nll_loss=2.174, ppl=4.51, wps=555598, ups=1.12, wpb=495064, bsz=16625.7, num_updates=32400, lr=0.000351364, gnorm=0.173, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=29988 epoch 020: 358 / 1689 loss=3.708, nll_loss=2.174, ppl=4.51, wps=555598, ups=1.12, wpb=495064, bsz=16625.7, num_updates=32400, lr=0.000351364, gnorm=0.173, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=29988 epoch 020: 358 / 1689 loss=3.708, nll_loss=2.174, ppl=4.51, wps=555598, ups=1.12, wpb=495064, bsz=16625.7, num_updates=32400, lr=0.000351364, gnorm=0.173, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=29988 epoch 020: 358 / 1689 loss=3.708, nll_loss=2.174, ppl=4.51, wps=555598, ups=1.12, wpb=495064, bsz=16625.7, num_updates=32400, lr=0.000351364, gnorm=0.173, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=29988 epoch 020: 358 / 1689 loss=3.708, nll_loss=2.174, ppl=4.51, wps=555598, ups=1.12, wpb=495064, bsz=16625.7, num_updates=32400, lr=0.000351364, gnorm=0.173, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=29988 epoch 020: 358 / 1689 loss=3.708, nll_loss=2.174, ppl=4.51, wps=555598, ups=1.12, wpb=495064, bsz=16625.7, num_updates=32400, lr=0.000351364, gnorm=0.173, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=29988 epoch 020: 358 / 1689 loss=3.708, nll_loss=2.174, ppl=4.51, wps=555598, ups=1.12, wpb=495064, bsz=16625.7, num_updates=32400, lr=0.000351364, gnorm=0.173, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=29988 epoch 020: 358 / 1689 loss=3.708, nll_loss=2.174, ppl=4.51, wps=555598, ups=1.12, wpb=495064, bsz=16625.7, num_updates=32400, lr=0.000351364, gnorm=0.173, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=29988 epoch 020: 358 / 1689 loss=3.708, nll_loss=2.174, ppl=4.51, wps=555598, ups=1.12, wpb=495064, bsz=16625.7, num_updates=32400, lr=0.000351364, gnorm=0.173, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=29988 epoch 020: 358 / 1689 loss=3.708, nll_loss=2.174, ppl=4.51, wps=555598, ups=1.12, wpb=495064, bsz=16625.7, num_updates=32400, lr=0.000351364, gnorm=0.173, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=29988 epoch 020: 358 / 1689 loss=3.708, nll_loss=2.174, ppl=4.51, wps=555598, ups=1.12, wpb=495064, bsz=16625.7, num_updates=32400, lr=0.000351364, gnorm=0.173, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=29988 epoch 020: 358 / 1689 loss=3.708, nll_loss=2.174, ppl=4.51, wps=555598, ups=1.12, wpb=495064, bsz=16625.7, num_updates=32400, lr=0.000351364, gnorm=0.173, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=29988 epoch 020: 358 / 1689 loss=3.708, nll_loss=2.174, ppl=4.51, wps=555598, ups=1.12, wpb=495064, bsz=16625.7, num_updates=32400, lr=0.000351364, gnorm=0.173, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=29988 epoch 020: 358 / 1689 loss=3.708, nll_loss=2.174, ppl=4.51, wps=555598, ups=1.12, wpb=495064, bsz=16625.7, num_updates=32400, lr=0.000351364, gnorm=0.173, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=29988 epoch 020: 458 / 1689 loss=3.712, nll_loss=2.178, ppl=4.52, wps=560340, ups=1.13, wpb=497389, bsz=16758.5, num_updates=32500, lr=0.000350823, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=30076 epoch 020: 458 / 1689 loss=3.712, nll_loss=2.178, ppl=4.52, wps=560340, ups=1.13, wpb=497389, bsz=16758.5, num_updates=32500, lr=0.000350823, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=30076 epoch 020: 458 / 1689 loss=3.712, nll_loss=2.178, ppl=4.52, wps=560340, ups=1.13, wpb=497389, bsz=16758.5, num_updates=32500, lr=0.000350823, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=30076 epoch 020: 458 / 1689 loss=3.712, nll_loss=2.178, ppl=4.52, wps=560340, ups=1.13, wpb=497389, bsz=16758.5, num_updates=32500, lr=0.000350823, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=30076 epoch 020: 458 / 1689 loss=3.712, nll_loss=2.178, ppl=4.52, wps=560340, ups=1.13, wpb=497389, bsz=16758.5, num_updates=32500, lr=0.000350823, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=30076 epoch 020: 458 / 1689 loss=3.712, nll_loss=2.178, ppl=4.52, wps=560340, ups=1.13, wpb=497389, bsz=16758.5, num_updates=32500, lr=0.000350823, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=30076 epoch 020: 458 / 1689 loss=3.712, nll_loss=2.178, ppl=4.52, wps=560340, ups=1.13, wpb=497389, bsz=16758.5, num_updates=32500, lr=0.000350823, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=30076 epoch 020: 458 / 1689 loss=3.712, nll_loss=2.178, ppl=4.52, wps=560340, ups=1.13, wpb=497389, bsz=16758.5, num_updates=32500, lr=0.000350823, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=30076 epoch 020: 458 / 1689 loss=3.712, nll_loss=2.178, ppl=4.52, wps=560340, ups=1.13, wpb=497389, bsz=16758.5, num_updates=32500, lr=0.000350823, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=30076 epoch 020: 458 / 1689 loss=3.712, nll_loss=2.178, ppl=4.52, wps=560340, ups=1.13, wpb=497389, bsz=16758.5, num_updates=32500, lr=0.000350823, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=30076 epoch 020: 458 / 1689 loss=3.712, nll_loss=2.178, ppl=4.52, wps=560340, ups=1.13, wpb=497389, bsz=16758.5, num_updates=32500, lr=0.000350823, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=30076 epoch 020: 458 / 1689 loss=3.712, nll_loss=2.178, ppl=4.52, wps=560340, ups=1.13, wpb=497389, bsz=16758.5, num_updates=32500, lr=0.000350823, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=30076 epoch 020: 458 / 1689 loss=3.712, nll_loss=2.178, ppl=4.52, wps=560340, ups=1.13, wpb=497389, bsz=16758.5, num_updates=32500, lr=0.000350823, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=30076 epoch 020: 458 / 1689 loss=3.712, nll_loss=2.178, ppl=4.52, wps=560340, ups=1.13, wpb=497389, bsz=16758.5, num_updates=32500, lr=0.000350823, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=30076 epoch 020: 458 / 1689 loss=3.712, nll_loss=2.178, ppl=4.52, wps=560340, ups=1.13, wpb=497389, bsz=16758.5, num_updates=32500, lr=0.000350823, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=30076 epoch 020: 458 / 1689 loss=3.712, nll_loss=2.178, ppl=4.52, wps=560340, ups=1.13, wpb=497389, bsz=16758.5, num_updates=32500, lr=0.000350823, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=30076 epoch 020: 458 / 1689 loss=3.712, nll_loss=2.178, ppl=4.52, wps=560340, ups=1.13, wpb=497389, bsz=16758.5, num_updates=32500, lr=0.000350823, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=30076 epoch 020: 458 / 1689 loss=3.712, nll_loss=2.178, ppl=4.52, wps=560340, ups=1.13, wpb=497389, bsz=16758.5, num_updates=32500, lr=0.000350823, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=30076 epoch 020: 458 / 1689 loss=3.712, nll_loss=2.178, ppl=4.52, wps=560340, ups=1.13, wpb=497389, bsz=16758.5, num_updates=32500, lr=0.000350823, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=30076 epoch 020: 458 / 1689 loss=3.712, nll_loss=2.178, ppl=4.52, wps=560340, ups=1.13, wpb=497389, bsz=16758.5, num_updates=32500, lr=0.000350823, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=30076 epoch 020: 558 / 1689 loss=3.723, nll_loss=2.19, ppl=4.56, wps=551212, ups=1.11, wpb=494948, bsz=16748.9, num_updates=32600, lr=0.000350285, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=30166 epoch 020: 558 / 1689 loss=3.723, nll_loss=2.19, ppl=4.56, wps=551212, ups=1.11, wpb=494948, bsz=16748.9, num_updates=32600, lr=0.000350285, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=30166 epoch 020: 558 / 1689 loss=3.723, nll_loss=2.19, ppl=4.56, wps=551212, ups=1.11, wpb=494948, bsz=16748.9, num_updates=32600, lr=0.000350285, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=30166 epoch 020: 558 / 1689 loss=3.723, nll_loss=2.19, ppl=4.56, wps=551212, ups=1.11, wpb=494948, bsz=16748.9, num_updates=32600, lr=0.000350285, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=30166 epoch 020: 558 / 1689 loss=3.723, nll_loss=2.19, ppl=4.56, wps=551212, ups=1.11, wpb=494948, bsz=16748.9, num_updates=32600, lr=0.000350285, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=30166 epoch 020: 558 / 1689 loss=3.723, nll_loss=2.19, ppl=4.56, wps=551212, ups=1.11, wpb=494948, bsz=16748.9, num_updates=32600, lr=0.000350285, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=30166 epoch 020: 558 / 1689 loss=3.723, nll_loss=2.19, ppl=4.56, wps=551212, ups=1.11, wpb=494948, bsz=16748.9, num_updates=32600, lr=0.000350285, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=30166 epoch 020: 558 / 1689 loss=3.723, nll_loss=2.19, ppl=4.56, wps=551212, ups=1.11, wpb=494948, bsz=16748.9, num_updates=32600, lr=0.000350285, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=30166 epoch 020: 558 / 1689 loss=3.723, nll_loss=2.19, ppl=4.56, wps=551212, ups=1.11, wpb=494948, bsz=16748.9, num_updates=32600, lr=0.000350285, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=30166 epoch 020: 558 / 1689 loss=3.723, nll_loss=2.19, ppl=4.56, wps=551212, ups=1.11, wpb=494948, bsz=16748.9, num_updates=32600, lr=0.000350285, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=30166 epoch 020: 558 / 1689 loss=3.723, nll_loss=2.19, ppl=4.56, wps=551212, ups=1.11, wpb=494948, bsz=16748.9, num_updates=32600, lr=0.000350285, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=30166 epoch 020: 558 / 1689 loss=3.723, nll_loss=2.19, ppl=4.56, wps=551212, ups=1.11, wpb=494948, bsz=16748.9, num_updates=32600, lr=0.000350285, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=30166 epoch 020: 558 / 1689 loss=3.723, nll_loss=2.19, ppl=4.56, wps=551212, ups=1.11, wpb=494948, bsz=16748.9, num_updates=32600, lr=0.000350285, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=30166 epoch 020: 558 / 1689 loss=3.723, nll_loss=2.19, ppl=4.56, wps=551212, ups=1.11, wpb=494948, bsz=16748.9, num_updates=32600, lr=0.000350285, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=30166 epoch 020: 558 / 1689 loss=3.723, nll_loss=2.19, ppl=4.56, wps=551212, ups=1.11, wpb=494948, bsz=16748.9, num_updates=32600, lr=0.000350285, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=30166 epoch 020: 558 / 1689 loss=3.723, nll_loss=2.19, ppl=4.56, wps=551212, ups=1.11, wpb=494948, bsz=16748.9, num_updates=32600, lr=0.000350285, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=30166 epoch 020: 558 / 1689 loss=3.723, nll_loss=2.19, ppl=4.56, wps=551212, ups=1.11, wpb=494948, bsz=16748.9, num_updates=32600, lr=0.000350285, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=30166 epoch 020: 558 / 1689 loss=3.723, nll_loss=2.19, ppl=4.56, wps=551212, ups=1.11, wpb=494948, bsz=16748.9, num_updates=32600, lr=0.000350285, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=30166 epoch 020: 558 / 1689 loss=3.723, nll_loss=2.19, ppl=4.56, wps=551212, ups=1.11, wpb=494948, bsz=16748.9, num_updates=32600, lr=0.000350285, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=30166 epoch 020: 558 / 1689 loss=3.723, nll_loss=2.19, ppl=4.56, wps=551212, ups=1.11, wpb=494948, bsz=16748.9, num_updates=32600, lr=0.000350285, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=30166 epoch 020: 659 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=545780, ups=1.1, wpb=494881, bsz=16327.7, num_updates=32700, lr=0.000349749, gnorm=0.174, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=30257 epoch 020: 659 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=545780, ups=1.1, wpb=494881, bsz=16327.7, num_updates=32700, lr=0.000349749, gnorm=0.174, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=30257 epoch 020: 659 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=545780, ups=1.1, wpb=494881, bsz=16327.7, num_updates=32700, lr=0.000349749, gnorm=0.174, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=30257 epoch 020: 659 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=545780, ups=1.1, wpb=494881, bsz=16327.7, num_updates=32700, lr=0.000349749, gnorm=0.174, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=30257 epoch 020: 659 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=545780, ups=1.1, wpb=494881, bsz=16327.7, num_updates=32700, lr=0.000349749, gnorm=0.174, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=30257 epoch 020: 659 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=545780, ups=1.1, wpb=494881, bsz=16327.7, num_updates=32700, lr=0.000349749, gnorm=0.174, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=30257 epoch 020: 659 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=545780, ups=1.1, wpb=494881, bsz=16327.7, num_updates=32700, lr=0.000349749, gnorm=0.174, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=30257 epoch 020: 659 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=545780, ups=1.1, wpb=494881, bsz=16327.7, num_updates=32700, lr=0.000349749, gnorm=0.174, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=30257 epoch 020: 659 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=545780, ups=1.1, wpb=494881, bsz=16327.7, num_updates=32700, lr=0.000349749, gnorm=0.174, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=30257 epoch 020: 659 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=545780, ups=1.1, wpb=494881, bsz=16327.7, num_updates=32700, lr=0.000349749, gnorm=0.174, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=30257 epoch 020: 659 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=545780, ups=1.1, wpb=494881, bsz=16327.7, num_updates=32700, lr=0.000349749, gnorm=0.174, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=30257 epoch 020: 659 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=545780, ups=1.1, wpb=494881, bsz=16327.7, num_updates=32700, lr=0.000349749, gnorm=0.174, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=30257 epoch 020: 659 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=545780, ups=1.1, wpb=494881, bsz=16327.7, num_updates=32700, lr=0.000349749, gnorm=0.174, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=30257 epoch 020: 659 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=545780, ups=1.1, wpb=494881, bsz=16327.7, num_updates=32700, lr=0.000349749, gnorm=0.174, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=30257 epoch 020: 659 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=545780, ups=1.1, wpb=494881, bsz=16327.7, num_updates=32700, lr=0.000349749, gnorm=0.174, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=30257 epoch 020: 659 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=545780, ups=1.1, wpb=494881, bsz=16327.7, num_updates=32700, lr=0.000349749, gnorm=0.174, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=30257 epoch 020: 659 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=545780, ups=1.1, wpb=494881, bsz=16327.7, num_updates=32700, lr=0.000349749, gnorm=0.174, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=30257 epoch 020: 659 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=545780, ups=1.1, wpb=494881, bsz=16327.7, num_updates=32700, lr=0.000349749, gnorm=0.174, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=30257 epoch 020: 659 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=545780, ups=1.1, wpb=494881, bsz=16327.7, num_updates=32700, lr=0.000349749, gnorm=0.174, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=30257 epoch 020: 659 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=545780, ups=1.1, wpb=494881, bsz=16327.7, num_updates=32700, lr=0.000349749, gnorm=0.174, clip=0, loss_scale=2, train_wall=90, gb_free=21.4, wall=30257 epoch 020: 759 / 1689 loss=3.721, nll_loss=2.188, ppl=4.56, wps=549893, ups=1.11, wpb=496803, bsz=17062.5, num_updates=32800, lr=0.000349215, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=30347 epoch 020: 759 / 1689 loss=3.721, nll_loss=2.188, ppl=4.56, wps=549893, ups=1.11, wpb=496803, bsz=17062.5, num_updates=32800, lr=0.000349215, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=30347 epoch 020: 759 / 1689 loss=3.721, nll_loss=2.188, ppl=4.56, wps=549893, ups=1.11, wpb=496803, bsz=17062.5, num_updates=32800, lr=0.000349215, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=30347 epoch 020: 759 / 1689 loss=3.721, nll_loss=2.188, ppl=4.56, wps=549893, ups=1.11, wpb=496803, bsz=17062.5, num_updates=32800, lr=0.000349215, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=30347 epoch 020: 759 / 1689 loss=3.721, nll_loss=2.188, ppl=4.56, wps=549893, ups=1.11, wpb=496803, bsz=17062.5, num_updates=32800, lr=0.000349215, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=30347 epoch 020: 759 / 1689 loss=3.721, nll_loss=2.188, ppl=4.56, wps=549893, ups=1.11, wpb=496803, bsz=17062.5, num_updates=32800, lr=0.000349215, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=30347 epoch 020: 759 / 1689 loss=3.721, nll_loss=2.188, ppl=4.56, wps=549893, ups=1.11, wpb=496803, bsz=17062.5, num_updates=32800, lr=0.000349215, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=30347 epoch 020: 759 / 1689 loss=3.721, nll_loss=2.188, ppl=4.56, wps=549893, ups=1.11, wpb=496803, bsz=17062.5, num_updates=32800, lr=0.000349215, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=30347 epoch 020: 759 / 1689 loss=3.721, nll_loss=2.188, ppl=4.56, wps=549893, ups=1.11, wpb=496803, bsz=17062.5, num_updates=32800, lr=0.000349215, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=30347 epoch 020: 759 / 1689 loss=3.721, nll_loss=2.188, ppl=4.56, wps=549893, ups=1.11, wpb=496803, bsz=17062.5, num_updates=32800, lr=0.000349215, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=30347 epoch 020: 759 / 1689 loss=3.721, nll_loss=2.188, ppl=4.56, wps=549893, ups=1.11, wpb=496803, bsz=17062.5, num_updates=32800, lr=0.000349215, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=30347 epoch 020: 759 / 1689 loss=3.721, nll_loss=2.188, ppl=4.56, wps=549893, ups=1.11, wpb=496803, bsz=17062.5, num_updates=32800, lr=0.000349215, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=30347 epoch 020: 759 / 1689 loss=3.721, nll_loss=2.188, ppl=4.56, wps=549893, ups=1.11, wpb=496803, bsz=17062.5, num_updates=32800, lr=0.000349215, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=30347 epoch 020: 759 / 1689 loss=3.721, nll_loss=2.188, ppl=4.56, wps=549893, ups=1.11, wpb=496803, bsz=17062.5, num_updates=32800, lr=0.000349215, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=30347 epoch 020: 759 / 1689 loss=3.721, nll_loss=2.188, ppl=4.56, wps=549893, ups=1.11, wpb=496803, bsz=17062.5, num_updates=32800, lr=0.000349215, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=30347 epoch 020: 759 / 1689 loss=3.721, nll_loss=2.188, ppl=4.56, wps=549893, ups=1.11, wpb=496803, bsz=17062.5, num_updates=32800, lr=0.000349215, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=30347 epoch 020: 759 / 1689 loss=3.721, nll_loss=2.188, ppl=4.56, wps=549893, ups=1.11, wpb=496803, bsz=17062.5, num_updates=32800, lr=0.000349215, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=30347 epoch 020: 759 / 1689 loss=3.721, nll_loss=2.188, ppl=4.56, wps=549893, ups=1.11, wpb=496803, bsz=17062.5, num_updates=32800, lr=0.000349215, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=30347 epoch 020: 759 / 1689 loss=3.721, nll_loss=2.188, ppl=4.56, wps=549893, ups=1.11, wpb=496803, bsz=17062.5, num_updates=32800, lr=0.000349215, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=30347 epoch 020: 759 / 1689 loss=3.721, nll_loss=2.188, ppl=4.56, wps=549893, ups=1.11, wpb=496803, bsz=17062.5, num_updates=32800, lr=0.000349215, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=30347 epoch 020: 859 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=547684, ups=1.11, wpb=493965, bsz=16355.1, num_updates=32900, lr=0.000348684, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=30437 epoch 020: 859 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=547684, ups=1.11, wpb=493965, bsz=16355.1, num_updates=32900, lr=0.000348684, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=30437 epoch 020: 859 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=547684, ups=1.11, wpb=493965, bsz=16355.1, num_updates=32900, lr=0.000348684, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=30437 epoch 020: 859 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=547684, ups=1.11, wpb=493965, bsz=16355.1, num_updates=32900, lr=0.000348684, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=30437 epoch 020: 859 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=547684, ups=1.11, wpb=493965, bsz=16355.1, num_updates=32900, lr=0.000348684, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=30437 epoch 020: 859 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=547684, ups=1.11, wpb=493965, bsz=16355.1, num_updates=32900, lr=0.000348684, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=30437 epoch 020: 859 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=547684, ups=1.11, wpb=493965, bsz=16355.1, num_updates=32900, lr=0.000348684, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=30437 epoch 020: 859 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=547684, ups=1.11, wpb=493965, bsz=16355.1, num_updates=32900, lr=0.000348684, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=30437 epoch 020: 859 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=547684, ups=1.11, wpb=493965, bsz=16355.1, num_updates=32900, lr=0.000348684, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=30437 epoch 020: 859 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=547684, ups=1.11, wpb=493965, bsz=16355.1, num_updates=32900, lr=0.000348684, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=30437 epoch 020: 859 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=547684, ups=1.11, wpb=493965, bsz=16355.1, num_updates=32900, lr=0.000348684, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=30437 epoch 020: 859 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=547684, ups=1.11, wpb=493965, bsz=16355.1, num_updates=32900, lr=0.000348684, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=30437 epoch 020: 859 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=547684, ups=1.11, wpb=493965, bsz=16355.1, num_updates=32900, lr=0.000348684, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=30437 epoch 020: 859 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=547684, ups=1.11, wpb=493965, bsz=16355.1, num_updates=32900, lr=0.000348684, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=30437 epoch 020: 859 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=547684, ups=1.11, wpb=493965, bsz=16355.1, num_updates=32900, lr=0.000348684, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=30437 epoch 020: 859 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=547684, ups=1.11, wpb=493965, bsz=16355.1, num_updates=32900, lr=0.000348684, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=30437 epoch 020: 859 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=547684, ups=1.11, wpb=493965, bsz=16355.1, num_updates=32900, lr=0.000348684, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=30437 epoch 020: 859 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=547684, ups=1.11, wpb=493965, bsz=16355.1, num_updates=32900, lr=0.000348684, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=30437 epoch 020: 859 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=547684, ups=1.11, wpb=493965, bsz=16355.1, num_updates=32900, lr=0.000348684, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=30437 epoch 020: 859 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=547684, ups=1.11, wpb=493965, bsz=16355.1, num_updates=32900, lr=0.000348684, gnorm=0.171, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=30437 epoch 020: 959 / 1689 loss=3.727, nll_loss=2.196, ppl=4.58, wps=549786, ups=1.11, wpb=494632, bsz=16447.3, num_updates=33000, lr=0.000348155, gnorm=0.18, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=30527 epoch 020: 959 / 1689 loss=3.727, nll_loss=2.196, ppl=4.58, wps=549786, ups=1.11, wpb=494632, bsz=16447.3, num_updates=33000, lr=0.000348155, gnorm=0.18, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=30527 epoch 020: 959 / 1689 loss=3.727, nll_loss=2.196, ppl=4.58, wps=549786, ups=1.11, wpb=494632, bsz=16447.3, num_updates=33000, lr=0.000348155, gnorm=0.18, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=30527 epoch 020: 959 / 1689 loss=3.727, nll_loss=2.196, ppl=4.58, wps=549786, ups=1.11, wpb=494632, bsz=16447.3, num_updates=33000, lr=0.000348155, gnorm=0.18, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=30527 epoch 020: 959 / 1689 loss=3.727, nll_loss=2.196, ppl=4.58, wps=549786, ups=1.11, wpb=494632, bsz=16447.3, num_updates=33000, lr=0.000348155, gnorm=0.18, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=30527 epoch 020: 959 / 1689 loss=3.727, nll_loss=2.196, ppl=4.58, wps=549786, ups=1.11, wpb=494632, bsz=16447.3, num_updates=33000, lr=0.000348155, gnorm=0.18, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=30527 epoch 020: 959 / 1689 loss=3.727, nll_loss=2.196, ppl=4.58, wps=549786, ups=1.11, wpb=494632, bsz=16447.3, num_updates=33000, lr=0.000348155, gnorm=0.18, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=30527 epoch 020: 959 / 1689 loss=3.727, nll_loss=2.196, ppl=4.58, wps=549786, ups=1.11, wpb=494632, bsz=16447.3, num_updates=33000, lr=0.000348155, gnorm=0.18, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=30527 epoch 020: 959 / 1689 loss=3.727, nll_loss=2.196, ppl=4.58, wps=549786, ups=1.11, wpb=494632, bsz=16447.3, num_updates=33000, lr=0.000348155, gnorm=0.18, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=30527 epoch 020: 959 / 1689 loss=3.727, nll_loss=2.196, ppl=4.58, wps=549786, ups=1.11, wpb=494632, bsz=16447.3, num_updates=33000, lr=0.000348155, gnorm=0.18, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=30527 epoch 020: 959 / 1689 loss=3.727, nll_loss=2.196, ppl=4.58, wps=549786, ups=1.11, wpb=494632, bsz=16447.3, num_updates=33000, lr=0.000348155, gnorm=0.18, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=30527 epoch 020: 959 / 1689 loss=3.727, nll_loss=2.196, ppl=4.58, wps=549786, ups=1.11, wpb=494632, bsz=16447.3, num_updates=33000, lr=0.000348155, gnorm=0.18, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=30527 epoch 020: 959 / 1689 loss=3.727, nll_loss=2.196, ppl=4.58, wps=549786, ups=1.11, wpb=494632, bsz=16447.3, num_updates=33000, lr=0.000348155, gnorm=0.18, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=30527 epoch 020: 959 / 1689 loss=3.727, nll_loss=2.196, ppl=4.58, wps=549786, ups=1.11, wpb=494632, bsz=16447.3, num_updates=33000, lr=0.000348155, gnorm=0.18, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=30527 epoch 020: 959 / 1689 loss=3.727, nll_loss=2.196, ppl=4.58, wps=549786, ups=1.11, wpb=494632, bsz=16447.3, num_updates=33000, lr=0.000348155, gnorm=0.18, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=30527 epoch 020: 959 / 1689 loss=3.727, nll_loss=2.196, ppl=4.58, wps=549786, ups=1.11, wpb=494632, bsz=16447.3, num_updates=33000, lr=0.000348155, gnorm=0.18, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=30527 epoch 020: 959 / 1689 loss=3.727, nll_loss=2.196, ppl=4.58, wps=549786, ups=1.11, wpb=494632, bsz=16447.3, num_updates=33000, lr=0.000348155, gnorm=0.18, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=30527 epoch 020: 959 / 1689 loss=3.727, nll_loss=2.196, ppl=4.58, wps=549786, ups=1.11, wpb=494632, bsz=16447.3, num_updates=33000, lr=0.000348155, gnorm=0.18, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=30527 epoch 020: 959 / 1689 loss=3.727, nll_loss=2.196, ppl=4.58, wps=549786, ups=1.11, wpb=494632, bsz=16447.3, num_updates=33000, lr=0.000348155, gnorm=0.18, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=30527 epoch 020: 959 / 1689 loss=3.727, nll_loss=2.196, ppl=4.58, wps=549786, ups=1.11, wpb=494632, bsz=16447.3, num_updates=33000, lr=0.000348155, gnorm=0.18, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=30527 begin validation on "valid" subset epoch 020 | valid on 'valid' subset | loss 3.734 | nll_loss 2.168 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.73 epoch 020 | valid on 'valid' subset | loss 3.734 | nll_loss 2.168 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.73 epoch 020 | valid on 'valid' subset | loss 3.734 | nll_loss 2.168 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.73 epoch 020 | valid on 'valid' subset | loss 3.734 | nll_loss 2.168 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.73 epoch 020 | valid on 'valid' subset | loss 3.734 | nll_loss 2.168 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.73 epoch 020 | valid on 'valid' subset | loss 3.734 | nll_loss 2.168 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.73 epoch 020 | valid on 'valid' subset | loss 3.734 | nll_loss 2.168 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.73 epoch 020 | valid on 'valid' subset | loss 3.734 | nll_loss 2.168 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.73 epoch 020 | valid on 'valid' subset | loss 3.734 | nll_loss 2.168 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.73 epoch 020 | valid on 'valid' subset | loss 3.734 | nll_loss 2.168 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.73 epoch 020 | valid on 'valid' subset | loss 3.734 | nll_loss 2.168 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.73 epoch 020 | valid on 'valid' subset | loss 3.734 | nll_loss 2.168 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.73 epoch 020 | valid on 'valid' subset | loss 3.734 | nll_loss 2.168 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.73 epoch 020 | valid on 'valid' subset | loss 3.734 | nll_loss 2.168 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.73 epoch 020 | valid on 'valid' subset | loss 3.734 | nll_loss 2.168 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.73 epoch 020 | valid on 'valid' subset | loss 3.734 | nll_loss 2.168 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.73 epoch 020 | valid on 'valid' subset | loss 3.734 | nll_loss 2.168 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.73 epoch 020 | valid on 'valid' subset | loss 3.734 | nll_loss 2.168 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.73 epoch 020 | valid on 'valid' subset | loss 3.734 | nll_loss 2.168 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.73 epoch 020 | valid on 'valid' subset | loss 3.734 | nll_loss 2.168 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 33000 | best_loss 3.73 epoch 020: 1059 / 1689 loss=3.719, nll_loss=2.186, ppl=4.55, wps=360137, ups=0.73, wpb=494561, bsz=16271.1, num_updates=33100, lr=0.000347629, gnorm=0.183, clip=0, loss_scale=2, train_wall=119, gb_free=21.1, wall=30665 epoch 020: 1059 / 1689 loss=3.719, nll_loss=2.186, ppl=4.55, wps=360137, ups=0.73, wpb=494561, bsz=16271.1, num_updates=33100, lr=0.000347629, gnorm=0.183, clip=0, loss_scale=2, train_wall=119, gb_free=21.1, wall=30665 epoch 020: 1059 / 1689 loss=3.719, nll_loss=2.186, ppl=4.55, wps=360137, ups=0.73, wpb=494561, bsz=16271.1, num_updates=33100, lr=0.000347629, gnorm=0.183, clip=0, loss_scale=2, train_wall=119, gb_free=21.1, wall=30665 epoch 020: 1059 / 1689 loss=3.719, nll_loss=2.186, ppl=4.55, wps=360137, ups=0.73, wpb=494561, bsz=16271.1, num_updates=33100, lr=0.000347629, gnorm=0.183, clip=0, loss_scale=2, train_wall=119, gb_free=21.1, wall=30665 epoch 020: 1059 / 1689 loss=3.719, nll_loss=2.186, ppl=4.55, wps=360137, ups=0.73, wpb=494561, bsz=16271.1, num_updates=33100, lr=0.000347629, gnorm=0.183, clip=0, loss_scale=2, train_wall=119, gb_free=21.1, wall=30665 epoch 020: 1059 / 1689 loss=3.719, nll_loss=2.186, ppl=4.55, wps=360137, ups=0.73, wpb=494561, bsz=16271.1, num_updates=33100, lr=0.000347629, gnorm=0.183, clip=0, loss_scale=2, train_wall=119, gb_free=21.1, wall=30665 epoch 020: 1059 / 1689 loss=3.719, nll_loss=2.186, ppl=4.55, wps=360137, ups=0.73, wpb=494561, bsz=16271.1, num_updates=33100, lr=0.000347629, gnorm=0.183, clip=0, loss_scale=2, train_wall=119, gb_free=21.1, wall=30665 epoch 020: 1059 / 1689 loss=3.719, nll_loss=2.186, ppl=4.55, wps=360137, ups=0.73, wpb=494561, bsz=16271.1, num_updates=33100, lr=0.000347629, gnorm=0.183, clip=0, loss_scale=2, train_wall=119, gb_free=21.1, wall=30665 epoch 020: 1059 / 1689 loss=3.719, nll_loss=2.186, ppl=4.55, wps=360137, ups=0.73, wpb=494561, bsz=16271.1, num_updates=33100, lr=0.000347629, gnorm=0.183, clip=0, loss_scale=2, train_wall=119, gb_free=21.1, wall=30665 epoch 020: 1059 / 1689 loss=3.719, nll_loss=2.186, ppl=4.55, wps=360137, ups=0.73, wpb=494561, bsz=16271.1, num_updates=33100, lr=0.000347629, gnorm=0.183, clip=0, loss_scale=2, train_wall=119, gb_free=21.1, wall=30665 epoch 020: 1059 / 1689 loss=3.719, nll_loss=2.186, ppl=4.55, wps=360137, ups=0.73, wpb=494561, bsz=16271.1, num_updates=33100, lr=0.000347629, gnorm=0.183, clip=0, loss_scale=2, train_wall=119, gb_free=21.1, wall=30665 epoch 020: 1059 / 1689 loss=3.719, nll_loss=2.186, ppl=4.55, wps=360137, ups=0.73, wpb=494561, bsz=16271.1, num_updates=33100, lr=0.000347629, gnorm=0.183, clip=0, loss_scale=2, train_wall=119, gb_free=21.1, wall=30665 epoch 020: 1059 / 1689 loss=3.719, nll_loss=2.186, ppl=4.55, wps=360137, ups=0.73, wpb=494561, bsz=16271.1, num_updates=33100, lr=0.000347629, gnorm=0.183, clip=0, loss_scale=2, train_wall=119, gb_free=21.1, wall=30665 epoch 020: 1059 / 1689 loss=3.719, nll_loss=2.186, ppl=4.55, wps=360137, ups=0.73, wpb=494561, bsz=16271.1, num_updates=33100, lr=0.000347629, gnorm=0.183, clip=0, loss_scale=2, train_wall=119, gb_free=21.1, wall=30665 epoch 020: 1059 / 1689 loss=3.719, nll_loss=2.186, ppl=4.55, wps=360137, ups=0.73, wpb=494561, bsz=16271.1, num_updates=33100, lr=0.000347629, gnorm=0.183, clip=0, loss_scale=2, train_wall=119, gb_free=21.1, wall=30665 epoch 020: 1059 / 1689 loss=3.719, nll_loss=2.186, ppl=4.55, wps=360137, ups=0.73, wpb=494561, bsz=16271.1, num_updates=33100, lr=0.000347629, gnorm=0.183, clip=0, loss_scale=2, train_wall=119, gb_free=21.1, wall=30665 epoch 020: 1059 / 1689 loss=3.719, nll_loss=2.186, ppl=4.55, wps=360137, ups=0.73, wpb=494561, bsz=16271.1, num_updates=33100, lr=0.000347629, gnorm=0.183, clip=0, loss_scale=2, train_wall=119, gb_free=21.1, wall=30665 epoch 020: 1059 / 1689 loss=3.719, nll_loss=2.186, ppl=4.55, wps=360137, ups=0.73, wpb=494561, bsz=16271.1, num_updates=33100, lr=0.000347629, gnorm=0.183, clip=0, loss_scale=2, train_wall=119, gb_free=21.1, wall=30665 epoch 020: 1059 / 1689 loss=3.719, nll_loss=2.186, ppl=4.55, wps=360137, ups=0.73, wpb=494561, bsz=16271.1, num_updates=33100, lr=0.000347629, gnorm=0.183, clip=0, loss_scale=2, train_wall=119, gb_free=21.1, wall=30665 epoch 020: 1059 / 1689 loss=3.719, nll_loss=2.186, ppl=4.55, wps=360137, ups=0.73, wpb=494561, bsz=16271.1, num_updates=33100, lr=0.000347629, gnorm=0.183, clip=0, loss_scale=2, train_wall=119, gb_free=21.1, wall=30665 epoch 020: 1159 / 1689 loss=3.716, nll_loss=2.184, ppl=4.54, wps=546619, ups=1.1, wpb=495270, bsz=16625.7, num_updates=33200, lr=0.000347105, gnorm=0.175, clip=0, loss_scale=4, train_wall=89, gb_free=20.2, wall=30755 epoch 020: 1159 / 1689 loss=3.716, nll_loss=2.184, ppl=4.54, wps=546619, ups=1.1, wpb=495270, bsz=16625.7, num_updates=33200, lr=0.000347105, gnorm=0.175, clip=0, loss_scale=4, train_wall=89, gb_free=20.2, wall=30755 epoch 020: 1159 / 1689 loss=3.716, nll_loss=2.184, ppl=4.54, wps=546619, ups=1.1, wpb=495270, bsz=16625.7, num_updates=33200, lr=0.000347105, gnorm=0.175, clip=0, loss_scale=4, train_wall=89, gb_free=20.2, wall=30755 epoch 020: 1159 / 1689 loss=3.716, nll_loss=2.184, ppl=4.54, wps=546619, ups=1.1, wpb=495270, bsz=16625.7, num_updates=33200, lr=0.000347105, gnorm=0.175, clip=0, loss_scale=4, train_wall=89, gb_free=20.2, wall=30755 epoch 020: 1159 / 1689 loss=3.716, nll_loss=2.184, ppl=4.54, wps=546619, ups=1.1, wpb=495270, bsz=16625.7, num_updates=33200, lr=0.000347105, gnorm=0.175, clip=0, loss_scale=4, train_wall=89, gb_free=20.2, wall=30755 epoch 020: 1159 / 1689 loss=3.716, nll_loss=2.184, ppl=4.54, wps=546619, ups=1.1, wpb=495270, bsz=16625.7, num_updates=33200, lr=0.000347105, gnorm=0.175, clip=0, loss_scale=4, train_wall=89, gb_free=20.2, wall=30755 epoch 020: 1159 / 1689 loss=3.716, nll_loss=2.184, ppl=4.54, wps=546619, ups=1.1, wpb=495270, bsz=16625.7, num_updates=33200, lr=0.000347105, gnorm=0.175, clip=0, loss_scale=4, train_wall=89, gb_free=20.2, wall=30755 epoch 020: 1159 / 1689 loss=3.716, nll_loss=2.184, ppl=4.54, wps=546619, ups=1.1, wpb=495270, bsz=16625.7, num_updates=33200, lr=0.000347105, gnorm=0.175, clip=0, loss_scale=4, train_wall=89, gb_free=20.2, wall=30755 epoch 020: 1159 / 1689 loss=3.716, nll_loss=2.184, ppl=4.54, wps=546619, ups=1.1, wpb=495270, bsz=16625.7, num_updates=33200, lr=0.000347105, gnorm=0.175, clip=0, loss_scale=4, train_wall=89, gb_free=20.2, wall=30755 epoch 020: 1159 / 1689 loss=3.716, nll_loss=2.184, ppl=4.54, wps=546619, ups=1.1, wpb=495270, bsz=16625.7, num_updates=33200, lr=0.000347105, gnorm=0.175, clip=0, loss_scale=4, train_wall=89, gb_free=20.2, wall=30755 epoch 020: 1159 / 1689 loss=3.716, nll_loss=2.184, ppl=4.54, wps=546619, ups=1.1, wpb=495270, bsz=16625.7, num_updates=33200, lr=0.000347105, gnorm=0.175, clip=0, loss_scale=4, train_wall=89, gb_free=20.2, wall=30755 epoch 020: 1159 / 1689 loss=3.716, nll_loss=2.184, ppl=4.54, wps=546619, ups=1.1, wpb=495270, bsz=16625.7, num_updates=33200, lr=0.000347105, gnorm=0.175, clip=0, loss_scale=4, train_wall=89, gb_free=20.2, wall=30755 epoch 020: 1159 / 1689 loss=3.716, nll_loss=2.184, ppl=4.54, wps=546619, ups=1.1, wpb=495270, bsz=16625.7, num_updates=33200, lr=0.000347105, gnorm=0.175, clip=0, loss_scale=4, train_wall=89, gb_free=20.2, wall=30755 epoch 020: 1159 / 1689 loss=3.716, nll_loss=2.184, ppl=4.54, wps=546619, ups=1.1, wpb=495270, bsz=16625.7, num_updates=33200, lr=0.000347105, gnorm=0.175, clip=0, loss_scale=4, train_wall=89, gb_free=20.2, wall=30755 epoch 020: 1159 / 1689 loss=3.716, nll_loss=2.184, ppl=4.54, wps=546619, ups=1.1, wpb=495270, bsz=16625.7, num_updates=33200, lr=0.000347105, gnorm=0.175, clip=0, loss_scale=4, train_wall=89, gb_free=20.2, wall=30755 epoch 020: 1159 / 1689 loss=3.716, nll_loss=2.184, ppl=4.54, wps=546619, ups=1.1, wpb=495270, bsz=16625.7, num_updates=33200, lr=0.000347105, gnorm=0.175, clip=0, loss_scale=4, train_wall=89, gb_free=20.2, wall=30755 epoch 020: 1159 / 1689 loss=3.716, nll_loss=2.184, ppl=4.54, wps=546619, ups=1.1, wpb=495270, bsz=16625.7, num_updates=33200, lr=0.000347105, gnorm=0.175, clip=0, loss_scale=4, train_wall=89, gb_free=20.2, wall=30755 epoch 020: 1159 / 1689 loss=3.716, nll_loss=2.184, ppl=4.54, wps=546619, ups=1.1, wpb=495270, bsz=16625.7, num_updates=33200, lr=0.000347105, gnorm=0.175, clip=0, loss_scale=4, train_wall=89, gb_free=20.2, wall=30755 epoch 020: 1159 / 1689 loss=3.716, nll_loss=2.184, ppl=4.54, wps=546619, ups=1.1, wpb=495270, bsz=16625.7, num_updates=33200, lr=0.000347105, gnorm=0.175, clip=0, loss_scale=4, train_wall=89, gb_free=20.2, wall=30755 epoch 020: 1159 / 1689 loss=3.716, nll_loss=2.184, ppl=4.54, wps=546619, ups=1.1, wpb=495270, bsz=16625.7, num_updates=33200, lr=0.000347105, gnorm=0.175, clip=0, loss_scale=4, train_wall=89, gb_free=20.2, wall=30755 epoch 020: 1259 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=550868, ups=1.11, wpb=495092, bsz=16664.6, num_updates=33300, lr=0.000346583, gnorm=0.177, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=30845 epoch 020: 1259 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=550868, ups=1.11, wpb=495092, bsz=16664.6, num_updates=33300, lr=0.000346583, gnorm=0.177, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=30845 epoch 020: 1259 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=550868, ups=1.11, wpb=495092, bsz=16664.6, num_updates=33300, lr=0.000346583, gnorm=0.177, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=30845 epoch 020: 1259 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=550868, ups=1.11, wpb=495092, bsz=16664.6, num_updates=33300, lr=0.000346583, gnorm=0.177, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=30845 epoch 020: 1259 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=550868, ups=1.11, wpb=495092, bsz=16664.6, num_updates=33300, lr=0.000346583, gnorm=0.177, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=30845 epoch 020: 1259 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=550868, ups=1.11, wpb=495092, bsz=16664.6, num_updates=33300, lr=0.000346583, gnorm=0.177, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=30845 epoch 020: 1259 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=550868, ups=1.11, wpb=495092, bsz=16664.6, num_updates=33300, lr=0.000346583, gnorm=0.177, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=30845 epoch 020: 1259 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=550868, ups=1.11, wpb=495092, bsz=16664.6, num_updates=33300, lr=0.000346583, gnorm=0.177, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=30845 epoch 020: 1259 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=550868, ups=1.11, wpb=495092, bsz=16664.6, num_updates=33300, lr=0.000346583, gnorm=0.177, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=30845 epoch 020: 1259 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=550868, ups=1.11, wpb=495092, bsz=16664.6, num_updates=33300, lr=0.000346583, gnorm=0.177, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=30845 epoch 020: 1259 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=550868, ups=1.11, wpb=495092, bsz=16664.6, num_updates=33300, lr=0.000346583, gnorm=0.177, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=30845 epoch 020: 1259 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=550868, ups=1.11, wpb=495092, bsz=16664.6, num_updates=33300, lr=0.000346583, gnorm=0.177, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=30845 epoch 020: 1259 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=550868, ups=1.11, wpb=495092, bsz=16664.6, num_updates=33300, lr=0.000346583, gnorm=0.177, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=30845 epoch 020: 1259 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=550868, ups=1.11, wpb=495092, bsz=16664.6, num_updates=33300, lr=0.000346583, gnorm=0.177, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=30845 epoch 020: 1259 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=550868, ups=1.11, wpb=495092, bsz=16664.6, num_updates=33300, lr=0.000346583, gnorm=0.177, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=30845 epoch 020: 1259 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=550868, ups=1.11, wpb=495092, bsz=16664.6, num_updates=33300, lr=0.000346583, gnorm=0.177, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=30845 epoch 020: 1259 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=550868, ups=1.11, wpb=495092, bsz=16664.6, num_updates=33300, lr=0.000346583, gnorm=0.177, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=30845 epoch 020: 1259 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=550868, ups=1.11, wpb=495092, bsz=16664.6, num_updates=33300, lr=0.000346583, gnorm=0.177, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=30845 epoch 020: 1259 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=550868, ups=1.11, wpb=495092, bsz=16664.6, num_updates=33300, lr=0.000346583, gnorm=0.177, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=30845 epoch 020: 1259 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=550868, ups=1.11, wpb=495092, bsz=16664.6, num_updates=33300, lr=0.000346583, gnorm=0.177, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=30845 epoch 020: 1359 / 1689 loss=3.728, nll_loss=2.197, ppl=4.58, wps=550237, ups=1.11, wpb=494922, bsz=16408.9, num_updates=33400, lr=0.000346064, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=30935 epoch 020: 1359 / 1689 loss=3.728, nll_loss=2.197, ppl=4.58, wps=550237, ups=1.11, wpb=494922, bsz=16408.9, num_updates=33400, lr=0.000346064, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=30935 epoch 020: 1359 / 1689 loss=3.728, nll_loss=2.197, ppl=4.58, wps=550237, ups=1.11, wpb=494922, bsz=16408.9, num_updates=33400, lr=0.000346064, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=30935 epoch 020: 1359 / 1689 loss=3.728, nll_loss=2.197, ppl=4.58, wps=550237, ups=1.11, wpb=494922, bsz=16408.9, num_updates=33400, lr=0.000346064, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=30935 epoch 020: 1359 / 1689 loss=3.728, nll_loss=2.197, ppl=4.58, wps=550237, ups=1.11, wpb=494922, bsz=16408.9, num_updates=33400, lr=0.000346064, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=30935 epoch 020: 1359 / 1689 loss=3.728, nll_loss=2.197, ppl=4.58, wps=550237, ups=1.11, wpb=494922, bsz=16408.9, num_updates=33400, lr=0.000346064, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=30935 epoch 020: 1359 / 1689 loss=3.728, nll_loss=2.197, ppl=4.58, wps=550237, ups=1.11, wpb=494922, bsz=16408.9, num_updates=33400, lr=0.000346064, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=30935 epoch 020: 1359 / 1689 loss=3.728, nll_loss=2.197, ppl=4.58, wps=550237, ups=1.11, wpb=494922, bsz=16408.9, num_updates=33400, lr=0.000346064, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=30935 epoch 020: 1359 / 1689 loss=3.728, nll_loss=2.197, ppl=4.58, wps=550237, ups=1.11, wpb=494922, bsz=16408.9, num_updates=33400, lr=0.000346064, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=30935 epoch 020: 1359 / 1689 loss=3.728, nll_loss=2.197, ppl=4.58, wps=550237, ups=1.11, wpb=494922, bsz=16408.9, num_updates=33400, lr=0.000346064, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=30935 epoch 020: 1359 / 1689 loss=3.728, nll_loss=2.197, ppl=4.58, wps=550237, ups=1.11, wpb=494922, bsz=16408.9, num_updates=33400, lr=0.000346064, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=30935 epoch 020: 1359 / 1689 loss=3.728, nll_loss=2.197, ppl=4.58, wps=550237, ups=1.11, wpb=494922, bsz=16408.9, num_updates=33400, lr=0.000346064, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=30935 epoch 020: 1359 / 1689 loss=3.728, nll_loss=2.197, ppl=4.58, wps=550237, ups=1.11, wpb=494922, bsz=16408.9, num_updates=33400, lr=0.000346064, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=30935 epoch 020: 1359 / 1689 loss=3.728, nll_loss=2.197, ppl=4.58, wps=550237, ups=1.11, wpb=494922, bsz=16408.9, num_updates=33400, lr=0.000346064, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=30935 epoch 020: 1359 / 1689 loss=3.728, nll_loss=2.197, ppl=4.58, wps=550237, ups=1.11, wpb=494922, bsz=16408.9, num_updates=33400, lr=0.000346064, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=30935 epoch 020: 1359 / 1689 loss=3.728, nll_loss=2.197, ppl=4.58, wps=550237, ups=1.11, wpb=494922, bsz=16408.9, num_updates=33400, lr=0.000346064, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=30935 epoch 020: 1359 / 1689 loss=3.728, nll_loss=2.197, ppl=4.58, wps=550237, ups=1.11, wpb=494922, bsz=16408.9, num_updates=33400, lr=0.000346064, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=30935 epoch 020: 1359 / 1689 loss=3.728, nll_loss=2.197, ppl=4.58, wps=550237, ups=1.11, wpb=494922, bsz=16408.9, num_updates=33400, lr=0.000346064, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=30935 epoch 020: 1359 / 1689 loss=3.728, nll_loss=2.197, ppl=4.58, wps=550237, ups=1.11, wpb=494922, bsz=16408.9, num_updates=33400, lr=0.000346064, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=30935 epoch 020: 1359 / 1689 loss=3.728, nll_loss=2.197, ppl=4.58, wps=550237, ups=1.11, wpb=494922, bsz=16408.9, num_updates=33400, lr=0.000346064, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=30935 epoch 020: 1459 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=548237, ups=1.11, wpb=494988, bsz=16524.2, num_updates=33500, lr=0.000345547, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=31025 epoch 020: 1459 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=548237, ups=1.11, wpb=494988, bsz=16524.2, num_updates=33500, lr=0.000345547, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=31025 epoch 020: 1459 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=548237, ups=1.11, wpb=494988, bsz=16524.2, num_updates=33500, lr=0.000345547, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=31025 epoch 020: 1459 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=548237, ups=1.11, wpb=494988, bsz=16524.2, num_updates=33500, lr=0.000345547, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=31025 epoch 020: 1459 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=548237, ups=1.11, wpb=494988, bsz=16524.2, num_updates=33500, lr=0.000345547, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=31025 epoch 020: 1459 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=548237, ups=1.11, wpb=494988, bsz=16524.2, num_updates=33500, lr=0.000345547, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=31025 epoch 020: 1459 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=548237, ups=1.11, wpb=494988, bsz=16524.2, num_updates=33500, lr=0.000345547, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=31025 epoch 020: 1459 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=548237, ups=1.11, wpb=494988, bsz=16524.2, num_updates=33500, lr=0.000345547, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=31025 epoch 020: 1459 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=548237, ups=1.11, wpb=494988, bsz=16524.2, num_updates=33500, lr=0.000345547, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=31025 epoch 020: 1459 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=548237, ups=1.11, wpb=494988, bsz=16524.2, num_updates=33500, lr=0.000345547, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=31025 epoch 020: 1459 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=548237, ups=1.11, wpb=494988, bsz=16524.2, num_updates=33500, lr=0.000345547, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=31025 epoch 020: 1459 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=548237, ups=1.11, wpb=494988, bsz=16524.2, num_updates=33500, lr=0.000345547, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=31025 epoch 020: 1459 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=548237, ups=1.11, wpb=494988, bsz=16524.2, num_updates=33500, lr=0.000345547, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=31025 epoch 020: 1459 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=548237, ups=1.11, wpb=494988, bsz=16524.2, num_updates=33500, lr=0.000345547, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=31025 epoch 020: 1459 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=548237, ups=1.11, wpb=494988, bsz=16524.2, num_updates=33500, lr=0.000345547, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=31025 epoch 020: 1459 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=548237, ups=1.11, wpb=494988, bsz=16524.2, num_updates=33500, lr=0.000345547, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=31025 epoch 020: 1459 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=548237, ups=1.11, wpb=494988, bsz=16524.2, num_updates=33500, lr=0.000345547, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=31025 epoch 020: 1459 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=548237, ups=1.11, wpb=494988, bsz=16524.2, num_updates=33500, lr=0.000345547, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=31025 epoch 020: 1459 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=548237, ups=1.11, wpb=494988, bsz=16524.2, num_updates=33500, lr=0.000345547, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=31025 epoch 020: 1459 / 1689 loss=3.725, nll_loss=2.193, ppl=4.57, wps=548237, ups=1.11, wpb=494988, bsz=16524.2, num_updates=33500, lr=0.000345547, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=31025 epoch 020: 1559 / 1689 loss=3.724, nll_loss=2.192, ppl=4.57, wps=548906, ups=1.11, wpb=494677, bsz=16243, num_updates=33600, lr=0.000345033, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=31116 epoch 020: 1559 / 1689 loss=3.724, nll_loss=2.192, ppl=4.57, wps=548906, ups=1.11, wpb=494677, bsz=16243, num_updates=33600, lr=0.000345033, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=31116 epoch 020: 1559 / 1689 loss=3.724, nll_loss=2.192, ppl=4.57, wps=548906, ups=1.11, wpb=494677, bsz=16243, num_updates=33600, lr=0.000345033, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=31116 epoch 020: 1559 / 1689 loss=3.724, nll_loss=2.192, ppl=4.57, wps=548906, ups=1.11, wpb=494677, bsz=16243, num_updates=33600, lr=0.000345033, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=31116 epoch 020: 1559 / 1689 loss=3.724, nll_loss=2.192, ppl=4.57, wps=548906, ups=1.11, wpb=494677, bsz=16243, num_updates=33600, lr=0.000345033, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=31116 epoch 020: 1559 / 1689 loss=3.724, nll_loss=2.192, ppl=4.57, wps=548906, ups=1.11, wpb=494677, bsz=16243, num_updates=33600, lr=0.000345033, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=31116 epoch 020: 1559 / 1689 loss=3.724, nll_loss=2.192, ppl=4.57, wps=548906, ups=1.11, wpb=494677, bsz=16243, num_updates=33600, lr=0.000345033, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=31116 epoch 020: 1559 / 1689 loss=3.724, nll_loss=2.192, ppl=4.57, wps=548906, ups=1.11, wpb=494677, bsz=16243, num_updates=33600, lr=0.000345033, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=31116 epoch 020: 1559 / 1689 loss=3.724, nll_loss=2.192, ppl=4.57, wps=548906, ups=1.11, wpb=494677, bsz=16243, num_updates=33600, lr=0.000345033, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=31116 epoch 020: 1559 / 1689 loss=3.724, nll_loss=2.192, ppl=4.57, wps=548906, ups=1.11, wpb=494677, bsz=16243, num_updates=33600, lr=0.000345033, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=31116 epoch 020: 1559 / 1689 loss=3.724, nll_loss=2.192, ppl=4.57, wps=548906, ups=1.11, wpb=494677, bsz=16243, num_updates=33600, lr=0.000345033, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=31116 epoch 020: 1559 / 1689 loss=3.724, nll_loss=2.192, ppl=4.57, wps=548906, ups=1.11, wpb=494677, bsz=16243, num_updates=33600, lr=0.000345033, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=31116 epoch 020: 1559 / 1689 loss=3.724, nll_loss=2.192, ppl=4.57, wps=548906, ups=1.11, wpb=494677, bsz=16243, num_updates=33600, lr=0.000345033, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=31116 epoch 020: 1559 / 1689 loss=3.724, nll_loss=2.192, ppl=4.57, wps=548906, ups=1.11, wpb=494677, bsz=16243, num_updates=33600, lr=0.000345033, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=31116 epoch 020: 1559 / 1689 loss=3.724, nll_loss=2.192, ppl=4.57, wps=548906, ups=1.11, wpb=494677, bsz=16243, num_updates=33600, lr=0.000345033, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=31116 epoch 020: 1559 / 1689 loss=3.724, nll_loss=2.192, ppl=4.57, wps=548906, ups=1.11, wpb=494677, bsz=16243, num_updates=33600, lr=0.000345033, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=31116 epoch 020: 1559 / 1689 loss=3.724, nll_loss=2.192, ppl=4.57, wps=548906, ups=1.11, wpb=494677, bsz=16243, num_updates=33600, lr=0.000345033, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=31116 epoch 020: 1559 / 1689 loss=3.724, nll_loss=2.192, ppl=4.57, wps=548906, ups=1.11, wpb=494677, bsz=16243, num_updates=33600, lr=0.000345033, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=31116 epoch 020: 1559 / 1689 loss=3.724, nll_loss=2.192, ppl=4.57, wps=548906, ups=1.11, wpb=494677, bsz=16243, num_updates=33600, lr=0.000345033, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=31116 epoch 020: 1559 / 1689 loss=3.724, nll_loss=2.192, ppl=4.57, wps=548906, ups=1.11, wpb=494677, bsz=16243, num_updates=33600, lr=0.000345033, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=31116 epoch 020: 1659 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=548857, ups=1.11, wpb=496692, bsz=16565.1, num_updates=33700, lr=0.00034452, gnorm=0.175, clip=0, loss_scale=8, train_wall=88, gb_free=22, wall=31206 epoch 020: 1659 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=548857, ups=1.11, wpb=496692, bsz=16565.1, num_updates=33700, lr=0.00034452, gnorm=0.175, clip=0, loss_scale=8, train_wall=88, gb_free=22, wall=31206 epoch 020: 1659 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=548857, ups=1.11, wpb=496692, bsz=16565.1, num_updates=33700, lr=0.00034452, gnorm=0.175, clip=0, loss_scale=8, train_wall=88, gb_free=22, wall=31206 epoch 020: 1659 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=548857, ups=1.11, wpb=496692, bsz=16565.1, num_updates=33700, lr=0.00034452, gnorm=0.175, clip=0, loss_scale=8, train_wall=88, gb_free=22, wall=31206 epoch 020: 1659 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=548857, ups=1.11, wpb=496692, bsz=16565.1, num_updates=33700, lr=0.00034452, gnorm=0.175, clip=0, loss_scale=8, train_wall=88, gb_free=22, wall=31206 epoch 020: 1659 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=548857, ups=1.11, wpb=496692, bsz=16565.1, num_updates=33700, lr=0.00034452, gnorm=0.175, clip=0, loss_scale=8, train_wall=88, gb_free=22, wall=31206 epoch 020: 1659 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=548857, ups=1.11, wpb=496692, bsz=16565.1, num_updates=33700, lr=0.00034452, gnorm=0.175, clip=0, loss_scale=8, train_wall=88, gb_free=22, wall=31206 epoch 020: 1659 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=548857, ups=1.11, wpb=496692, bsz=16565.1, num_updates=33700, lr=0.00034452, gnorm=0.175, clip=0, loss_scale=8, train_wall=88, gb_free=22, wall=31206 epoch 020: 1659 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=548857, ups=1.11, wpb=496692, bsz=16565.1, num_updates=33700, lr=0.00034452, gnorm=0.175, clip=0, loss_scale=8, train_wall=88, gb_free=22, wall=31206 epoch 020: 1659 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=548857, ups=1.11, wpb=496692, bsz=16565.1, num_updates=33700, lr=0.00034452, gnorm=0.175, clip=0, loss_scale=8, train_wall=88, gb_free=22, wall=31206 epoch 020: 1659 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=548857, ups=1.11, wpb=496692, bsz=16565.1, num_updates=33700, lr=0.00034452, gnorm=0.175, clip=0, loss_scale=8, train_wall=88, gb_free=22, wall=31206 epoch 020: 1659 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=548857, ups=1.11, wpb=496692, bsz=16565.1, num_updates=33700, lr=0.00034452, gnorm=0.175, clip=0, loss_scale=8, train_wall=88, gb_free=22, wall=31206 epoch 020: 1659 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=548857, ups=1.11, wpb=496692, bsz=16565.1, num_updates=33700, lr=0.00034452, gnorm=0.175, clip=0, loss_scale=8, train_wall=88, gb_free=22, wall=31206 epoch 020: 1659 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=548857, ups=1.11, wpb=496692, bsz=16565.1, num_updates=33700, lr=0.00034452, gnorm=0.175, clip=0, loss_scale=8, train_wall=88, gb_free=22, wall=31206 epoch 020: 1659 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=548857, ups=1.11, wpb=496692, bsz=16565.1, num_updates=33700, lr=0.00034452, gnorm=0.175, clip=0, loss_scale=8, train_wall=88, gb_free=22, wall=31206 epoch 020: 1659 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=548857, ups=1.11, wpb=496692, bsz=16565.1, num_updates=33700, lr=0.00034452, gnorm=0.175, clip=0, loss_scale=8, train_wall=88, gb_free=22, wall=31206 epoch 020: 1659 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=548857, ups=1.11, wpb=496692, bsz=16565.1, num_updates=33700, lr=0.00034452, gnorm=0.175, clip=0, loss_scale=8, train_wall=88, gb_free=22, wall=31206 epoch 020: 1659 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=548857, ups=1.11, wpb=496692, bsz=16565.1, num_updates=33700, lr=0.00034452, gnorm=0.175, clip=0, loss_scale=8, train_wall=88, gb_free=22, wall=31206 epoch 020: 1659 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=548857, ups=1.11, wpb=496692, bsz=16565.1, num_updates=33700, lr=0.00034452, gnorm=0.175, clip=0, loss_scale=8, train_wall=88, gb_free=22, wall=31206 epoch 020: 1659 / 1689 loss=3.721, nll_loss=2.189, ppl=4.56, wps=548857, ups=1.11, wpb=496692, bsz=16565.1, num_updates=33700, lr=0.00034452, gnorm=0.175, clip=0, loss_scale=8, train_wall=88, gb_free=22, wall=31206 end of epoch 20 (average epoch stats below) epoch 020 | loss 3.719 | nll_loss 2.187 | ppl 4.55 | wps 533065 | ups 1.08 | wpb 495115 | bsz 16504.8 | num_updates 33729 | lr 0.000344372 | gnorm 0.176 | clip 0 | loss_scale 4 | train_wall 1523 | gb_free 23.5 | wall 31232 epoch 020 | loss 3.719 | nll_loss 2.187 | ppl 4.55 | wps 533065 | ups 1.08 | wpb 495115 | bsz 16504.8 | num_updates 33729 | lr 0.000344372 | gnorm 0.176 | clip 0 | loss_scale 4 | train_wall 1523 | gb_free 23.5 | wall 31232 epoch 020 | loss 3.719 | nll_loss 2.187 | ppl 4.55 | wps 533065 | ups 1.08 | wpb 495115 | bsz 16504.8 | num_updates 33729 | lr 0.000344372 | gnorm 0.176 | clip 0 | loss_scale 4 | train_wall 1523 | gb_free 23.5 | wall 31232 epoch 020 | loss 3.719 | nll_loss 2.187 | ppl 4.55 | wps 533065 | ups 1.08 | wpb 495115 | bsz 16504.8 | num_updates 33729 | lr 0.000344372 | gnorm 0.176 | clip 0 | loss_scale 4 | train_wall 1523 | gb_free 23.5 | wall 31232 epoch 020 | loss 3.719 | nll_loss 2.187 | ppl 4.55 | wps 533065 | ups 1.08 | wpb 495115 | bsz 16504.8 | num_updates 33729 | lr 0.000344372 | gnorm 0.176 | clip 0 | loss_scale 4 | train_wall 1523 | gb_free 23.5 | wall 31232 epoch 020 | loss 3.719 | nll_loss 2.187 | ppl 4.55 | wps 533065 | ups 1.08 | wpb 495115 | bsz 16504.8 | num_updates 33729 | lr 0.000344372 | gnorm 0.176 | clip 0 | loss_scale 4 | train_wall 1523 | gb_free 23.5 | wall 31232 epoch 020 | loss 3.719 | nll_loss 2.187 | ppl 4.55 | wps 533065 | ups 1.08 | wpb 495115 | bsz 16504.8 | num_updates 33729 | lr 0.000344372 | gnorm 0.176 | clip 0 | loss_scale 4 | train_wall 1523 | gb_free 23.5 | wall 31232 epoch 020 | loss 3.719 | nll_loss 2.187 | ppl 4.55 | wps 533065 | ups 1.08 | wpb 495115 | bsz 16504.8 | num_updates 33729 | lr 0.000344372 | gnorm 0.176 | clip 0 | loss_scale 4 | train_wall 1523 | gb_free 23.5 | wall 31232 epoch 020 | loss 3.719 | nll_loss 2.187 | ppl 4.55 | wps 533065 | ups 1.08 | wpb 495115 | bsz 16504.8 | num_updates 33729 | lr 0.000344372 | gnorm 0.176 | clip 0 | loss_scale 4 | train_wall 1523 | gb_free 23.5 | wall 31232 epoch 020 | loss 3.719 | nll_loss 2.187 | ppl 4.55 | wps 533065 | ups 1.08 | wpb 495115 | bsz 16504.8 | num_updates 33729 | lr 0.000344372 | gnorm 0.176 | clip 0 | loss_scale 4 | train_wall 1523 | gb_free 23.5 | wall 31232 epoch 020 | loss 3.719 | nll_loss 2.187 | ppl 4.55 | wps 533065 | ups 1.08 | wpb 495115 | bsz 16504.8 | num_updates 33729 | lr 0.000344372 | gnorm 0.176 | clip 0 | loss_scale 4 | train_wall 1523 | gb_free 23.5 | wall 31232 epoch 020 | loss 3.719 | nll_loss 2.187 | ppl 4.55 | wps 533065 | ups 1.08 | wpb 495115 | bsz 16504.8 | num_updates 33729 | lr 0.000344372 | gnorm 0.176 | clip 0 | loss_scale 4 | train_wall 1523 | gb_free 23.5 | wall 31232 epoch 020 | loss 3.719 | nll_loss 2.187 | ppl 4.55 | wps 533065 | ups 1.08 | wpb 495115 | bsz 16504.8 | num_updates 33729 | lr 0.000344372 | gnorm 0.176 | clip 0 | loss_scale 4 | train_wall 1523 | gb_free 23.5 | wall 31232 epoch 020 | loss 3.719 | nll_loss 2.187 | ppl 4.55 | wps 533065 | ups 1.08 | wpb 495115 | bsz 16504.8 | num_updates 33729 | lr 0.000344372 | gnorm 0.176 | clip 0 | loss_scale 4 | train_wall 1523 | gb_free 23.5 | wall 31232 epoch 020 | loss 3.719 | nll_loss 2.187 | ppl 4.55 | wps 533065 | ups 1.08 | wpb 495115 | bsz 16504.8 | num_updates 33729 | lr 0.000344372 | gnorm 0.176 | clip 0 | loss_scale 4 | train_wall 1523 | gb_free 23.5 | wall 31232 epoch 020 | loss 3.719 | nll_loss 2.187 | ppl 4.55 | wps 533065 | ups 1.08 | wpb 495115 | bsz 16504.8 | num_updates 33729 | lr 0.000344372 | gnorm 0.176 | clip 0 | loss_scale 4 | train_wall 1523 | gb_free 23.5 | wall 31232 epoch 020 | loss 3.719 | nll_loss 2.187 | ppl 4.55 | wps 533065 | ups 1.08 | wpb 495115 | bsz 16504.8 | num_updates 33729 | lr 0.000344372 | gnorm 0.176 | clip 0 | loss_scale 4 | train_wall 1523 | gb_free 23.5 | wall 31232 epoch 020 | loss 3.719 | nll_loss 2.187 | ppl 4.55 | wps 533065 | ups 1.08 | wpb 495115 | bsz 16504.8 | num_updates 33729 | lr 0.000344372 | gnorm 0.176 | clip 0 | loss_scale 4 | train_wall 1523 | gb_free 23.5 | wall 31232 epoch 020 | loss 3.719 | nll_loss 2.187 | ppl 4.55 | wps 533065 | ups 1.08 | wpb 495115 | bsz 16504.8 | num_updates 33729 | lr 0.000344372 | gnorm 0.176 | clip 0 | loss_scale 4 | train_wall 1523 | gb_free 23.5 | wall 31232 epoch 020 | loss 3.719 | nll_loss 2.187 | ppl 4.55 | wps 533065 | ups 1.08 | wpb 495115 | bsz 16504.8 | num_updates 33729 | lr 0.000344372 | gnorm 0.176 | clip 0 | loss_scale 4 | train_wall 1523 | gb_free 23.5 | wall 31232 Start iterating over samples epoch 021: 71 / 1689 loss=3.708, nll_loss=2.173, ppl=4.51, wps=533074, ups=1.08, wpb=491987, bsz=16267.3, num_updates=33800, lr=0.00034401, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=22.4, wall=31298 epoch 021: 71 / 1689 loss=3.708, nll_loss=2.173, ppl=4.51, wps=533074, ups=1.08, wpb=491987, bsz=16267.3, num_updates=33800, lr=0.00034401, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=22.4, wall=31298 epoch 021: 71 / 1689 loss=3.708, nll_loss=2.173, ppl=4.51, wps=533074, ups=1.08, wpb=491987, bsz=16267.3, num_updates=33800, lr=0.00034401, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=22.4, wall=31298 epoch 021: 71 / 1689 loss=3.708, nll_loss=2.173, ppl=4.51, wps=533074, ups=1.08, wpb=491987, bsz=16267.3, num_updates=33800, lr=0.00034401, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=22.4, wall=31298 epoch 021: 71 / 1689 loss=3.708, nll_loss=2.173, ppl=4.51, wps=533074, ups=1.08, wpb=491987, bsz=16267.3, num_updates=33800, lr=0.00034401, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=22.4, wall=31298 epoch 021: 71 / 1689 loss=3.708, nll_loss=2.173, ppl=4.51, wps=533074, ups=1.08, wpb=491987, bsz=16267.3, num_updates=33800, lr=0.00034401, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=22.4, wall=31298 epoch 021: 71 / 1689 loss=3.708, nll_loss=2.173, ppl=4.51, wps=533074, ups=1.08, wpb=491987, bsz=16267.3, num_updates=33800, lr=0.00034401, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=22.4, wall=31298 epoch 021: 71 / 1689 loss=3.708, nll_loss=2.173, ppl=4.51, wps=533074, ups=1.08, wpb=491987, bsz=16267.3, num_updates=33800, lr=0.00034401, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=22.4, wall=31298 epoch 021: 71 / 1689 loss=3.708, nll_loss=2.173, ppl=4.51, wps=533074, ups=1.08, wpb=491987, bsz=16267.3, num_updates=33800, lr=0.00034401, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=22.4, wall=31298 epoch 021: 71 / 1689 loss=3.708, nll_loss=2.173, ppl=4.51, wps=533074, ups=1.08, wpb=491987, bsz=16267.3, num_updates=33800, lr=0.00034401, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=22.4, wall=31298 epoch 021: 71 / 1689 loss=3.708, nll_loss=2.173, ppl=4.51, wps=533074, ups=1.08, wpb=491987, bsz=16267.3, num_updates=33800, lr=0.00034401, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=22.4, wall=31298 epoch 021: 71 / 1689 loss=3.708, nll_loss=2.173, ppl=4.51, wps=533074, ups=1.08, wpb=491987, bsz=16267.3, num_updates=33800, lr=0.00034401, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=22.4, wall=31298 epoch 021: 71 / 1689 loss=3.708, nll_loss=2.173, ppl=4.51, wps=533074, ups=1.08, wpb=491987, bsz=16267.3, num_updates=33800, lr=0.00034401, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=22.4, wall=31298 epoch 021: 71 / 1689 loss=3.708, nll_loss=2.173, ppl=4.51, wps=533074, ups=1.08, wpb=491987, bsz=16267.3, num_updates=33800, lr=0.00034401, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=22.4, wall=31298 epoch 021: 71 / 1689 loss=3.708, nll_loss=2.173, ppl=4.51, wps=533074, ups=1.08, wpb=491987, bsz=16267.3, num_updates=33800, lr=0.00034401, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=22.4, wall=31298 epoch 021: 71 / 1689 loss=3.708, nll_loss=2.173, ppl=4.51, wps=533074, ups=1.08, wpb=491987, bsz=16267.3, num_updates=33800, lr=0.00034401, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=22.4, wall=31298 epoch 021: 71 / 1689 loss=3.708, nll_loss=2.173, ppl=4.51, wps=533074, ups=1.08, wpb=491987, bsz=16267.3, num_updates=33800, lr=0.00034401, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=22.4, wall=31298 epoch 021: 71 / 1689 loss=3.708, nll_loss=2.173, ppl=4.51, wps=533074, ups=1.08, wpb=491987, bsz=16267.3, num_updates=33800, lr=0.00034401, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=22.4, wall=31298 epoch 021: 71 / 1689 loss=3.708, nll_loss=2.173, ppl=4.51, wps=533074, ups=1.08, wpb=491987, bsz=16267.3, num_updates=33800, lr=0.00034401, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=22.4, wall=31298 epoch 021: 71 / 1689 loss=3.708, nll_loss=2.173, ppl=4.51, wps=533074, ups=1.08, wpb=491987, bsz=16267.3, num_updates=33800, lr=0.00034401, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=22.4, wall=31298 epoch 021: 71 / 1689 loss=3.708, nll_loss=2.173, ppl=4.51, wps=533074, ups=1.08, wpb=491987, bsz=16267.3, num_updates=33800, lr=0.00034401, gnorm=0.181, clip=0, loss_scale=4, train_wall=89, gb_free=22.4, wall=31298 epoch 021: 171 / 1689 loss=3.707, nll_loss=2.172, ppl=4.51, wps=546282, ups=1.1, wpb=495090, bsz=16438.2, num_updates=33900, lr=0.000343503, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=22.2, wall=31389 epoch 021: 171 / 1689 loss=3.707, nll_loss=2.172, ppl=4.51, wps=546282, ups=1.1, wpb=495090, bsz=16438.2, num_updates=33900, lr=0.000343503, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=22.2, wall=31389 epoch 021: 171 / 1689 loss=3.707, nll_loss=2.172, ppl=4.51, wps=546282, ups=1.1, wpb=495090, bsz=16438.2, num_updates=33900, lr=0.000343503, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=22.2, wall=31389 epoch 021: 171 / 1689 loss=3.707, nll_loss=2.172, ppl=4.51, wps=546282, ups=1.1, wpb=495090, bsz=16438.2, num_updates=33900, lr=0.000343503, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=22.2, wall=31389 epoch 021: 171 / 1689 loss=3.707, nll_loss=2.172, ppl=4.51, wps=546282, ups=1.1, wpb=495090, bsz=16438.2, num_updates=33900, lr=0.000343503, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=22.2, wall=31389 epoch 021: 171 / 1689 loss=3.707, nll_loss=2.172, ppl=4.51, wps=546282, ups=1.1, wpb=495090, bsz=16438.2, num_updates=33900, lr=0.000343503, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=22.2, wall=31389 epoch 021: 171 / 1689 loss=3.707, nll_loss=2.172, ppl=4.51, wps=546282, ups=1.1, wpb=495090, bsz=16438.2, num_updates=33900, lr=0.000343503, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=22.2, wall=31389 epoch 021: 171 / 1689 loss=3.707, nll_loss=2.172, ppl=4.51, wps=546282, ups=1.1, wpb=495090, bsz=16438.2, num_updates=33900, lr=0.000343503, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=22.2, wall=31389 epoch 021: 171 / 1689 loss=3.707, nll_loss=2.172, ppl=4.51, wps=546282, ups=1.1, wpb=495090, bsz=16438.2, num_updates=33900, lr=0.000343503, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=22.2, wall=31389 epoch 021: 171 / 1689 loss=3.707, nll_loss=2.172, ppl=4.51, wps=546282, ups=1.1, wpb=495090, bsz=16438.2, num_updates=33900, lr=0.000343503, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=22.2, wall=31389 epoch 021: 171 / 1689 loss=3.707, nll_loss=2.172, ppl=4.51, wps=546282, ups=1.1, wpb=495090, bsz=16438.2, num_updates=33900, lr=0.000343503, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=22.2, wall=31389 epoch 021: 171 / 1689 loss=3.707, nll_loss=2.172, ppl=4.51, wps=546282, ups=1.1, wpb=495090, bsz=16438.2, num_updates=33900, lr=0.000343503, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=22.2, wall=31389 epoch 021: 171 / 1689 loss=3.707, nll_loss=2.172, ppl=4.51, wps=546282, ups=1.1, wpb=495090, bsz=16438.2, num_updates=33900, lr=0.000343503, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=22.2, wall=31389 epoch 021: 171 / 1689 loss=3.707, nll_loss=2.172, ppl=4.51, wps=546282, ups=1.1, wpb=495090, bsz=16438.2, num_updates=33900, lr=0.000343503, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=22.2, wall=31389 epoch 021: 171 / 1689 loss=3.707, nll_loss=2.172, ppl=4.51, wps=546282, ups=1.1, wpb=495090, bsz=16438.2, num_updates=33900, lr=0.000343503, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=22.2, wall=31389 epoch 021: 171 / 1689 loss=3.707, nll_loss=2.172, ppl=4.51, wps=546282, ups=1.1, wpb=495090, bsz=16438.2, num_updates=33900, lr=0.000343503, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=22.2, wall=31389 epoch 021: 171 / 1689 loss=3.707, nll_loss=2.172, ppl=4.51, wps=546282, ups=1.1, wpb=495090, bsz=16438.2, num_updates=33900, lr=0.000343503, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=22.2, wall=31389 epoch 021: 171 / 1689 loss=3.707, nll_loss=2.172, ppl=4.51, wps=546282, ups=1.1, wpb=495090, bsz=16438.2, num_updates=33900, lr=0.000343503, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=22.2, wall=31389 epoch 021: 171 / 1689 loss=3.707, nll_loss=2.172, ppl=4.51, wps=546282, ups=1.1, wpb=495090, bsz=16438.2, num_updates=33900, lr=0.000343503, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=22.2, wall=31389 epoch 021: 171 / 1689 loss=3.707, nll_loss=2.172, ppl=4.51, wps=546282, ups=1.1, wpb=495090, bsz=16438.2, num_updates=33900, lr=0.000343503, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=22.2, wall=31389 epoch 021: 171 / 1689 loss=3.707, nll_loss=2.172, ppl=4.51, wps=546282, ups=1.1, wpb=495090, bsz=16438.2, num_updates=33900, lr=0.000343503, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=22.2, wall=31389 epoch 021: 273 / 1689 loss=3.712, nll_loss=2.178, ppl=4.53, wps=535935, ups=1.08, wpb=495329, bsz=16540.3, num_updates=34000, lr=0.000342997, gnorm=0.171, clip=0, loss_scale=1, train_wall=90, gb_free=21.9, wall=31481 epoch 021: 273 / 1689 loss=3.712, nll_loss=2.178, ppl=4.53, wps=535935, ups=1.08, wpb=495329, bsz=16540.3, num_updates=34000, lr=0.000342997, gnorm=0.171, clip=0, loss_scale=1, train_wall=90, gb_free=21.9, wall=31481 epoch 021: 273 / 1689 loss=3.712, nll_loss=2.178, ppl=4.53, wps=535935, ups=1.08, wpb=495329, bsz=16540.3, num_updates=34000, lr=0.000342997, gnorm=0.171, clip=0, loss_scale=1, train_wall=90, gb_free=21.9, wall=31481 epoch 021: 273 / 1689 loss=3.712, nll_loss=2.178, ppl=4.53, wps=535935, ups=1.08, wpb=495329, bsz=16540.3, num_updates=34000, lr=0.000342997, gnorm=0.171, clip=0, loss_scale=1, train_wall=90, gb_free=21.9, wall=31481 epoch 021: 273 / 1689 loss=3.712, nll_loss=2.178, ppl=4.53, wps=535935, ups=1.08, wpb=495329, bsz=16540.3, num_updates=34000, lr=0.000342997, gnorm=0.171, clip=0, loss_scale=1, train_wall=90, gb_free=21.9, wall=31481 epoch 021: 273 / 1689 loss=3.712, nll_loss=2.178, ppl=4.53, wps=535935, ups=1.08, wpb=495329, bsz=16540.3, num_updates=34000, lr=0.000342997, gnorm=0.171, clip=0, loss_scale=1, train_wall=90, gb_free=21.9, wall=31481 epoch 021: 273 / 1689 loss=3.712, nll_loss=2.178, ppl=4.53, wps=535935, ups=1.08, wpb=495329, bsz=16540.3, num_updates=34000, lr=0.000342997, gnorm=0.171, clip=0, loss_scale=1, train_wall=90, gb_free=21.9, wall=31481 epoch 021: 273 / 1689 loss=3.712, nll_loss=2.178, ppl=4.53, wps=535935, ups=1.08, wpb=495329, bsz=16540.3, num_updates=34000, lr=0.000342997, gnorm=0.171, clip=0, loss_scale=1, train_wall=90, gb_free=21.9, wall=31481 epoch 021: 273 / 1689 loss=3.712, nll_loss=2.178, ppl=4.53, wps=535935, ups=1.08, wpb=495329, bsz=16540.3, num_updates=34000, lr=0.000342997, gnorm=0.171, clip=0, loss_scale=1, train_wall=90, gb_free=21.9, wall=31481 epoch 021: 273 / 1689 loss=3.712, nll_loss=2.178, ppl=4.53, wps=535935, ups=1.08, wpb=495329, bsz=16540.3, num_updates=34000, lr=0.000342997, gnorm=0.171, clip=0, loss_scale=1, train_wall=90, gb_free=21.9, wall=31481 epoch 021: 273 / 1689 loss=3.712, nll_loss=2.178, ppl=4.53, wps=535935, ups=1.08, wpb=495329, bsz=16540.3, num_updates=34000, lr=0.000342997, gnorm=0.171, clip=0, loss_scale=1, train_wall=90, gb_free=21.9, wall=31481 epoch 021: 273 / 1689 loss=3.712, nll_loss=2.178, ppl=4.53, wps=535935, ups=1.08, wpb=495329, bsz=16540.3, num_updates=34000, lr=0.000342997, gnorm=0.171, clip=0, loss_scale=1, train_wall=90, gb_free=21.9, wall=31481 epoch 021: 273 / 1689 loss=3.712, nll_loss=2.178, ppl=4.53, wps=535935, ups=1.08, wpb=495329, bsz=16540.3, num_updates=34000, lr=0.000342997, gnorm=0.171, clip=0, loss_scale=1, train_wall=90, gb_free=21.9, wall=31481 epoch 021: 273 / 1689 loss=3.712, nll_loss=2.178, ppl=4.53, wps=535935, ups=1.08, wpb=495329, bsz=16540.3, num_updates=34000, lr=0.000342997, gnorm=0.171, clip=0, loss_scale=1, train_wall=90, gb_free=21.9, wall=31481 epoch 021: 273 / 1689 loss=3.712, nll_loss=2.178, ppl=4.53, wps=535935, ups=1.08, wpb=495329, bsz=16540.3, num_updates=34000, lr=0.000342997, gnorm=0.171, clip=0, loss_scale=1, train_wall=90, gb_free=21.9, wall=31481 epoch 021: 273 / 1689 loss=3.712, nll_loss=2.178, ppl=4.53, wps=535935, ups=1.08, wpb=495329, bsz=16540.3, num_updates=34000, lr=0.000342997, gnorm=0.171, clip=0, loss_scale=1, train_wall=90, gb_free=21.9, wall=31481 epoch 021: 273 / 1689 loss=3.712, nll_loss=2.178, ppl=4.53, wps=535935, ups=1.08, wpb=495329, bsz=16540.3, num_updates=34000, lr=0.000342997, gnorm=0.171, clip=0, loss_scale=1, train_wall=90, gb_free=21.9, wall=31481 epoch 021: 273 / 1689 loss=3.712, nll_loss=2.178, ppl=4.53, wps=535935, ups=1.08, wpb=495329, bsz=16540.3, num_updates=34000, lr=0.000342997, gnorm=0.171, clip=0, loss_scale=1, train_wall=90, gb_free=21.9, wall=31481 epoch 021: 273 / 1689 loss=3.712, nll_loss=2.178, ppl=4.53, wps=535935, ups=1.08, wpb=495329, bsz=16540.3, num_updates=34000, lr=0.000342997, gnorm=0.171, clip=0, loss_scale=1, train_wall=90, gb_free=21.9, wall=31481 epoch 021: 273 / 1689 loss=3.712, nll_loss=2.178, ppl=4.53, wps=535935, ups=1.08, wpb=495329, bsz=16540.3, num_updates=34000, lr=0.000342997, gnorm=0.171, clip=0, loss_scale=1, train_wall=90, gb_free=21.9, wall=31481 epoch 021: 273 / 1689 loss=3.712, nll_loss=2.178, ppl=4.53, wps=535935, ups=1.08, wpb=495329, bsz=16540.3, num_updates=34000, lr=0.000342997, gnorm=0.171, clip=0, loss_scale=1, train_wall=90, gb_free=21.9, wall=31481 begin validation on "valid" subset epoch 021 | valid on 'valid' subset | loss 3.73 | nll_loss 2.165 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.73 epoch 021 | valid on 'valid' subset | loss 3.73 | nll_loss 2.165 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.73 epoch 021 | valid on 'valid' subset | loss 3.73 | nll_loss 2.165 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.73 epoch 021 | valid on 'valid' subset | loss 3.73 | nll_loss 2.165 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.73 epoch 021 | valid on 'valid' subset | loss 3.73 | nll_loss 2.165 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.73 epoch 021 | valid on 'valid' subset | loss 3.73 | nll_loss 2.165 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.73 epoch 021 | valid on 'valid' subset | loss 3.73 | nll_loss 2.165 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.73 epoch 021 | valid on 'valid' subset | loss 3.73 | nll_loss 2.165 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.73 epoch 021 | valid on 'valid' subset | loss 3.73 | nll_loss 2.165 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.73 epoch 021 | valid on 'valid' subset | loss 3.73 | nll_loss 2.165 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.73 epoch 021 | valid on 'valid' subset | loss 3.73 | nll_loss 2.165 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.73 epoch 021 | valid on 'valid' subset | loss 3.73 | nll_loss 2.165 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.73 epoch 021 | valid on 'valid' subset | loss 3.73 | nll_loss 2.165 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.73 epoch 021 | valid on 'valid' subset | loss 3.73 | nll_loss 2.165 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.73 epoch 021 | valid on 'valid' subset | loss 3.73 | nll_loss 2.165 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.73 epoch 021 | valid on 'valid' subset | loss 3.73 | nll_loss 2.165 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.73 epoch 021 | valid on 'valid' subset | loss 3.73 | nll_loss 2.165 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.73 epoch 021 | valid on 'valid' subset | loss 3.73 | nll_loss 2.165 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.73 epoch 021 | valid on 'valid' subset | loss 3.73 | nll_loss 2.165 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.73 epoch 021 | valid on 'valid' subset | loss 3.73 | nll_loss 2.165 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.73 epoch 021 | valid on 'valid' subset | loss 3.73 | nll_loss 2.165 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 34000 | best_loss 3.73 epoch 021: 373 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=454108, ups=0.92, wpb=494054, bsz=16774.6, num_updates=34100, lr=0.000342494, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31590 epoch 021: 373 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=454108, ups=0.92, wpb=494054, bsz=16774.6, num_updates=34100, lr=0.000342494, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31590 epoch 021: 373 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=454108, ups=0.92, wpb=494054, bsz=16774.6, num_updates=34100, lr=0.000342494, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31590 epoch 021: 373 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=454108, ups=0.92, wpb=494054, bsz=16774.6, num_updates=34100, lr=0.000342494, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31590 epoch 021: 373 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=454108, ups=0.92, wpb=494054, bsz=16774.6, num_updates=34100, lr=0.000342494, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31590 epoch 021: 373 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=454108, ups=0.92, wpb=494054, bsz=16774.6, num_updates=34100, lr=0.000342494, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31590 epoch 021: 373 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=454108, ups=0.92, wpb=494054, bsz=16774.6, num_updates=34100, lr=0.000342494, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31590 epoch 021: 373 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=454108, ups=0.92, wpb=494054, bsz=16774.6, num_updates=34100, lr=0.000342494, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31590 epoch 021: 373 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=454108, ups=0.92, wpb=494054, bsz=16774.6, num_updates=34100, lr=0.000342494, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31590 epoch 021: 373 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=454108, ups=0.92, wpb=494054, bsz=16774.6, num_updates=34100, lr=0.000342494, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31590 epoch 021: 373 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=454108, ups=0.92, wpb=494054, bsz=16774.6, num_updates=34100, lr=0.000342494, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31590 epoch 021: 373 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=454108, ups=0.92, wpb=494054, bsz=16774.6, num_updates=34100, lr=0.000342494, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31590 epoch 021: 373 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=454108, ups=0.92, wpb=494054, bsz=16774.6, num_updates=34100, lr=0.000342494, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31590 epoch 021: 373 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=454108, ups=0.92, wpb=494054, bsz=16774.6, num_updates=34100, lr=0.000342494, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31590 epoch 021: 373 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=454108, ups=0.92, wpb=494054, bsz=16774.6, num_updates=34100, lr=0.000342494, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31590 epoch 021: 373 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=454108, ups=0.92, wpb=494054, bsz=16774.6, num_updates=34100, lr=0.000342494, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31590 epoch 021: 373 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=454108, ups=0.92, wpb=494054, bsz=16774.6, num_updates=34100, lr=0.000342494, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31590 epoch 021: 373 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=454108, ups=0.92, wpb=494054, bsz=16774.6, num_updates=34100, lr=0.000342494, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31590 epoch 021: 373 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=454108, ups=0.92, wpb=494054, bsz=16774.6, num_updates=34100, lr=0.000342494, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31590 epoch 021: 373 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=454108, ups=0.92, wpb=494054, bsz=16774.6, num_updates=34100, lr=0.000342494, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31590 epoch 021: 373 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=454108, ups=0.92, wpb=494054, bsz=16774.6, num_updates=34100, lr=0.000342494, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31590 epoch 021: 473 / 1689 loss=3.711, nll_loss=2.177, ppl=4.52, wps=546502, ups=1.1, wpb=494689, bsz=16751.4, num_updates=34200, lr=0.000341993, gnorm=0.174, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=31681 epoch 021: 473 / 1689 loss=3.711, nll_loss=2.177, ppl=4.52, wps=546502, ups=1.1, wpb=494689, bsz=16751.4, num_updates=34200, lr=0.000341993, gnorm=0.174, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=31681 epoch 021: 473 / 1689 loss=3.711, nll_loss=2.177, ppl=4.52, wps=546502, ups=1.1, wpb=494689, bsz=16751.4, num_updates=34200, lr=0.000341993, gnorm=0.174, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=31681 epoch 021: 473 / 1689 loss=3.711, nll_loss=2.177, ppl=4.52, wps=546502, ups=1.1, wpb=494689, bsz=16751.4, num_updates=34200, lr=0.000341993, gnorm=0.174, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=31681 epoch 021: 473 / 1689 loss=3.711, nll_loss=2.177, ppl=4.52, wps=546502, ups=1.1, wpb=494689, bsz=16751.4, num_updates=34200, lr=0.000341993, gnorm=0.174, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=31681 epoch 021: 473 / 1689 loss=3.711, nll_loss=2.177, ppl=4.52, wps=546502, ups=1.1, wpb=494689, bsz=16751.4, num_updates=34200, lr=0.000341993, gnorm=0.174, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=31681 epoch 021: 473 / 1689 loss=3.711, nll_loss=2.177, ppl=4.52, wps=546502, ups=1.1, wpb=494689, bsz=16751.4, num_updates=34200, lr=0.000341993, gnorm=0.174, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=31681 epoch 021: 473 / 1689 loss=3.711, nll_loss=2.177, ppl=4.52, wps=546502, ups=1.1, wpb=494689, bsz=16751.4, num_updates=34200, lr=0.000341993, gnorm=0.174, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=31681 epoch 021: 473 / 1689 loss=3.711, nll_loss=2.177, ppl=4.52, wps=546502, ups=1.1, wpb=494689, bsz=16751.4, num_updates=34200, lr=0.000341993, gnorm=0.174, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=31681 epoch 021: 473 / 1689 loss=3.711, nll_loss=2.177, ppl=4.52, wps=546502, ups=1.1, wpb=494689, bsz=16751.4, num_updates=34200, lr=0.000341993, gnorm=0.174, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=31681 epoch 021: 473 / 1689 loss=3.711, nll_loss=2.177, ppl=4.52, wps=546502, ups=1.1, wpb=494689, bsz=16751.4, num_updates=34200, lr=0.000341993, gnorm=0.174, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=31681 epoch 021: 473 / 1689 loss=3.711, nll_loss=2.177, ppl=4.52, wps=546502, ups=1.1, wpb=494689, bsz=16751.4, num_updates=34200, lr=0.000341993, gnorm=0.174, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=31681 epoch 021: 473 / 1689 loss=3.711, nll_loss=2.177, ppl=4.52, wps=546502, ups=1.1, wpb=494689, bsz=16751.4, num_updates=34200, lr=0.000341993, gnorm=0.174, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=31681 epoch 021: 473 / 1689 loss=3.711, nll_loss=2.177, ppl=4.52, wps=546502, ups=1.1, wpb=494689, bsz=16751.4, num_updates=34200, lr=0.000341993, gnorm=0.174, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=31681 epoch 021: 473 / 1689 loss=3.711, nll_loss=2.177, ppl=4.52, wps=546502, ups=1.1, wpb=494689, bsz=16751.4, num_updates=34200, lr=0.000341993, gnorm=0.174, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=31681 epoch 021: 473 / 1689 loss=3.711, nll_loss=2.177, ppl=4.52, wps=546502, ups=1.1, wpb=494689, bsz=16751.4, num_updates=34200, lr=0.000341993, gnorm=0.174, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=31681 epoch 021: 473 / 1689 loss=3.711, nll_loss=2.177, ppl=4.52, wps=546502, ups=1.1, wpb=494689, bsz=16751.4, num_updates=34200, lr=0.000341993, gnorm=0.174, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=31681 epoch 021: 473 / 1689 loss=3.711, nll_loss=2.177, ppl=4.52, wps=546502, ups=1.1, wpb=494689, bsz=16751.4, num_updates=34200, lr=0.000341993, gnorm=0.174, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=31681 epoch 021: 473 / 1689 loss=3.711, nll_loss=2.177, ppl=4.52, wps=546502, ups=1.1, wpb=494689, bsz=16751.4, num_updates=34200, lr=0.000341993, gnorm=0.174, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=31681 epoch 021: 473 / 1689 loss=3.711, nll_loss=2.177, ppl=4.52, wps=546502, ups=1.1, wpb=494689, bsz=16751.4, num_updates=34200, lr=0.000341993, gnorm=0.174, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=31681 epoch 021: 473 / 1689 loss=3.711, nll_loss=2.177, ppl=4.52, wps=546502, ups=1.1, wpb=494689, bsz=16751.4, num_updates=34200, lr=0.000341993, gnorm=0.174, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=31681 epoch 021: 573 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=553024, ups=1.12, wpb=494700, bsz=16606.2, num_updates=34300, lr=0.000341494, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31770 epoch 021: 573 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=553024, ups=1.12, wpb=494700, bsz=16606.2, num_updates=34300, lr=0.000341494, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31770 epoch 021: 573 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=553024, ups=1.12, wpb=494700, bsz=16606.2, num_updates=34300, lr=0.000341494, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31770 epoch 021: 573 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=553024, ups=1.12, wpb=494700, bsz=16606.2, num_updates=34300, lr=0.000341494, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31770 epoch 021: 573 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=553024, ups=1.12, wpb=494700, bsz=16606.2, num_updates=34300, lr=0.000341494, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31770 epoch 021: 573 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=553024, ups=1.12, wpb=494700, bsz=16606.2, num_updates=34300, lr=0.000341494, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31770 epoch 021: 573 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=553024, ups=1.12, wpb=494700, bsz=16606.2, num_updates=34300, lr=0.000341494, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31770 epoch 021: 573 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=553024, ups=1.12, wpb=494700, bsz=16606.2, num_updates=34300, lr=0.000341494, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31770 epoch 021: 573 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=553024, ups=1.12, wpb=494700, bsz=16606.2, num_updates=34300, lr=0.000341494, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31770 epoch 021: 573 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=553024, ups=1.12, wpb=494700, bsz=16606.2, num_updates=34300, lr=0.000341494, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31770 epoch 021: 573 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=553024, ups=1.12, wpb=494700, bsz=16606.2, num_updates=34300, lr=0.000341494, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31770 epoch 021: 573 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=553024, ups=1.12, wpb=494700, bsz=16606.2, num_updates=34300, lr=0.000341494, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31770 epoch 021: 573 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=553024, ups=1.12, wpb=494700, bsz=16606.2, num_updates=34300, lr=0.000341494, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31770 epoch 021: 573 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=553024, ups=1.12, wpb=494700, bsz=16606.2, num_updates=34300, lr=0.000341494, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31770 epoch 021: 573 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=553024, ups=1.12, wpb=494700, bsz=16606.2, num_updates=34300, lr=0.000341494, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31770 epoch 021: 573 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=553024, ups=1.12, wpb=494700, bsz=16606.2, num_updates=34300, lr=0.000341494, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31770 epoch 021: 573 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=553024, ups=1.12, wpb=494700, bsz=16606.2, num_updates=34300, lr=0.000341494, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31770 epoch 021: 573 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=553024, ups=1.12, wpb=494700, bsz=16606.2, num_updates=34300, lr=0.000341494, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31770 epoch 021: 573 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=553024, ups=1.12, wpb=494700, bsz=16606.2, num_updates=34300, lr=0.000341494, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31770 epoch 021: 573 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=553024, ups=1.12, wpb=494700, bsz=16606.2, num_updates=34300, lr=0.000341494, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31770 epoch 021: 573 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=553024, ups=1.12, wpb=494700, bsz=16606.2, num_updates=34300, lr=0.000341494, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=31770 epoch 021: 673 / 1689 loss=3.718, nll_loss=2.185, ppl=4.55, wps=549875, ups=1.11, wpb=495287, bsz=16496, num_updates=34400, lr=0.000340997, gnorm=0.173, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=31860 epoch 021: 673 / 1689 loss=3.718, nll_loss=2.185, ppl=4.55, wps=549875, ups=1.11, wpb=495287, bsz=16496, num_updates=34400, lr=0.000340997, gnorm=0.173, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=31860 epoch 021: 673 / 1689 loss=3.718, nll_loss=2.185, ppl=4.55, wps=549875, ups=1.11, wpb=495287, bsz=16496, num_updates=34400, lr=0.000340997, gnorm=0.173, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=31860 epoch 021: 673 / 1689 loss=3.718, nll_loss=2.185, ppl=4.55, wps=549875, ups=1.11, wpb=495287, bsz=16496, num_updates=34400, lr=0.000340997, gnorm=0.173, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=31860 epoch 021: 673 / 1689 loss=3.718, nll_loss=2.185, ppl=4.55, wps=549875, ups=1.11, wpb=495287, bsz=16496, num_updates=34400, lr=0.000340997, gnorm=0.173, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=31860 epoch 021: 673 / 1689 loss=3.718, nll_loss=2.185, ppl=4.55, wps=549875, ups=1.11, wpb=495287, bsz=16496, num_updates=34400, lr=0.000340997, gnorm=0.173, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=31860 epoch 021: 673 / 1689 loss=3.718, nll_loss=2.185, ppl=4.55, wps=549875, ups=1.11, wpb=495287, bsz=16496, num_updates=34400, lr=0.000340997, gnorm=0.173, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=31860 epoch 021: 673 / 1689 loss=3.718, nll_loss=2.185, ppl=4.55, wps=549875, ups=1.11, wpb=495287, bsz=16496, num_updates=34400, lr=0.000340997, gnorm=0.173, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=31860 epoch 021: 673 / 1689 loss=3.718, nll_loss=2.185, ppl=4.55, wps=549875, ups=1.11, wpb=495287, bsz=16496, num_updates=34400, lr=0.000340997, gnorm=0.173, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=31860 epoch 021: 673 / 1689 loss=3.718, nll_loss=2.185, ppl=4.55, wps=549875, ups=1.11, wpb=495287, bsz=16496, num_updates=34400, lr=0.000340997, gnorm=0.173, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=31860 epoch 021: 673 / 1689 loss=3.718, nll_loss=2.185, ppl=4.55, wps=549875, ups=1.11, wpb=495287, bsz=16496, num_updates=34400, lr=0.000340997, gnorm=0.173, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=31860 epoch 021: 673 / 1689 loss=3.718, nll_loss=2.185, ppl=4.55, wps=549875, ups=1.11, wpb=495287, bsz=16496, num_updates=34400, lr=0.000340997, gnorm=0.173, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=31860 epoch 021: 673 / 1689 loss=3.718, nll_loss=2.185, ppl=4.55, wps=549875, ups=1.11, wpb=495287, bsz=16496, num_updates=34400, lr=0.000340997, gnorm=0.173, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=31860 epoch 021: 673 / 1689 loss=3.718, nll_loss=2.185, ppl=4.55, wps=549875, ups=1.11, wpb=495287, bsz=16496, num_updates=34400, lr=0.000340997, gnorm=0.173, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=31860 epoch 021: 673 / 1689 loss=3.718, nll_loss=2.185, ppl=4.55, wps=549875, ups=1.11, wpb=495287, bsz=16496, num_updates=34400, lr=0.000340997, gnorm=0.173, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=31860 epoch 021: 673 / 1689 loss=3.718, nll_loss=2.185, ppl=4.55, wps=549875, ups=1.11, wpb=495287, bsz=16496, num_updates=34400, lr=0.000340997, gnorm=0.173, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=31860 epoch 021: 673 / 1689 loss=3.718, nll_loss=2.185, ppl=4.55, wps=549875, ups=1.11, wpb=495287, bsz=16496, num_updates=34400, lr=0.000340997, gnorm=0.173, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=31860 epoch 021: 673 / 1689 loss=3.718, nll_loss=2.185, ppl=4.55, wps=549875, ups=1.11, wpb=495287, bsz=16496, num_updates=34400, lr=0.000340997, gnorm=0.173, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=31860 epoch 021: 673 / 1689 loss=3.718, nll_loss=2.185, ppl=4.55, wps=549875, ups=1.11, wpb=495287, bsz=16496, num_updates=34400, lr=0.000340997, gnorm=0.173, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=31860 epoch 021: 673 / 1689 loss=3.718, nll_loss=2.185, ppl=4.55, wps=549875, ups=1.11, wpb=495287, bsz=16496, num_updates=34400, lr=0.000340997, gnorm=0.173, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=31860 epoch 021: 673 / 1689 loss=3.718, nll_loss=2.185, ppl=4.55, wps=549875, ups=1.11, wpb=495287, bsz=16496, num_updates=34400, lr=0.000340997, gnorm=0.173, clip=0, loss_scale=1, train_wall=88, gb_free=21.6, wall=31860 epoch 021: 773 / 1689 loss=3.72, nll_loss=2.188, ppl=4.56, wps=547696, ups=1.11, wpb=494992, bsz=16660.7, num_updates=34500, lr=0.000340503, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=31951 epoch 021: 773 / 1689 loss=3.72, nll_loss=2.188, ppl=4.56, wps=547696, ups=1.11, wpb=494992, bsz=16660.7, num_updates=34500, lr=0.000340503, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=31951 epoch 021: 773 / 1689 loss=3.72, nll_loss=2.188, ppl=4.56, wps=547696, ups=1.11, wpb=494992, bsz=16660.7, num_updates=34500, lr=0.000340503, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=31951 epoch 021: 773 / 1689 loss=3.72, nll_loss=2.188, ppl=4.56, wps=547696, ups=1.11, wpb=494992, bsz=16660.7, num_updates=34500, lr=0.000340503, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=31951 epoch 021: 773 / 1689 loss=3.72, nll_loss=2.188, ppl=4.56, wps=547696, ups=1.11, wpb=494992, bsz=16660.7, num_updates=34500, lr=0.000340503, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=31951 epoch 021: 773 / 1689 loss=3.72, nll_loss=2.188, ppl=4.56, wps=547696, ups=1.11, wpb=494992, bsz=16660.7, num_updates=34500, lr=0.000340503, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=31951 epoch 021: 773 / 1689 loss=3.72, nll_loss=2.188, ppl=4.56, wps=547696, ups=1.11, wpb=494992, bsz=16660.7, num_updates=34500, lr=0.000340503, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=31951 epoch 021: 773 / 1689 loss=3.72, nll_loss=2.188, ppl=4.56, wps=547696, ups=1.11, wpb=494992, bsz=16660.7, num_updates=34500, lr=0.000340503, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=31951 epoch 021: 773 / 1689 loss=3.72, nll_loss=2.188, ppl=4.56, wps=547696, ups=1.11, wpb=494992, bsz=16660.7, num_updates=34500, lr=0.000340503, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=31951 epoch 021: 773 / 1689 loss=3.72, nll_loss=2.188, ppl=4.56, wps=547696, ups=1.11, wpb=494992, bsz=16660.7, num_updates=34500, lr=0.000340503, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=31951 epoch 021: 773 / 1689 loss=3.72, nll_loss=2.188, ppl=4.56, wps=547696, ups=1.11, wpb=494992, bsz=16660.7, num_updates=34500, lr=0.000340503, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=31951 epoch 021: 773 / 1689 loss=3.72, nll_loss=2.188, ppl=4.56, wps=547696, ups=1.11, wpb=494992, bsz=16660.7, num_updates=34500, lr=0.000340503, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=31951 epoch 021: 773 / 1689 loss=3.72, nll_loss=2.188, ppl=4.56, wps=547696, ups=1.11, wpb=494992, bsz=16660.7, num_updates=34500, lr=0.000340503, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=31951 epoch 021: 773 / 1689 loss=3.72, nll_loss=2.188, ppl=4.56, wps=547696, ups=1.11, wpb=494992, bsz=16660.7, num_updates=34500, lr=0.000340503, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=31951 epoch 021: 773 / 1689 loss=3.72, nll_loss=2.188, ppl=4.56, wps=547696, ups=1.11, wpb=494992, bsz=16660.7, num_updates=34500, lr=0.000340503, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=31951 epoch 021: 773 / 1689 loss=3.72, nll_loss=2.188, ppl=4.56, wps=547696, ups=1.11, wpb=494992, bsz=16660.7, num_updates=34500, lr=0.000340503, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=31951 epoch 021: 773 / 1689 loss=3.72, nll_loss=2.188, ppl=4.56, wps=547696, ups=1.11, wpb=494992, bsz=16660.7, num_updates=34500, lr=0.000340503, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=31951 epoch 021: 773 / 1689 loss=3.72, nll_loss=2.188, ppl=4.56, wps=547696, ups=1.11, wpb=494992, bsz=16660.7, num_updates=34500, lr=0.000340503, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=31951 epoch 021: 773 / 1689 loss=3.72, nll_loss=2.188, ppl=4.56, wps=547696, ups=1.11, wpb=494992, bsz=16660.7, num_updates=34500, lr=0.000340503, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=31951 epoch 021: 773 / 1689 loss=3.72, nll_loss=2.188, ppl=4.56, wps=547696, ups=1.11, wpb=494992, bsz=16660.7, num_updates=34500, lr=0.000340503, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=31951 epoch 021: 773 / 1689 loss=3.72, nll_loss=2.188, ppl=4.56, wps=547696, ups=1.11, wpb=494992, bsz=16660.7, num_updates=34500, lr=0.000340503, gnorm=0.184, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=31951 epoch 021: 873 / 1689 loss=3.712, nll_loss=2.178, ppl=4.53, wps=548453, ups=1.11, wpb=495201, bsz=16241.3, num_updates=34600, lr=0.00034001, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=32041 epoch 021: 873 / 1689 loss=3.712, nll_loss=2.178, ppl=4.53, wps=548453, ups=1.11, wpb=495201, bsz=16241.3, num_updates=34600, lr=0.00034001, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=32041 epoch 021: 873 / 1689 loss=3.712, nll_loss=2.178, ppl=4.53, wps=548453, ups=1.11, wpb=495201, bsz=16241.3, num_updates=34600, lr=0.00034001, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=32041 epoch 021: 873 / 1689 loss=3.712, nll_loss=2.178, ppl=4.53, wps=548453, ups=1.11, wpb=495201, bsz=16241.3, num_updates=34600, lr=0.00034001, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=32041 epoch 021: 873 / 1689 loss=3.712, nll_loss=2.178, ppl=4.53, wps=548453, ups=1.11, wpb=495201, bsz=16241.3, num_updates=34600, lr=0.00034001, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=32041 epoch 021: 873 / 1689 loss=3.712, nll_loss=2.178, ppl=4.53, wps=548453, ups=1.11, wpb=495201, bsz=16241.3, num_updates=34600, lr=0.00034001, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=32041 epoch 021: 873 / 1689 loss=3.712, nll_loss=2.178, ppl=4.53, wps=548453, ups=1.11, wpb=495201, bsz=16241.3, num_updates=34600, lr=0.00034001, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=32041 epoch 021: 873 / 1689 loss=3.712, nll_loss=2.178, ppl=4.53, wps=548453, ups=1.11, wpb=495201, bsz=16241.3, num_updates=34600, lr=0.00034001, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=32041 epoch 021: 873 / 1689 loss=3.712, nll_loss=2.178, ppl=4.53, wps=548453, ups=1.11, wpb=495201, bsz=16241.3, num_updates=34600, lr=0.00034001, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=32041 epoch 021: 873 / 1689 loss=3.712, nll_loss=2.178, ppl=4.53, wps=548453, ups=1.11, wpb=495201, bsz=16241.3, num_updates=34600, lr=0.00034001, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=32041 epoch 021: 873 / 1689 loss=3.712, nll_loss=2.178, ppl=4.53, wps=548453, ups=1.11, wpb=495201, bsz=16241.3, num_updates=34600, lr=0.00034001, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=32041 epoch 021: 873 / 1689 loss=3.712, nll_loss=2.178, ppl=4.53, wps=548453, ups=1.11, wpb=495201, bsz=16241.3, num_updates=34600, lr=0.00034001, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=32041 epoch 021: 873 / 1689 loss=3.712, nll_loss=2.178, ppl=4.53, wps=548453, ups=1.11, wpb=495201, bsz=16241.3, num_updates=34600, lr=0.00034001, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=32041 epoch 021: 873 / 1689 loss=3.712, nll_loss=2.178, ppl=4.53, wps=548453, ups=1.11, wpb=495201, bsz=16241.3, num_updates=34600, lr=0.00034001, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=32041 epoch 021: 873 / 1689 loss=3.712, nll_loss=2.178, ppl=4.53, wps=548453, ups=1.11, wpb=495201, bsz=16241.3, num_updates=34600, lr=0.00034001, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=32041 epoch 021: 873 / 1689 loss=3.712, nll_loss=2.178, ppl=4.53, wps=548453, ups=1.11, wpb=495201, bsz=16241.3, num_updates=34600, lr=0.00034001, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=32041 epoch 021: 873 / 1689 loss=3.712, nll_loss=2.178, ppl=4.53, wps=548453, ups=1.11, wpb=495201, bsz=16241.3, num_updates=34600, lr=0.00034001, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=32041 epoch 021: 873 / 1689 loss=3.712, nll_loss=2.178, ppl=4.53, wps=548453, ups=1.11, wpb=495201, bsz=16241.3, num_updates=34600, lr=0.00034001, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=32041 epoch 021: 873 / 1689 loss=3.712, nll_loss=2.178, ppl=4.53, wps=548453, ups=1.11, wpb=495201, bsz=16241.3, num_updates=34600, lr=0.00034001, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=32041 epoch 021: 873 / 1689 loss=3.712, nll_loss=2.178, ppl=4.53, wps=548453, ups=1.11, wpb=495201, bsz=16241.3, num_updates=34600, lr=0.00034001, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=32041 epoch 021: 873 / 1689 loss=3.712, nll_loss=2.178, ppl=4.53, wps=548453, ups=1.11, wpb=495201, bsz=16241.3, num_updates=34600, lr=0.00034001, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=32041 epoch 021: 973 / 1689 loss=3.71, nll_loss=2.176, ppl=4.52, wps=551353, ups=1.11, wpb=496588, bsz=16303.4, num_updates=34700, lr=0.00033952, gnorm=0.184, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=32131 epoch 021: 973 / 1689 loss=3.71, nll_loss=2.176, ppl=4.52, wps=551353, ups=1.11, wpb=496588, bsz=16303.4, num_updates=34700, lr=0.00033952, gnorm=0.184, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=32131 epoch 021: 973 / 1689 loss=3.71, nll_loss=2.176, ppl=4.52, wps=551353, ups=1.11, wpb=496588, bsz=16303.4, num_updates=34700, lr=0.00033952, gnorm=0.184, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=32131 epoch 021: 973 / 1689 loss=3.71, nll_loss=2.176, ppl=4.52, wps=551353, ups=1.11, wpb=496588, bsz=16303.4, num_updates=34700, lr=0.00033952, gnorm=0.184, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=32131 epoch 021: 973 / 1689 loss=3.71, nll_loss=2.176, ppl=4.52, wps=551353, ups=1.11, wpb=496588, bsz=16303.4, num_updates=34700, lr=0.00033952, gnorm=0.184, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=32131 epoch 021: 973 / 1689 loss=3.71, nll_loss=2.176, ppl=4.52, wps=551353, ups=1.11, wpb=496588, bsz=16303.4, num_updates=34700, lr=0.00033952, gnorm=0.184, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=32131 epoch 021: 973 / 1689 loss=3.71, nll_loss=2.176, ppl=4.52, wps=551353, ups=1.11, wpb=496588, bsz=16303.4, num_updates=34700, lr=0.00033952, gnorm=0.184, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=32131 epoch 021: 973 / 1689 loss=3.71, nll_loss=2.176, ppl=4.52, wps=551353, ups=1.11, wpb=496588, bsz=16303.4, num_updates=34700, lr=0.00033952, gnorm=0.184, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=32131 epoch 021: 973 / 1689 loss=3.71, nll_loss=2.176, ppl=4.52, wps=551353, ups=1.11, wpb=496588, bsz=16303.4, num_updates=34700, lr=0.00033952, gnorm=0.184, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=32131 epoch 021: 973 / 1689 loss=3.71, nll_loss=2.176, ppl=4.52, wps=551353, ups=1.11, wpb=496588, bsz=16303.4, num_updates=34700, lr=0.00033952, gnorm=0.184, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=32131 epoch 021: 973 / 1689 loss=3.71, nll_loss=2.176, ppl=4.52, wps=551353, ups=1.11, wpb=496588, bsz=16303.4, num_updates=34700, lr=0.00033952, gnorm=0.184, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=32131 epoch 021: 973 / 1689 loss=3.71, nll_loss=2.176, ppl=4.52, wps=551353, ups=1.11, wpb=496588, bsz=16303.4, num_updates=34700, lr=0.00033952, gnorm=0.184, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=32131 epoch 021: 973 / 1689 loss=3.71, nll_loss=2.176, ppl=4.52, wps=551353, ups=1.11, wpb=496588, bsz=16303.4, num_updates=34700, lr=0.00033952, gnorm=0.184, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=32131 epoch 021: 973 / 1689 loss=3.71, nll_loss=2.176, ppl=4.52, wps=551353, ups=1.11, wpb=496588, bsz=16303.4, num_updates=34700, lr=0.00033952, gnorm=0.184, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=32131 epoch 021: 973 / 1689 loss=3.71, nll_loss=2.176, ppl=4.52, wps=551353, ups=1.11, wpb=496588, bsz=16303.4, num_updates=34700, lr=0.00033952, gnorm=0.184, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=32131 epoch 021: 973 / 1689 loss=3.71, nll_loss=2.176, ppl=4.52, wps=551353, ups=1.11, wpb=496588, bsz=16303.4, num_updates=34700, lr=0.00033952, gnorm=0.184, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=32131 epoch 021: 973 / 1689 loss=3.71, nll_loss=2.176, ppl=4.52, wps=551353, ups=1.11, wpb=496588, bsz=16303.4, num_updates=34700, lr=0.00033952, gnorm=0.184, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=32131 epoch 021: 973 / 1689 loss=3.71, nll_loss=2.176, ppl=4.52, wps=551353, ups=1.11, wpb=496588, bsz=16303.4, num_updates=34700, lr=0.00033952, gnorm=0.184, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=32131 epoch 021: 973 / 1689 loss=3.71, nll_loss=2.176, ppl=4.52, wps=551353, ups=1.11, wpb=496588, bsz=16303.4, num_updates=34700, lr=0.00033952, gnorm=0.184, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=32131 epoch 021: 973 / 1689 loss=3.71, nll_loss=2.176, ppl=4.52, wps=551353, ups=1.11, wpb=496588, bsz=16303.4, num_updates=34700, lr=0.00033952, gnorm=0.184, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=32131 epoch 021: 973 / 1689 loss=3.71, nll_loss=2.176, ppl=4.52, wps=551353, ups=1.11, wpb=496588, bsz=16303.4, num_updates=34700, lr=0.00033952, gnorm=0.184, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=32131 epoch 021: 1073 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=546288, ups=1.1, wpb=494941, bsz=16449.5, num_updates=34800, lr=0.000339032, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=32222 epoch 021: 1073 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=546288, ups=1.1, wpb=494941, bsz=16449.5, num_updates=34800, lr=0.000339032, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=32222 epoch 021: 1073 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=546288, ups=1.1, wpb=494941, bsz=16449.5, num_updates=34800, lr=0.000339032, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=32222 epoch 021: 1073 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=546288, ups=1.1, wpb=494941, bsz=16449.5, num_updates=34800, lr=0.000339032, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=32222 epoch 021: 1073 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=546288, ups=1.1, wpb=494941, bsz=16449.5, num_updates=34800, lr=0.000339032, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=32222 epoch 021: 1073 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=546288, ups=1.1, wpb=494941, bsz=16449.5, num_updates=34800, lr=0.000339032, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=32222 epoch 021: 1073 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=546288, ups=1.1, wpb=494941, bsz=16449.5, num_updates=34800, lr=0.000339032, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=32222 epoch 021: 1073 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=546288, ups=1.1, wpb=494941, bsz=16449.5, num_updates=34800, lr=0.000339032, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=32222 epoch 021: 1073 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=546288, ups=1.1, wpb=494941, bsz=16449.5, num_updates=34800, lr=0.000339032, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=32222 epoch 021: 1073 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=546288, ups=1.1, wpb=494941, bsz=16449.5, num_updates=34800, lr=0.000339032, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=32222 epoch 021: 1073 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=546288, ups=1.1, wpb=494941, bsz=16449.5, num_updates=34800, lr=0.000339032, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=32222 epoch 021: 1073 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=546288, ups=1.1, wpb=494941, bsz=16449.5, num_updates=34800, lr=0.000339032, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=32222 epoch 021: 1073 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=546288, ups=1.1, wpb=494941, bsz=16449.5, num_updates=34800, lr=0.000339032, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=32222 epoch 021: 1073 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=546288, ups=1.1, wpb=494941, bsz=16449.5, num_updates=34800, lr=0.000339032, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=32222 epoch 021: 1073 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=546288, ups=1.1, wpb=494941, bsz=16449.5, num_updates=34800, lr=0.000339032, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=32222 epoch 021: 1073 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=546288, ups=1.1, wpb=494941, bsz=16449.5, num_updates=34800, lr=0.000339032, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=32222 epoch 021: 1073 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=546288, ups=1.1, wpb=494941, bsz=16449.5, num_updates=34800, lr=0.000339032, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=32222 epoch 021: 1073 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=546288, ups=1.1, wpb=494941, bsz=16449.5, num_updates=34800, lr=0.000339032, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=32222 epoch 021: 1073 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=546288, ups=1.1, wpb=494941, bsz=16449.5, num_updates=34800, lr=0.000339032, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=32222 epoch 021: 1073 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=546288, ups=1.1, wpb=494941, bsz=16449.5, num_updates=34800, lr=0.000339032, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=32222 epoch 021: 1073 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=546288, ups=1.1, wpb=494941, bsz=16449.5, num_updates=34800, lr=0.000339032, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=32222 epoch 021: 1173 / 1689 loss=3.717, nll_loss=2.184, ppl=4.54, wps=549747, ups=1.11, wpb=495417, bsz=16739.1, num_updates=34900, lr=0.000338546, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=32312 epoch 021: 1173 / 1689 loss=3.717, nll_loss=2.184, ppl=4.54, wps=549747, ups=1.11, wpb=495417, bsz=16739.1, num_updates=34900, lr=0.000338546, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=32312 epoch 021: 1173 / 1689 loss=3.717, nll_loss=2.184, ppl=4.54, wps=549747, ups=1.11, wpb=495417, bsz=16739.1, num_updates=34900, lr=0.000338546, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=32312 epoch 021: 1173 / 1689 loss=3.717, nll_loss=2.184, ppl=4.54, wps=549747, ups=1.11, wpb=495417, bsz=16739.1, num_updates=34900, lr=0.000338546, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=32312 epoch 021: 1173 / 1689 loss=3.717, nll_loss=2.184, ppl=4.54, wps=549747, ups=1.11, wpb=495417, bsz=16739.1, num_updates=34900, lr=0.000338546, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=32312 epoch 021: 1173 / 1689 loss=3.717, nll_loss=2.184, ppl=4.54, wps=549747, ups=1.11, wpb=495417, bsz=16739.1, num_updates=34900, lr=0.000338546, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=32312 epoch 021: 1173 / 1689 loss=3.717, nll_loss=2.184, ppl=4.54, wps=549747, ups=1.11, wpb=495417, bsz=16739.1, num_updates=34900, lr=0.000338546, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=32312 epoch 021: 1173 / 1689 loss=3.717, nll_loss=2.184, ppl=4.54, wps=549747, ups=1.11, wpb=495417, bsz=16739.1, num_updates=34900, lr=0.000338546, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=32312 epoch 021: 1173 / 1689 loss=3.717, nll_loss=2.184, ppl=4.54, wps=549747, ups=1.11, wpb=495417, bsz=16739.1, num_updates=34900, lr=0.000338546, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=32312 epoch 021: 1173 / 1689 loss=3.717, nll_loss=2.184, ppl=4.54, wps=549747, ups=1.11, wpb=495417, bsz=16739.1, num_updates=34900, lr=0.000338546, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=32312 epoch 021: 1173 / 1689 loss=3.717, nll_loss=2.184, ppl=4.54, wps=549747, ups=1.11, wpb=495417, bsz=16739.1, num_updates=34900, lr=0.000338546, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=32312 epoch 021: 1173 / 1689 loss=3.717, nll_loss=2.184, ppl=4.54, wps=549747, ups=1.11, wpb=495417, bsz=16739.1, num_updates=34900, lr=0.000338546, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=32312 epoch 021: 1173 / 1689 loss=3.717, nll_loss=2.184, ppl=4.54, wps=549747, ups=1.11, wpb=495417, bsz=16739.1, num_updates=34900, lr=0.000338546, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=32312 epoch 021: 1173 / 1689 loss=3.717, nll_loss=2.184, ppl=4.54, wps=549747, ups=1.11, wpb=495417, bsz=16739.1, num_updates=34900, lr=0.000338546, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=32312 epoch 021: 1173 / 1689 loss=3.717, nll_loss=2.184, ppl=4.54, wps=549747, ups=1.11, wpb=495417, bsz=16739.1, num_updates=34900, lr=0.000338546, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=32312 epoch 021: 1173 / 1689 loss=3.717, nll_loss=2.184, ppl=4.54, wps=549747, ups=1.11, wpb=495417, bsz=16739.1, num_updates=34900, lr=0.000338546, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=32312 epoch 021: 1173 / 1689 loss=3.717, nll_loss=2.184, ppl=4.54, wps=549747, ups=1.11, wpb=495417, bsz=16739.1, num_updates=34900, lr=0.000338546, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=32312 epoch 021: 1173 / 1689 loss=3.717, nll_loss=2.184, ppl=4.54, wps=549747, ups=1.11, wpb=495417, bsz=16739.1, num_updates=34900, lr=0.000338546, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=32312 epoch 021: 1173 / 1689 loss=3.717, nll_loss=2.184, ppl=4.54, wps=549747, ups=1.11, wpb=495417, bsz=16739.1, num_updates=34900, lr=0.000338546, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=32312 epoch 021: 1173 / 1689 loss=3.717, nll_loss=2.184, ppl=4.54, wps=549747, ups=1.11, wpb=495417, bsz=16739.1, num_updates=34900, lr=0.000338546, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=32312 epoch 021: 1173 / 1689 loss=3.717, nll_loss=2.184, ppl=4.54, wps=549747, ups=1.11, wpb=495417, bsz=16739.1, num_updates=34900, lr=0.000338546, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=32312 epoch 021: 1273 / 1689 loss=3.716, nll_loss=2.183, ppl=4.54, wps=545644, ups=1.1, wpb=494659, bsz=16610.1, num_updates=35000, lr=0.000338062, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=32402 epoch 021: 1273 / 1689 loss=3.716, nll_loss=2.183, ppl=4.54, wps=545644, ups=1.1, wpb=494659, bsz=16610.1, num_updates=35000, lr=0.000338062, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=32402 epoch 021: 1273 / 1689 loss=3.716, nll_loss=2.183, ppl=4.54, wps=545644, ups=1.1, wpb=494659, bsz=16610.1, num_updates=35000, lr=0.000338062, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=32402 epoch 021: 1273 / 1689 loss=3.716, nll_loss=2.183, ppl=4.54, wps=545644, ups=1.1, wpb=494659, bsz=16610.1, num_updates=35000, lr=0.000338062, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=32402 epoch 021: 1273 / 1689 loss=3.716, nll_loss=2.183, ppl=4.54, wps=545644, ups=1.1, wpb=494659, bsz=16610.1, num_updates=35000, lr=0.000338062, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=32402 epoch 021: 1273 / 1689 loss=3.716, nll_loss=2.183, ppl=4.54, wps=545644, ups=1.1, wpb=494659, bsz=16610.1, num_updates=35000, lr=0.000338062, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=32402 epoch 021: 1273 / 1689 loss=3.716, nll_loss=2.183, ppl=4.54, wps=545644, ups=1.1, wpb=494659, bsz=16610.1, num_updates=35000, lr=0.000338062, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=32402 epoch 021: 1273 / 1689 loss=3.716, nll_loss=2.183, ppl=4.54, wps=545644, ups=1.1, wpb=494659, bsz=16610.1, num_updates=35000, lr=0.000338062, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=32402 epoch 021: 1273 / 1689 loss=3.716, nll_loss=2.183, ppl=4.54, wps=545644, ups=1.1, wpb=494659, bsz=16610.1, num_updates=35000, lr=0.000338062, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=32402 epoch 021: 1273 / 1689 loss=3.716, nll_loss=2.183, ppl=4.54, wps=545644, ups=1.1, wpb=494659, bsz=16610.1, num_updates=35000, lr=0.000338062, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=32402 epoch 021: 1273 / 1689 loss=3.716, nll_loss=2.183, ppl=4.54, wps=545644, ups=1.1, wpb=494659, bsz=16610.1, num_updates=35000, lr=0.000338062, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=32402 epoch 021: 1273 / 1689 loss=3.716, nll_loss=2.183, ppl=4.54, wps=545644, ups=1.1, wpb=494659, bsz=16610.1, num_updates=35000, lr=0.000338062, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=32402 epoch 021: 1273 / 1689 loss=3.716, nll_loss=2.183, ppl=4.54, wps=545644, ups=1.1, wpb=494659, bsz=16610.1, num_updates=35000, lr=0.000338062, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=32402 epoch 021: 1273 / 1689 loss=3.716, nll_loss=2.183, ppl=4.54, wps=545644, ups=1.1, wpb=494659, bsz=16610.1, num_updates=35000, lr=0.000338062, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=32402 epoch 021: 1273 / 1689 loss=3.716, nll_loss=2.183, ppl=4.54, wps=545644, ups=1.1, wpb=494659, bsz=16610.1, num_updates=35000, lr=0.000338062, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=32402 epoch 021: 1273 / 1689 loss=3.716, nll_loss=2.183, ppl=4.54, wps=545644, ups=1.1, wpb=494659, bsz=16610.1, num_updates=35000, lr=0.000338062, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=32402 epoch 021: 1273 / 1689 loss=3.716, nll_loss=2.183, ppl=4.54, wps=545644, ups=1.1, wpb=494659, bsz=16610.1, num_updates=35000, lr=0.000338062, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=32402 epoch 021: 1273 / 1689 loss=3.716, nll_loss=2.183, ppl=4.54, wps=545644, ups=1.1, wpb=494659, bsz=16610.1, num_updates=35000, lr=0.000338062, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=32402 epoch 021: 1273 / 1689 loss=3.716, nll_loss=2.183, ppl=4.54, wps=545644, ups=1.1, wpb=494659, bsz=16610.1, num_updates=35000, lr=0.000338062, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=32402 epoch 021: 1273 / 1689 loss=3.716, nll_loss=2.183, ppl=4.54, wps=545644, ups=1.1, wpb=494659, bsz=16610.1, num_updates=35000, lr=0.000338062, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=32402 epoch 021: 1273 / 1689 loss=3.716, nll_loss=2.183, ppl=4.54, wps=545644, ups=1.1, wpb=494659, bsz=16610.1, num_updates=35000, lr=0.000338062, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=32402 begin validation on "valid" subset epoch 021 | valid on 'valid' subset | loss 3.726 | nll_loss 2.164 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.726 epoch 021 | valid on 'valid' subset | loss 3.726 | nll_loss 2.164 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.726 epoch 021 | valid on 'valid' subset | loss 3.726 | nll_loss 2.164 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.726 epoch 021 | valid on 'valid' subset | loss 3.726 | nll_loss 2.164 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.726 epoch 021 | valid on 'valid' subset | loss 3.726 | nll_loss 2.164 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.726 epoch 021 | valid on 'valid' subset | loss 3.726 | nll_loss 2.164 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.726 epoch 021 | valid on 'valid' subset | loss 3.726 | nll_loss 2.164 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.726 epoch 021 | valid on 'valid' subset | loss 3.726 | nll_loss 2.164 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.726 epoch 021 | valid on 'valid' subset | loss 3.726 | nll_loss 2.164 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.726 epoch 021 | valid on 'valid' subset | loss 3.726 | nll_loss 2.164 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.726 epoch 021 | valid on 'valid' subset | loss 3.726 | nll_loss 2.164 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.726 epoch 021 | valid on 'valid' subset | loss 3.726 | nll_loss 2.164 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.726 epoch 021 | valid on 'valid' subset | loss 3.726 | nll_loss 2.164 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.726 epoch 021 | valid on 'valid' subset | loss 3.726 | nll_loss 2.164 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.726 epoch 021 | valid on 'valid' subset | loss 3.726 | nll_loss 2.164 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.726 epoch 021 | valid on 'valid' subset | loss 3.726 | nll_loss 2.164 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.726 epoch 021 | valid on 'valid' subset | loss 3.726 | nll_loss 2.164 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.726 epoch 021 | valid on 'valid' subset | loss 3.726 | nll_loss 2.164 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.726 epoch 021 | valid on 'valid' subset | loss 3.726 | nll_loss 2.164 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.726 epoch 021 | valid on 'valid' subset | loss 3.726 | nll_loss 2.164 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.726 epoch 021 | valid on 'valid' subset | loss 3.726 | nll_loss 2.164 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 35000 | best_loss 3.726 epoch 021: 1373 / 1689 loss=3.717, nll_loss=2.185, ppl=4.55, wps=396389, ups=0.8, wpb=495472, bsz=16203.2, num_updates=35100, lr=0.00033758, gnorm=0.177, clip=0, loss_scale=4, train_wall=100, gb_free=21.7, wall=32527 epoch 021: 1373 / 1689 loss=3.717, nll_loss=2.185, ppl=4.55, wps=396389, ups=0.8, wpb=495472, bsz=16203.2, num_updates=35100, lr=0.00033758, gnorm=0.177, clip=0, loss_scale=4, train_wall=100, gb_free=21.7, wall=32527 epoch 021: 1373 / 1689 loss=3.717, nll_loss=2.185, ppl=4.55, wps=396389, ups=0.8, wpb=495472, bsz=16203.2, num_updates=35100, lr=0.00033758, gnorm=0.177, clip=0, loss_scale=4, train_wall=100, gb_free=21.7, wall=32527 epoch 021: 1373 / 1689 loss=3.717, nll_loss=2.185, ppl=4.55, wps=396389, ups=0.8, wpb=495472, bsz=16203.2, num_updates=35100, lr=0.00033758, gnorm=0.177, clip=0, loss_scale=4, train_wall=100, gb_free=21.7, wall=32527 epoch 021: 1373 / 1689 loss=3.717, nll_loss=2.185, ppl=4.55, wps=396389, ups=0.8, wpb=495472, bsz=16203.2, num_updates=35100, lr=0.00033758, gnorm=0.177, clip=0, loss_scale=4, train_wall=100, gb_free=21.7, wall=32527 epoch 021: 1373 / 1689 loss=3.717, nll_loss=2.185, ppl=4.55, wps=396389, ups=0.8, wpb=495472, bsz=16203.2, num_updates=35100, lr=0.00033758, gnorm=0.177, clip=0, loss_scale=4, train_wall=100, gb_free=21.7, wall=32527 epoch 021: 1373 / 1689 loss=3.717, nll_loss=2.185, ppl=4.55, wps=396389, ups=0.8, wpb=495472, bsz=16203.2, num_updates=35100, lr=0.00033758, gnorm=0.177, clip=0, loss_scale=4, train_wall=100, gb_free=21.7, wall=32527 epoch 021: 1373 / 1689 loss=3.717, nll_loss=2.185, ppl=4.55, wps=396389, ups=0.8, wpb=495472, bsz=16203.2, num_updates=35100, lr=0.00033758, gnorm=0.177, clip=0, loss_scale=4, train_wall=100, gb_free=21.7, wall=32527 epoch 021: 1373 / 1689 loss=3.717, nll_loss=2.185, ppl=4.55, wps=396389, ups=0.8, wpb=495472, bsz=16203.2, num_updates=35100, lr=0.00033758, gnorm=0.177, clip=0, loss_scale=4, train_wall=100, gb_free=21.7, wall=32527 epoch 021: 1373 / 1689 loss=3.717, nll_loss=2.185, ppl=4.55, wps=396389, ups=0.8, wpb=495472, bsz=16203.2, num_updates=35100, lr=0.00033758, gnorm=0.177, clip=0, loss_scale=4, train_wall=100, gb_free=21.7, wall=32527 epoch 021: 1373 / 1689 loss=3.717, nll_loss=2.185, ppl=4.55, wps=396389, ups=0.8, wpb=495472, bsz=16203.2, num_updates=35100, lr=0.00033758, gnorm=0.177, clip=0, loss_scale=4, train_wall=100, gb_free=21.7, wall=32527 epoch 021: 1373 / 1689 loss=3.717, nll_loss=2.185, ppl=4.55, wps=396389, ups=0.8, wpb=495472, bsz=16203.2, num_updates=35100, lr=0.00033758, gnorm=0.177, clip=0, loss_scale=4, train_wall=100, gb_free=21.7, wall=32527 epoch 021: 1373 / 1689 loss=3.717, nll_loss=2.185, ppl=4.55, wps=396389, ups=0.8, wpb=495472, bsz=16203.2, num_updates=35100, lr=0.00033758, gnorm=0.177, clip=0, loss_scale=4, train_wall=100, gb_free=21.7, wall=32527 epoch 021: 1373 / 1689 loss=3.717, nll_loss=2.185, ppl=4.55, wps=396389, ups=0.8, wpb=495472, bsz=16203.2, num_updates=35100, lr=0.00033758, gnorm=0.177, clip=0, loss_scale=4, train_wall=100, gb_free=21.7, wall=32527 epoch 021: 1373 / 1689 loss=3.717, nll_loss=2.185, ppl=4.55, wps=396389, ups=0.8, wpb=495472, bsz=16203.2, num_updates=35100, lr=0.00033758, gnorm=0.177, clip=0, loss_scale=4, train_wall=100, gb_free=21.7, wall=32527 epoch 021: 1373 / 1689 loss=3.717, nll_loss=2.185, ppl=4.55, wps=396389, ups=0.8, wpb=495472, bsz=16203.2, num_updates=35100, lr=0.00033758, gnorm=0.177, clip=0, loss_scale=4, train_wall=100, gb_free=21.7, wall=32527 epoch 021: 1373 / 1689 loss=3.717, nll_loss=2.185, ppl=4.55, wps=396389, ups=0.8, wpb=495472, bsz=16203.2, num_updates=35100, lr=0.00033758, gnorm=0.177, clip=0, loss_scale=4, train_wall=100, gb_free=21.7, wall=32527 epoch 021: 1373 / 1689 loss=3.717, nll_loss=2.185, ppl=4.55, wps=396389, ups=0.8, wpb=495472, bsz=16203.2, num_updates=35100, lr=0.00033758, gnorm=0.177, clip=0, loss_scale=4, train_wall=100, gb_free=21.7, wall=32527 epoch 021: 1373 / 1689 loss=3.717, nll_loss=2.185, ppl=4.55, wps=396389, ups=0.8, wpb=495472, bsz=16203.2, num_updates=35100, lr=0.00033758, gnorm=0.177, clip=0, loss_scale=4, train_wall=100, gb_free=21.7, wall=32527 epoch 021: 1373 / 1689 loss=3.717, nll_loss=2.185, ppl=4.55, wps=396389, ups=0.8, wpb=495472, bsz=16203.2, num_updates=35100, lr=0.00033758, gnorm=0.177, clip=0, loss_scale=4, train_wall=100, gb_free=21.7, wall=32527 epoch 021: 1373 / 1689 loss=3.717, nll_loss=2.185, ppl=4.55, wps=396389, ups=0.8, wpb=495472, bsz=16203.2, num_updates=35100, lr=0.00033758, gnorm=0.177, clip=0, loss_scale=4, train_wall=100, gb_free=21.7, wall=32527 epoch 021: 1473 / 1689 loss=3.715, nll_loss=2.183, ppl=4.54, wps=555415, ups=1.12, wpb=497399, bsz=16325.3, num_updates=35200, lr=0.0003371, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=32617 epoch 021: 1473 / 1689 loss=3.715, nll_loss=2.183, ppl=4.54, wps=555415, ups=1.12, wpb=497399, bsz=16325.3, num_updates=35200, lr=0.0003371, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=32617 epoch 021: 1473 / 1689 loss=3.715, nll_loss=2.183, ppl=4.54, wps=555415, ups=1.12, wpb=497399, bsz=16325.3, num_updates=35200, lr=0.0003371, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=32617 epoch 021: 1473 / 1689 loss=3.715, nll_loss=2.183, ppl=4.54, wps=555415, ups=1.12, wpb=497399, bsz=16325.3, num_updates=35200, lr=0.0003371, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=32617 epoch 021: 1473 / 1689 loss=3.715, nll_loss=2.183, ppl=4.54, wps=555415, ups=1.12, wpb=497399, bsz=16325.3, num_updates=35200, lr=0.0003371, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=32617 epoch 021: 1473 / 1689 loss=3.715, nll_loss=2.183, ppl=4.54, wps=555415, ups=1.12, wpb=497399, bsz=16325.3, num_updates=35200, lr=0.0003371, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=32617 epoch 021: 1473 / 1689 loss=3.715, nll_loss=2.183, ppl=4.54, wps=555415, ups=1.12, wpb=497399, bsz=16325.3, num_updates=35200, lr=0.0003371, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=32617 epoch 021: 1473 / 1689 loss=3.715, nll_loss=2.183, ppl=4.54, wps=555415, ups=1.12, wpb=497399, bsz=16325.3, num_updates=35200, lr=0.0003371, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=32617 epoch 021: 1473 / 1689 loss=3.715, nll_loss=2.183, ppl=4.54, wps=555415, ups=1.12, wpb=497399, bsz=16325.3, num_updates=35200, lr=0.0003371, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=32617 epoch 021: 1473 / 1689 loss=3.715, nll_loss=2.183, ppl=4.54, wps=555415, ups=1.12, wpb=497399, bsz=16325.3, num_updates=35200, lr=0.0003371, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=32617 epoch 021: 1473 / 1689 loss=3.715, nll_loss=2.183, ppl=4.54, wps=555415, ups=1.12, wpb=497399, bsz=16325.3, num_updates=35200, lr=0.0003371, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=32617 epoch 021: 1473 / 1689 loss=3.715, nll_loss=2.183, ppl=4.54, wps=555415, ups=1.12, wpb=497399, bsz=16325.3, num_updates=35200, lr=0.0003371, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=32617 epoch 021: 1473 / 1689 loss=3.715, nll_loss=2.183, ppl=4.54, wps=555415, ups=1.12, wpb=497399, bsz=16325.3, num_updates=35200, lr=0.0003371, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=32617 epoch 021: 1473 / 1689 loss=3.715, nll_loss=2.183, ppl=4.54, wps=555415, ups=1.12, wpb=497399, bsz=16325.3, num_updates=35200, lr=0.0003371, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=32617 epoch 021: 1473 / 1689 loss=3.715, nll_loss=2.183, ppl=4.54, wps=555415, ups=1.12, wpb=497399, bsz=16325.3, num_updates=35200, lr=0.0003371, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=32617 epoch 021: 1473 / 1689 loss=3.715, nll_loss=2.183, ppl=4.54, wps=555415, ups=1.12, wpb=497399, bsz=16325.3, num_updates=35200, lr=0.0003371, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=32617 epoch 021: 1473 / 1689 loss=3.715, nll_loss=2.183, ppl=4.54, wps=555415, ups=1.12, wpb=497399, bsz=16325.3, num_updates=35200, lr=0.0003371, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=32617 epoch 021: 1473 / 1689 loss=3.715, nll_loss=2.183, ppl=4.54, wps=555415, ups=1.12, wpb=497399, bsz=16325.3, num_updates=35200, lr=0.0003371, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=32617 epoch 021: 1473 / 1689 loss=3.715, nll_loss=2.183, ppl=4.54, wps=555415, ups=1.12, wpb=497399, bsz=16325.3, num_updates=35200, lr=0.0003371, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=32617 epoch 021: 1473 / 1689 loss=3.715, nll_loss=2.183, ppl=4.54, wps=555415, ups=1.12, wpb=497399, bsz=16325.3, num_updates=35200, lr=0.0003371, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=32617 epoch 021: 1473 / 1689 loss=3.715, nll_loss=2.183, ppl=4.54, wps=555415, ups=1.12, wpb=497399, bsz=16325.3, num_updates=35200, lr=0.0003371, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=32617 epoch 021: 1573 / 1689 loss=3.722, nll_loss=2.19, ppl=4.56, wps=552035, ups=1.11, wpb=496386, bsz=16494.9, num_updates=35300, lr=0.000336622, gnorm=0.178, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=32707 epoch 021: 1573 / 1689 loss=3.722, nll_loss=2.19, ppl=4.56, wps=552035, ups=1.11, wpb=496386, bsz=16494.9, num_updates=35300, lr=0.000336622, gnorm=0.178, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=32707 epoch 021: 1573 / 1689 loss=3.722, nll_loss=2.19, ppl=4.56, wps=552035, ups=1.11, wpb=496386, bsz=16494.9, num_updates=35300, lr=0.000336622, gnorm=0.178, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=32707 epoch 021: 1573 / 1689 loss=3.722, nll_loss=2.19, ppl=4.56, wps=552035, ups=1.11, wpb=496386, bsz=16494.9, num_updates=35300, lr=0.000336622, gnorm=0.178, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=32707 epoch 021: 1573 / 1689 loss=3.722, nll_loss=2.19, ppl=4.56, wps=552035, ups=1.11, wpb=496386, bsz=16494.9, num_updates=35300, lr=0.000336622, gnorm=0.178, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=32707 epoch 021: 1573 / 1689 loss=3.722, nll_loss=2.19, ppl=4.56, wps=552035, ups=1.11, wpb=496386, bsz=16494.9, num_updates=35300, lr=0.000336622, gnorm=0.178, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=32707 epoch 021: 1573 / 1689 loss=3.722, nll_loss=2.19, ppl=4.56, wps=552035, ups=1.11, wpb=496386, bsz=16494.9, num_updates=35300, lr=0.000336622, gnorm=0.178, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=32707 epoch 021: 1573 / 1689 loss=3.722, nll_loss=2.19, ppl=4.56, wps=552035, ups=1.11, wpb=496386, bsz=16494.9, num_updates=35300, lr=0.000336622, gnorm=0.178, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=32707 epoch 021: 1573 / 1689 loss=3.722, nll_loss=2.19, ppl=4.56, wps=552035, ups=1.11, wpb=496386, bsz=16494.9, num_updates=35300, lr=0.000336622, gnorm=0.178, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=32707 epoch 021: 1573 / 1689 loss=3.722, nll_loss=2.19, ppl=4.56, wps=552035, ups=1.11, wpb=496386, bsz=16494.9, num_updates=35300, lr=0.000336622, gnorm=0.178, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=32707 epoch 021: 1573 / 1689 loss=3.722, nll_loss=2.19, ppl=4.56, wps=552035, ups=1.11, wpb=496386, bsz=16494.9, num_updates=35300, lr=0.000336622, gnorm=0.178, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=32707 epoch 021: 1573 / 1689 loss=3.722, nll_loss=2.19, ppl=4.56, wps=552035, ups=1.11, wpb=496386, bsz=16494.9, num_updates=35300, lr=0.000336622, gnorm=0.178, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=32707 epoch 021: 1573 / 1689 loss=3.722, nll_loss=2.19, ppl=4.56, wps=552035, ups=1.11, wpb=496386, bsz=16494.9, num_updates=35300, lr=0.000336622, gnorm=0.178, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=32707 epoch 021: 1573 / 1689 loss=3.722, nll_loss=2.19, ppl=4.56, wps=552035, ups=1.11, wpb=496386, bsz=16494.9, num_updates=35300, lr=0.000336622, gnorm=0.178, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=32707 epoch 021: 1573 / 1689 loss=3.722, nll_loss=2.19, ppl=4.56, wps=552035, ups=1.11, wpb=496386, bsz=16494.9, num_updates=35300, lr=0.000336622, gnorm=0.178, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=32707 epoch 021: 1573 / 1689 loss=3.722, nll_loss=2.19, ppl=4.56, wps=552035, ups=1.11, wpb=496386, bsz=16494.9, num_updates=35300, lr=0.000336622, gnorm=0.178, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=32707 epoch 021: 1573 / 1689 loss=3.722, nll_loss=2.19, ppl=4.56, wps=552035, ups=1.11, wpb=496386, bsz=16494.9, num_updates=35300, lr=0.000336622, gnorm=0.178, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=32707 epoch 021: 1573 / 1689 loss=3.722, nll_loss=2.19, ppl=4.56, wps=552035, ups=1.11, wpb=496386, bsz=16494.9, num_updates=35300, lr=0.000336622, gnorm=0.178, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=32707 epoch 021: 1573 / 1689 loss=3.722, nll_loss=2.19, ppl=4.56, wps=552035, ups=1.11, wpb=496386, bsz=16494.9, num_updates=35300, lr=0.000336622, gnorm=0.178, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=32707 epoch 021: 1573 / 1689 loss=3.722, nll_loss=2.19, ppl=4.56, wps=552035, ups=1.11, wpb=496386, bsz=16494.9, num_updates=35300, lr=0.000336622, gnorm=0.178, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=32707 epoch 021: 1573 / 1689 loss=3.722, nll_loss=2.19, ppl=4.56, wps=552035, ups=1.11, wpb=496386, bsz=16494.9, num_updates=35300, lr=0.000336622, gnorm=0.178, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=32707 epoch 021: 1674 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=549961, ups=1.11, wpb=494526, bsz=16490.2, num_updates=35400, lr=0.000336146, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=22.5, wall=32797 epoch 021: 1674 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=549961, ups=1.11, wpb=494526, bsz=16490.2, num_updates=35400, lr=0.000336146, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=22.5, wall=32797 epoch 021: 1674 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=549961, ups=1.11, wpb=494526, bsz=16490.2, num_updates=35400, lr=0.000336146, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=22.5, wall=32797 epoch 021: 1674 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=549961, ups=1.11, wpb=494526, bsz=16490.2, num_updates=35400, lr=0.000336146, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=22.5, wall=32797 epoch 021: 1674 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=549961, ups=1.11, wpb=494526, bsz=16490.2, num_updates=35400, lr=0.000336146, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=22.5, wall=32797 epoch 021: 1674 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=549961, ups=1.11, wpb=494526, bsz=16490.2, num_updates=35400, lr=0.000336146, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=22.5, wall=32797 epoch 021: 1674 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=549961, ups=1.11, wpb=494526, bsz=16490.2, num_updates=35400, lr=0.000336146, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=22.5, wall=32797 epoch 021: 1674 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=549961, ups=1.11, wpb=494526, bsz=16490.2, num_updates=35400, lr=0.000336146, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=22.5, wall=32797 epoch 021: 1674 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=549961, ups=1.11, wpb=494526, bsz=16490.2, num_updates=35400, lr=0.000336146, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=22.5, wall=32797 epoch 021: 1674 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=549961, ups=1.11, wpb=494526, bsz=16490.2, num_updates=35400, lr=0.000336146, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=22.5, wall=32797 epoch 021: 1674 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=549961, ups=1.11, wpb=494526, bsz=16490.2, num_updates=35400, lr=0.000336146, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=22.5, wall=32797 epoch 021: 1674 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=549961, ups=1.11, wpb=494526, bsz=16490.2, num_updates=35400, lr=0.000336146, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=22.5, wall=32797 epoch 021: 1674 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=549961, ups=1.11, wpb=494526, bsz=16490.2, num_updates=35400, lr=0.000336146, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=22.5, wall=32797 epoch 021: 1674 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=549961, ups=1.11, wpb=494526, bsz=16490.2, num_updates=35400, lr=0.000336146, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=22.5, wall=32797 epoch 021: 1674 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=549961, ups=1.11, wpb=494526, bsz=16490.2, num_updates=35400, lr=0.000336146, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=22.5, wall=32797 epoch 021: 1674 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=549961, ups=1.11, wpb=494526, bsz=16490.2, num_updates=35400, lr=0.000336146, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=22.5, wall=32797 epoch 021: 1674 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=549961, ups=1.11, wpb=494526, bsz=16490.2, num_updates=35400, lr=0.000336146, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=22.5, wall=32797 epoch 021: 1674 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=549961, ups=1.11, wpb=494526, bsz=16490.2, num_updates=35400, lr=0.000336146, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=22.5, wall=32797 epoch 021: 1674 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=549961, ups=1.11, wpb=494526, bsz=16490.2, num_updates=35400, lr=0.000336146, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=22.5, wall=32797 epoch 021: 1674 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=549961, ups=1.11, wpb=494526, bsz=16490.2, num_updates=35400, lr=0.000336146, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=22.5, wall=32797 epoch 021: 1674 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=549961, ups=1.11, wpb=494526, bsz=16490.2, num_updates=35400, lr=0.000336146, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=22.5, wall=32797 end of epoch 21 (average epoch stats below) epoch 021 | loss 3.713 | nll_loss 2.18 | ppl 4.53 | wps 529279 | ups 1.07 | wpb 495106 | bsz 16506.2 | num_updates 35415 | lr 0.000336075 | gnorm 0.174 | clip 0 | loss_scale 2 | train_wall 1505 | gb_free 23.8 | wall 32810 epoch 021 | loss 3.713 | nll_loss 2.18 | ppl 4.53 | wps 529279 | ups 1.07 | wpb 495106 | bsz 16506.2 | num_updates 35415 | lr 0.000336075 | gnorm 0.174 | clip 0 | loss_scale 2 | train_wall 1505 | gb_free 23.8 | wall 32810 epoch 021 | loss 3.713 | nll_loss 2.18 | ppl 4.53 | wps 529279 | ups 1.07 | wpb 495106 | bsz 16506.2 | num_updates 35415 | lr 0.000336075 | gnorm 0.174 | clip 0 | loss_scale 2 | train_wall 1505 | gb_free 23.8 | wall 32810 epoch 021 | loss 3.713 | nll_loss 2.18 | ppl 4.53 | wps 529279 | ups 1.07 | wpb 495106 | bsz 16506.2 | num_updates 35415 | lr 0.000336075 | gnorm 0.174 | clip 0 | loss_scale 2 | train_wall 1505 | gb_free 23.8 | wall 32810 epoch 021 | loss 3.713 | nll_loss 2.18 | ppl 4.53 | wps 529279 | ups 1.07 | wpb 495106 | bsz 16506.2 | num_updates 35415 | lr 0.000336075 | gnorm 0.174 | clip 0 | loss_scale 2 | train_wall 1505 | gb_free 23.8 | wall 32810 epoch 021 | loss 3.713 | nll_loss 2.18 | ppl 4.53 | wps 529279 | ups 1.07 | wpb 495106 | bsz 16506.2 | num_updates 35415 | lr 0.000336075 | gnorm 0.174 | clip 0 | loss_scale 2 | train_wall 1505 | gb_free 23.8 | wall 32810 epoch 021 | loss 3.713 | nll_loss 2.18 | ppl 4.53 | wps 529279 | ups 1.07 | wpb 495106 | bsz 16506.2 | num_updates 35415 | lr 0.000336075 | gnorm 0.174 | clip 0 | loss_scale 2 | train_wall 1505 | gb_free 23.8 | wall 32810 epoch 021 | loss 3.713 | nll_loss 2.18 | ppl 4.53 | wps 529279 | ups 1.07 | wpb 495106 | bsz 16506.2 | num_updates 35415 | lr 0.000336075 | gnorm 0.174 | clip 0 | loss_scale 2 | train_wall 1505 | gb_free 23.8 | wall 32810 epoch 021 | loss 3.713 | nll_loss 2.18 | ppl 4.53 | wps 529279 | ups 1.07 | wpb 495106 | bsz 16506.2 | num_updates 35415 | lr 0.000336075 | gnorm 0.174 | clip 0 | loss_scale 2 | train_wall 1505 | gb_free 23.8 | wall 32810 epoch 021 | loss 3.713 | nll_loss 2.18 | ppl 4.53 | wps 529279 | ups 1.07 | wpb 495106 | bsz 16506.2 | num_updates 35415 | lr 0.000336075 | gnorm 0.174 | clip 0 | loss_scale 2 | train_wall 1505 | gb_free 23.8 | wall 32810 epoch 021 | loss 3.713 | nll_loss 2.18 | ppl 4.53 | wps 529279 | ups 1.07 | wpb 495106 | bsz 16506.2 | num_updates 35415 | lr 0.000336075 | gnorm 0.174 | clip 0 | loss_scale 2 | train_wall 1505 | gb_free 23.8 | wall 32810 epoch 021 | loss 3.713 | nll_loss 2.18 | ppl 4.53 | wps 529279 | ups 1.07 | wpb 495106 | bsz 16506.2 | num_updates 35415 | lr 0.000336075 | gnorm 0.174 | clip 0 | loss_scale 2 | train_wall 1505 | gb_free 23.8 | wall 32810 epoch 021 | loss 3.713 | nll_loss 2.18 | ppl 4.53 | wps 529279 | ups 1.07 | wpb 495106 | bsz 16506.2 | num_updates 35415 | lr 0.000336075 | gnorm 0.174 | clip 0 | loss_scale 2 | train_wall 1505 | gb_free 23.8 | wall 32810 epoch 021 | loss 3.713 | nll_loss 2.18 | ppl 4.53 | wps 529279 | ups 1.07 | wpb 495106 | bsz 16506.2 | num_updates 35415 | lr 0.000336075 | gnorm 0.174 | clip 0 | loss_scale 2 | train_wall 1505 | gb_free 23.8 | wall 32810 epoch 021 | loss 3.713 | nll_loss 2.18 | ppl 4.53 | wps 529279 | ups 1.07 | wpb 495106 | bsz 16506.2 | num_updates 35415 | lr 0.000336075 | gnorm 0.174 | clip 0 | loss_scale 2 | train_wall 1505 | gb_free 23.8 | wall 32810 epoch 021 | loss 3.713 | nll_loss 2.18 | ppl 4.53 | wps 529279 | ups 1.07 | wpb 495106 | bsz 16506.2 | num_updates 35415 | lr 0.000336075 | gnorm 0.174 | clip 0 | loss_scale 2 | train_wall 1505 | gb_free 23.8 | wall 32810 epoch 021 | loss 3.713 | nll_loss 2.18 | ppl 4.53 | wps 529279 | ups 1.07 | wpb 495106 | bsz 16506.2 | num_updates 35415 | lr 0.000336075 | gnorm 0.174 | clip 0 | loss_scale 2 | train_wall 1505 | gb_free 23.8 | wall 32810 epoch 021 | loss 3.713 | nll_loss 2.18 | ppl 4.53 | wps 529279 | ups 1.07 | wpb 495106 | bsz 16506.2 | num_updates 35415 | lr 0.000336075 | gnorm 0.174 | clip 0 | loss_scale 2 | train_wall 1505 | gb_free 23.8 | wall 32810 epoch 021 | loss 3.713 | nll_loss 2.18 | ppl 4.53 | wps 529279 | ups 1.07 | wpb 495106 | bsz 16506.2 | num_updates 35415 | lr 0.000336075 | gnorm 0.174 | clip 0 | loss_scale 2 | train_wall 1505 | gb_free 23.8 | wall 32810 epoch 021 | loss 3.713 | nll_loss 2.18 | ppl 4.53 | wps 529279 | ups 1.07 | wpb 495106 | bsz 16506.2 | num_updates 35415 | lr 0.000336075 | gnorm 0.174 | clip 0 | loss_scale 2 | train_wall 1505 | gb_free 23.8 | wall 32810 epoch 021 | loss 3.713 | nll_loss 2.18 | ppl 4.53 | wps 529279 | ups 1.07 | wpb 495106 | bsz 16506.2 | num_updates 35415 | lr 0.000336075 | gnorm 0.174 | clip 0 | loss_scale 2 | train_wall 1505 | gb_free 23.8 | wall 32810 Start iterating over samples epoch 022: 85 / 1689 loss=3.696, nll_loss=2.16, ppl=4.47, wps=550484, ups=1.12, wpb=491104, bsz=16607.2, num_updates=35500, lr=0.000335673, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=32886 epoch 022: 85 / 1689 loss=3.696, nll_loss=2.16, ppl=4.47, wps=550484, ups=1.12, wpb=491104, bsz=16607.2, num_updates=35500, lr=0.000335673, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=32886 epoch 022: 85 / 1689 loss=3.696, nll_loss=2.16, ppl=4.47, wps=550484, ups=1.12, wpb=491104, bsz=16607.2, num_updates=35500, lr=0.000335673, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=32886 epoch 022: 85 / 1689 loss=3.696, nll_loss=2.16, ppl=4.47, wps=550484, ups=1.12, wpb=491104, bsz=16607.2, num_updates=35500, lr=0.000335673, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=32886 epoch 022: 85 / 1689 loss=3.696, nll_loss=2.16, ppl=4.47, wps=550484, ups=1.12, wpb=491104, bsz=16607.2, num_updates=35500, lr=0.000335673, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=32886 epoch 022: 85 / 1689 loss=3.696, nll_loss=2.16, ppl=4.47, wps=550484, ups=1.12, wpb=491104, bsz=16607.2, num_updates=35500, lr=0.000335673, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=32886 epoch 022: 85 / 1689 loss=3.696, nll_loss=2.16, ppl=4.47, wps=550484, ups=1.12, wpb=491104, bsz=16607.2, num_updates=35500, lr=0.000335673, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=32886 epoch 022: 85 / 1689 loss=3.696, nll_loss=2.16, ppl=4.47, wps=550484, ups=1.12, wpb=491104, bsz=16607.2, num_updates=35500, lr=0.000335673, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=32886 epoch 022: 85 / 1689 loss=3.696, nll_loss=2.16, ppl=4.47, wps=550484, ups=1.12, wpb=491104, bsz=16607.2, num_updates=35500, lr=0.000335673, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=32886 epoch 022: 85 / 1689 loss=3.696, nll_loss=2.16, ppl=4.47, wps=550484, ups=1.12, wpb=491104, bsz=16607.2, num_updates=35500, lr=0.000335673, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=32886 epoch 022: 85 / 1689 loss=3.696, nll_loss=2.16, ppl=4.47, wps=550484, ups=1.12, wpb=491104, bsz=16607.2, num_updates=35500, lr=0.000335673, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=32886 epoch 022: 85 / 1689 loss=3.696, nll_loss=2.16, ppl=4.47, wps=550484, ups=1.12, wpb=491104, bsz=16607.2, num_updates=35500, lr=0.000335673, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=32886 epoch 022: 85 / 1689 loss=3.696, nll_loss=2.16, ppl=4.47, wps=550484, ups=1.12, wpb=491104, bsz=16607.2, num_updates=35500, lr=0.000335673, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=32886 epoch 022: 85 / 1689 loss=3.696, nll_loss=2.16, ppl=4.47, wps=550484, ups=1.12, wpb=491104, bsz=16607.2, num_updates=35500, lr=0.000335673, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=32886 epoch 022: 85 / 1689 loss=3.696, nll_loss=2.16, ppl=4.47, wps=550484, ups=1.12, wpb=491104, bsz=16607.2, num_updates=35500, lr=0.000335673, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=32886 epoch 022: 85 / 1689 loss=3.696, nll_loss=2.16, ppl=4.47, wps=550484, ups=1.12, wpb=491104, bsz=16607.2, num_updates=35500, lr=0.000335673, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=32886 epoch 022: 85 / 1689 loss=3.696, nll_loss=2.16, ppl=4.47, wps=550484, ups=1.12, wpb=491104, bsz=16607.2, num_updates=35500, lr=0.000335673, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=32886 epoch 022: 85 / 1689 loss=3.696, nll_loss=2.16, ppl=4.47, wps=550484, ups=1.12, wpb=491104, bsz=16607.2, num_updates=35500, lr=0.000335673, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=32886 epoch 022: 85 / 1689 loss=3.696, nll_loss=2.16, ppl=4.47, wps=550484, ups=1.12, wpb=491104, bsz=16607.2, num_updates=35500, lr=0.000335673, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=32886 epoch 022: 85 / 1689 loss=3.696, nll_loss=2.16, ppl=4.47, wps=550484, ups=1.12, wpb=491104, bsz=16607.2, num_updates=35500, lr=0.000335673, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=32886 epoch 022: 85 / 1689 loss=3.696, nll_loss=2.16, ppl=4.47, wps=550484, ups=1.12, wpb=491104, bsz=16607.2, num_updates=35500, lr=0.000335673, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=32886 epoch 022: 85 / 1689 loss=3.696, nll_loss=2.16, ppl=4.47, wps=550484, ups=1.12, wpb=491104, bsz=16607.2, num_updates=35500, lr=0.000335673, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=32886 epoch 022: 185 / 1689 loss=3.697, nll_loss=2.161, ppl=4.47, wps=547655, ups=1.11, wpb=493534, bsz=15971.9, num_updates=35600, lr=0.000335201, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=32976 epoch 022: 185 / 1689 loss=3.697, nll_loss=2.161, ppl=4.47, wps=547655, ups=1.11, wpb=493534, bsz=15971.9, num_updates=35600, lr=0.000335201, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=32976 epoch 022: 185 / 1689 loss=3.697, nll_loss=2.161, ppl=4.47, wps=547655, ups=1.11, wpb=493534, bsz=15971.9, num_updates=35600, lr=0.000335201, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=32976 epoch 022: 185 / 1689 loss=3.697, nll_loss=2.161, ppl=4.47, wps=547655, ups=1.11, wpb=493534, bsz=15971.9, num_updates=35600, lr=0.000335201, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=32976 epoch 022: 185 / 1689 loss=3.697, nll_loss=2.161, ppl=4.47, wps=547655, ups=1.11, wpb=493534, bsz=15971.9, num_updates=35600, lr=0.000335201, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=32976 epoch 022: 185 / 1689 loss=3.697, nll_loss=2.161, ppl=4.47, wps=547655, ups=1.11, wpb=493534, bsz=15971.9, num_updates=35600, lr=0.000335201, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=32976 epoch 022: 185 / 1689 loss=3.697, nll_loss=2.161, ppl=4.47, wps=547655, ups=1.11, wpb=493534, bsz=15971.9, num_updates=35600, lr=0.000335201, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=32976 epoch 022: 185 / 1689 loss=3.697, nll_loss=2.161, ppl=4.47, wps=547655, ups=1.11, wpb=493534, bsz=15971.9, num_updates=35600, lr=0.000335201, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=32976 epoch 022: 185 / 1689 loss=3.697, nll_loss=2.161, ppl=4.47, wps=547655, ups=1.11, wpb=493534, bsz=15971.9, num_updates=35600, lr=0.000335201, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=32976 epoch 022: 185 / 1689 loss=3.697, nll_loss=2.161, ppl=4.47, wps=547655, ups=1.11, wpb=493534, bsz=15971.9, num_updates=35600, lr=0.000335201, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=32976 epoch 022: 185 / 1689 loss=3.697, nll_loss=2.161, ppl=4.47, wps=547655, ups=1.11, wpb=493534, bsz=15971.9, num_updates=35600, lr=0.000335201, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=32976 epoch 022: 185 / 1689 loss=3.697, nll_loss=2.161, ppl=4.47, wps=547655, ups=1.11, wpb=493534, bsz=15971.9, num_updates=35600, lr=0.000335201, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=32976 epoch 022: 185 / 1689 loss=3.697, nll_loss=2.161, ppl=4.47, wps=547655, ups=1.11, wpb=493534, bsz=15971.9, num_updates=35600, lr=0.000335201, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=32976 epoch 022: 185 / 1689 loss=3.697, nll_loss=2.161, ppl=4.47, wps=547655, ups=1.11, wpb=493534, bsz=15971.9, num_updates=35600, lr=0.000335201, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=32976 epoch 022: 185 / 1689 loss=3.697, nll_loss=2.161, ppl=4.47, wps=547655, ups=1.11, wpb=493534, bsz=15971.9, num_updates=35600, lr=0.000335201, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=32976 epoch 022: 185 / 1689 loss=3.697, nll_loss=2.161, ppl=4.47, wps=547655, ups=1.11, wpb=493534, bsz=15971.9, num_updates=35600, lr=0.000335201, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=32976 epoch 022: 185 / 1689 loss=3.697, nll_loss=2.161, ppl=4.47, wps=547655, ups=1.11, wpb=493534, bsz=15971.9, num_updates=35600, lr=0.000335201, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=32976 epoch 022: 185 / 1689 loss=3.697, nll_loss=2.161, ppl=4.47, wps=547655, ups=1.11, wpb=493534, bsz=15971.9, num_updates=35600, lr=0.000335201, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=32976 epoch 022: 185 / 1689 loss=3.697, nll_loss=2.161, ppl=4.47, wps=547655, ups=1.11, wpb=493534, bsz=15971.9, num_updates=35600, lr=0.000335201, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=32976 epoch 022: 185 / 1689 loss=3.697, nll_loss=2.161, ppl=4.47, wps=547655, ups=1.11, wpb=493534, bsz=15971.9, num_updates=35600, lr=0.000335201, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=32976 epoch 022: 185 / 1689 loss=3.697, nll_loss=2.161, ppl=4.47, wps=547655, ups=1.11, wpb=493534, bsz=15971.9, num_updates=35600, lr=0.000335201, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=32976 epoch 022: 185 / 1689 loss=3.697, nll_loss=2.161, ppl=4.47, wps=547655, ups=1.11, wpb=493534, bsz=15971.9, num_updates=35600, lr=0.000335201, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=32976 epoch 022: 285 / 1689 loss=3.703, nll_loss=2.168, ppl=4.49, wps=546972, ups=1.11, wpb=493998, bsz=16699.6, num_updates=35700, lr=0.000334731, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=33066 epoch 022: 285 / 1689 loss=3.703, nll_loss=2.168, ppl=4.49, wps=546972, ups=1.11, wpb=493998, bsz=16699.6, num_updates=35700, lr=0.000334731, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=33066 epoch 022: 285 / 1689 loss=3.703, nll_loss=2.168, ppl=4.49, wps=546972, ups=1.11, wpb=493998, bsz=16699.6, num_updates=35700, lr=0.000334731, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=33066 epoch 022: 285 / 1689 loss=3.703, nll_loss=2.168, ppl=4.49, wps=546972, ups=1.11, wpb=493998, bsz=16699.6, num_updates=35700, lr=0.000334731, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=33066 epoch 022: 285 / 1689 loss=3.703, nll_loss=2.168, ppl=4.49, wps=546972, ups=1.11, wpb=493998, bsz=16699.6, num_updates=35700, lr=0.000334731, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=33066 epoch 022: 285 / 1689 loss=3.703, nll_loss=2.168, ppl=4.49, wps=546972, ups=1.11, wpb=493998, bsz=16699.6, num_updates=35700, lr=0.000334731, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=33066 epoch 022: 285 / 1689 loss=3.703, nll_loss=2.168, ppl=4.49, wps=546972, ups=1.11, wpb=493998, bsz=16699.6, num_updates=35700, lr=0.000334731, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=33066 epoch 022: 285 / 1689 loss=3.703, nll_loss=2.168, ppl=4.49, wps=546972, ups=1.11, wpb=493998, bsz=16699.6, num_updates=35700, lr=0.000334731, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=33066 epoch 022: 285 / 1689 loss=3.703, nll_loss=2.168, ppl=4.49, wps=546972, ups=1.11, wpb=493998, bsz=16699.6, num_updates=35700, lr=0.000334731, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=33066 epoch 022: 285 / 1689 loss=3.703, nll_loss=2.168, ppl=4.49, wps=546972, ups=1.11, wpb=493998, bsz=16699.6, num_updates=35700, lr=0.000334731, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=33066 epoch 022: 285 / 1689 loss=3.703, nll_loss=2.168, ppl=4.49, wps=546972, ups=1.11, wpb=493998, bsz=16699.6, num_updates=35700, lr=0.000334731, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=33066 epoch 022: 285 / 1689 loss=3.703, nll_loss=2.168, ppl=4.49, wps=546972, ups=1.11, wpb=493998, bsz=16699.6, num_updates=35700, lr=0.000334731, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=33066 epoch 022: 285 / 1689 loss=3.703, nll_loss=2.168, ppl=4.49, wps=546972, ups=1.11, wpb=493998, bsz=16699.6, num_updates=35700, lr=0.000334731, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=33066 epoch 022: 285 / 1689 loss=3.703, nll_loss=2.168, ppl=4.49, wps=546972, ups=1.11, wpb=493998, bsz=16699.6, num_updates=35700, lr=0.000334731, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=33066 epoch 022: 285 / 1689 loss=3.703, nll_loss=2.168, ppl=4.49, wps=546972, ups=1.11, wpb=493998, bsz=16699.6, num_updates=35700, lr=0.000334731, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=33066 epoch 022: 285 / 1689 loss=3.703, nll_loss=2.168, ppl=4.49, wps=546972, ups=1.11, wpb=493998, bsz=16699.6, num_updates=35700, lr=0.000334731, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=33066 epoch 022: 285 / 1689 loss=3.703, nll_loss=2.168, ppl=4.49, wps=546972, ups=1.11, wpb=493998, bsz=16699.6, num_updates=35700, lr=0.000334731, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=33066 epoch 022: 285 / 1689 loss=3.703, nll_loss=2.168, ppl=4.49, wps=546972, ups=1.11, wpb=493998, bsz=16699.6, num_updates=35700, lr=0.000334731, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=33066 epoch 022: 285 / 1689 loss=3.703, nll_loss=2.168, ppl=4.49, wps=546972, ups=1.11, wpb=493998, bsz=16699.6, num_updates=35700, lr=0.000334731, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=33066 epoch 022: 285 / 1689 loss=3.703, nll_loss=2.168, ppl=4.49, wps=546972, ups=1.11, wpb=493998, bsz=16699.6, num_updates=35700, lr=0.000334731, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=33066 epoch 022: 285 / 1689 loss=3.703, nll_loss=2.168, ppl=4.49, wps=546972, ups=1.11, wpb=493998, bsz=16699.6, num_updates=35700, lr=0.000334731, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=33066 epoch 022: 285 / 1689 loss=3.703, nll_loss=2.168, ppl=4.49, wps=546972, ups=1.11, wpb=493998, bsz=16699.6, num_updates=35700, lr=0.000334731, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=33066 epoch 022: 385 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=551669, ups=1.11, wpb=495910, bsz=16766.1, num_updates=35800, lr=0.000334263, gnorm=0.181, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=33156 epoch 022: 385 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=551669, ups=1.11, wpb=495910, bsz=16766.1, num_updates=35800, lr=0.000334263, gnorm=0.181, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=33156 epoch 022: 385 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=551669, ups=1.11, wpb=495910, bsz=16766.1, num_updates=35800, lr=0.000334263, gnorm=0.181, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=33156 epoch 022: 385 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=551669, ups=1.11, wpb=495910, bsz=16766.1, num_updates=35800, lr=0.000334263, gnorm=0.181, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=33156 epoch 022: 385 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=551669, ups=1.11, wpb=495910, bsz=16766.1, num_updates=35800, lr=0.000334263, gnorm=0.181, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=33156 epoch 022: 385 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=551669, ups=1.11, wpb=495910, bsz=16766.1, num_updates=35800, lr=0.000334263, gnorm=0.181, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=33156 epoch 022: 385 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=551669, ups=1.11, wpb=495910, bsz=16766.1, num_updates=35800, lr=0.000334263, gnorm=0.181, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=33156 epoch 022: 385 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=551669, ups=1.11, wpb=495910, bsz=16766.1, num_updates=35800, lr=0.000334263, gnorm=0.181, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=33156 epoch 022: 385 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=551669, ups=1.11, wpb=495910, bsz=16766.1, num_updates=35800, lr=0.000334263, gnorm=0.181, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=33156 epoch 022: 385 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=551669, ups=1.11, wpb=495910, bsz=16766.1, num_updates=35800, lr=0.000334263, gnorm=0.181, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=33156 epoch 022: 385 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=551669, ups=1.11, wpb=495910, bsz=16766.1, num_updates=35800, lr=0.000334263, gnorm=0.181, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=33156 epoch 022: 385 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=551669, ups=1.11, wpb=495910, bsz=16766.1, num_updates=35800, lr=0.000334263, gnorm=0.181, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=33156 epoch 022: 385 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=551669, ups=1.11, wpb=495910, bsz=16766.1, num_updates=35800, lr=0.000334263, gnorm=0.181, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=33156 epoch 022: 385 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=551669, ups=1.11, wpb=495910, bsz=16766.1, num_updates=35800, lr=0.000334263, gnorm=0.181, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=33156 epoch 022: 385 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=551669, ups=1.11, wpb=495910, bsz=16766.1, num_updates=35800, lr=0.000334263, gnorm=0.181, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=33156 epoch 022: 385 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=551669, ups=1.11, wpb=495910, bsz=16766.1, num_updates=35800, lr=0.000334263, gnorm=0.181, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=33156 epoch 022: 385 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=551669, ups=1.11, wpb=495910, bsz=16766.1, num_updates=35800, lr=0.000334263, gnorm=0.181, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=33156 epoch 022: 385 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=551669, ups=1.11, wpb=495910, bsz=16766.1, num_updates=35800, lr=0.000334263, gnorm=0.181, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=33156 epoch 022: 385 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=551669, ups=1.11, wpb=495910, bsz=16766.1, num_updates=35800, lr=0.000334263, gnorm=0.181, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=33156 epoch 022: 385 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=551669, ups=1.11, wpb=495910, bsz=16766.1, num_updates=35800, lr=0.000334263, gnorm=0.181, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=33156 epoch 022: 385 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=551669, ups=1.11, wpb=495910, bsz=16766.1, num_updates=35800, lr=0.000334263, gnorm=0.181, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=33156 epoch 022: 385 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=551669, ups=1.11, wpb=495910, bsz=16766.1, num_updates=35800, lr=0.000334263, gnorm=0.181, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=33156 epoch 022: 485 / 1689 loss=3.7, nll_loss=2.165, ppl=4.49, wps=547264, ups=1.1, wpb=495846, bsz=16478.7, num_updates=35900, lr=0.000333797, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=33247 epoch 022: 485 / 1689 loss=3.7, nll_loss=2.165, ppl=4.49, wps=547264, ups=1.1, wpb=495846, bsz=16478.7, num_updates=35900, lr=0.000333797, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=33247 epoch 022: 485 / 1689 loss=3.7, nll_loss=2.165, ppl=4.49, wps=547264, ups=1.1, wpb=495846, bsz=16478.7, num_updates=35900, lr=0.000333797, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=33247 epoch 022: 485 / 1689 loss=3.7, nll_loss=2.165, ppl=4.49, wps=547264, ups=1.1, wpb=495846, bsz=16478.7, num_updates=35900, lr=0.000333797, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=33247 epoch 022: 485 / 1689 loss=3.7, nll_loss=2.165, ppl=4.49, wps=547264, ups=1.1, wpb=495846, bsz=16478.7, num_updates=35900, lr=0.000333797, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=33247 epoch 022: 485 / 1689 loss=3.7, nll_loss=2.165, ppl=4.49, wps=547264, ups=1.1, wpb=495846, bsz=16478.7, num_updates=35900, lr=0.000333797, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=33247 epoch 022: 485 / 1689 loss=3.7, nll_loss=2.165, ppl=4.49, wps=547264, ups=1.1, wpb=495846, bsz=16478.7, num_updates=35900, lr=0.000333797, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=33247 epoch 022: 485 / 1689 loss=3.7, nll_loss=2.165, ppl=4.49, wps=547264, ups=1.1, wpb=495846, bsz=16478.7, num_updates=35900, lr=0.000333797, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=33247 epoch 022: 485 / 1689 loss=3.7, nll_loss=2.165, ppl=4.49, wps=547264, ups=1.1, wpb=495846, bsz=16478.7, num_updates=35900, lr=0.000333797, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=33247 epoch 022: 485 / 1689 loss=3.7, nll_loss=2.165, ppl=4.49, wps=547264, ups=1.1, wpb=495846, bsz=16478.7, num_updates=35900, lr=0.000333797, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=33247 epoch 022: 485 / 1689 loss=3.7, nll_loss=2.165, ppl=4.49, wps=547264, ups=1.1, wpb=495846, bsz=16478.7, num_updates=35900, lr=0.000333797, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=33247 epoch 022: 485 / 1689 loss=3.7, nll_loss=2.165, ppl=4.49, wps=547264, ups=1.1, wpb=495846, bsz=16478.7, num_updates=35900, lr=0.000333797, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=33247 epoch 022: 485 / 1689 loss=3.7, nll_loss=2.165, ppl=4.49, wps=547264, ups=1.1, wpb=495846, bsz=16478.7, num_updates=35900, lr=0.000333797, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=33247 epoch 022: 485 / 1689 loss=3.7, nll_loss=2.165, ppl=4.49, wps=547264, ups=1.1, wpb=495846, bsz=16478.7, num_updates=35900, lr=0.000333797, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=33247 epoch 022: 485 / 1689 loss=3.7, nll_loss=2.165, ppl=4.49, wps=547264, ups=1.1, wpb=495846, bsz=16478.7, num_updates=35900, lr=0.000333797, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=33247 epoch 022: 485 / 1689 loss=3.7, nll_loss=2.165, ppl=4.49, wps=547264, ups=1.1, wpb=495846, bsz=16478.7, num_updates=35900, lr=0.000333797, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=33247 epoch 022: 485 / 1689 loss=3.7, nll_loss=2.165, ppl=4.49, wps=547264, ups=1.1, wpb=495846, bsz=16478.7, num_updates=35900, lr=0.000333797, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=33247 epoch 022: 485 / 1689 loss=3.7, nll_loss=2.165, ppl=4.49, wps=547264, ups=1.1, wpb=495846, bsz=16478.7, num_updates=35900, lr=0.000333797, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=33247 epoch 022: 485 / 1689 loss=3.7, nll_loss=2.165, ppl=4.49, wps=547264, ups=1.1, wpb=495846, bsz=16478.7, num_updates=35900, lr=0.000333797, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=33247 epoch 022: 485 / 1689 loss=3.7, nll_loss=2.165, ppl=4.49, wps=547264, ups=1.1, wpb=495846, bsz=16478.7, num_updates=35900, lr=0.000333797, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=33247 epoch 022: 485 / 1689 loss=3.7, nll_loss=2.165, ppl=4.49, wps=547264, ups=1.1, wpb=495846, bsz=16478.7, num_updates=35900, lr=0.000333797, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=33247 epoch 022: 485 / 1689 loss=3.7, nll_loss=2.165, ppl=4.49, wps=547264, ups=1.1, wpb=495846, bsz=16478.7, num_updates=35900, lr=0.000333797, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=33247 epoch 022: 586 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=544970, ups=1.1, wpb=495377, bsz=16717.7, num_updates=36000, lr=0.000333333, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=33338 epoch 022: 586 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=544970, ups=1.1, wpb=495377, bsz=16717.7, num_updates=36000, lr=0.000333333, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=33338 epoch 022: 586 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=544970, ups=1.1, wpb=495377, bsz=16717.7, num_updates=36000, lr=0.000333333, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=33338 epoch 022: 586 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=544970, ups=1.1, wpb=495377, bsz=16717.7, num_updates=36000, lr=0.000333333, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=33338 epoch 022: 586 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=544970, ups=1.1, wpb=495377, bsz=16717.7, num_updates=36000, lr=0.000333333, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=33338 epoch 022: 586 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=544970, ups=1.1, wpb=495377, bsz=16717.7, num_updates=36000, lr=0.000333333, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=33338 epoch 022: 586 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=544970, ups=1.1, wpb=495377, bsz=16717.7, num_updates=36000, lr=0.000333333, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=33338 epoch 022: 586 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=544970, ups=1.1, wpb=495377, bsz=16717.7, num_updates=36000, lr=0.000333333, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=33338 epoch 022: 586 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=544970, ups=1.1, wpb=495377, bsz=16717.7, num_updates=36000, lr=0.000333333, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=33338 epoch 022: 586 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=544970, ups=1.1, wpb=495377, bsz=16717.7, num_updates=36000, lr=0.000333333, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=33338 epoch 022: 586 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=544970, ups=1.1, wpb=495377, bsz=16717.7, num_updates=36000, lr=0.000333333, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=33338 epoch 022: 586 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=544970, ups=1.1, wpb=495377, bsz=16717.7, num_updates=36000, lr=0.000333333, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=33338 epoch 022: 586 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=544970, ups=1.1, wpb=495377, bsz=16717.7, num_updates=36000, lr=0.000333333, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=33338 epoch 022: 586 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=544970, ups=1.1, wpb=495377, bsz=16717.7, num_updates=36000, lr=0.000333333, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=33338 epoch 022: 586 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=544970, ups=1.1, wpb=495377, bsz=16717.7, num_updates=36000, lr=0.000333333, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=33338 epoch 022: 586 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=544970, ups=1.1, wpb=495377, bsz=16717.7, num_updates=36000, lr=0.000333333, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=33338 epoch 022: 586 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=544970, ups=1.1, wpb=495377, bsz=16717.7, num_updates=36000, lr=0.000333333, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=33338 epoch 022: 586 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=544970, ups=1.1, wpb=495377, bsz=16717.7, num_updates=36000, lr=0.000333333, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=33338 epoch 022: 586 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=544970, ups=1.1, wpb=495377, bsz=16717.7, num_updates=36000, lr=0.000333333, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=33338 epoch 022: 586 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=544970, ups=1.1, wpb=495377, bsz=16717.7, num_updates=36000, lr=0.000333333, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=33338 epoch 022: 586 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=544970, ups=1.1, wpb=495377, bsz=16717.7, num_updates=36000, lr=0.000333333, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=33338 epoch 022: 586 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=544970, ups=1.1, wpb=495377, bsz=16717.7, num_updates=36000, lr=0.000333333, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=33338 begin validation on "valid" subset epoch 022 | valid on 'valid' subset | loss 3.728 | nll_loss 2.165 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.726 epoch 022 | valid on 'valid' subset | loss 3.728 | nll_loss 2.165 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.726 epoch 022 | valid on 'valid' subset | loss 3.728 | nll_loss 2.165 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.726 epoch 022 | valid on 'valid' subset | loss 3.728 | nll_loss 2.165 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.726 epoch 022 | valid on 'valid' subset | loss 3.728 | nll_loss 2.165 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.726 epoch 022 | valid on 'valid' subset | loss 3.728 | nll_loss 2.165 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.726 epoch 022 | valid on 'valid' subset | loss 3.728 | nll_loss 2.165 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.726 epoch 022 | valid on 'valid' subset | loss 3.728 | nll_loss 2.165 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.726 epoch 022 | valid on 'valid' subset | loss 3.728 | nll_loss 2.165 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.726 epoch 022 | valid on 'valid' subset | loss 3.728 | nll_loss 2.165 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.726 epoch 022 | valid on 'valid' subset | loss 3.728 | nll_loss 2.165 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.726 epoch 022 | valid on 'valid' subset | loss 3.728 | nll_loss 2.165 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.726 epoch 022 | valid on 'valid' subset | loss 3.728 | nll_loss 2.165 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.726 epoch 022 | valid on 'valid' subset | loss 3.728 | nll_loss 2.165 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.726 epoch 022 | valid on 'valid' subset | loss 3.728 | nll_loss 2.165 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.726 epoch 022 | valid on 'valid' subset | loss 3.728 | nll_loss 2.165 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.726 epoch 022 | valid on 'valid' subset | loss 3.728 | nll_loss 2.165 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.726 epoch 022 | valid on 'valid' subset | loss 3.728 | nll_loss 2.165 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.726 epoch 022 | valid on 'valid' subset | loss 3.728 | nll_loss 2.165 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.726 epoch 022 | valid on 'valid' subset | loss 3.728 | nll_loss 2.165 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.726 epoch 022 | valid on 'valid' subset | loss 3.728 | nll_loss 2.165 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.726 epoch 022 | valid on 'valid' subset | loss 3.728 | nll_loss 2.165 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 36000 | best_loss 3.726 epoch 022: 686 / 1689 loss=3.708, nll_loss=2.174, ppl=4.51, wps=488514, ups=0.98, wpb=496002, bsz=16479.2, num_updates=36100, lr=0.000332871, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=33439 epoch 022: 686 / 1689 loss=3.708, nll_loss=2.174, ppl=4.51, wps=488514, ups=0.98, wpb=496002, bsz=16479.2, num_updates=36100, lr=0.000332871, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=33439 epoch 022: 686 / 1689 loss=3.708, nll_loss=2.174, ppl=4.51, wps=488514, ups=0.98, wpb=496002, bsz=16479.2, num_updates=36100, lr=0.000332871, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=33439 epoch 022: 686 / 1689 loss=3.708, nll_loss=2.174, ppl=4.51, wps=488514, ups=0.98, wpb=496002, bsz=16479.2, num_updates=36100, lr=0.000332871, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=33439 epoch 022: 686 / 1689 loss=3.708, nll_loss=2.174, ppl=4.51, wps=488514, ups=0.98, wpb=496002, bsz=16479.2, num_updates=36100, lr=0.000332871, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=33439 epoch 022: 686 / 1689 loss=3.708, nll_loss=2.174, ppl=4.51, wps=488514, ups=0.98, wpb=496002, bsz=16479.2, num_updates=36100, lr=0.000332871, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=33439 epoch 022: 686 / 1689 loss=3.708, nll_loss=2.174, ppl=4.51, wps=488514, ups=0.98, wpb=496002, bsz=16479.2, num_updates=36100, lr=0.000332871, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=33439 epoch 022: 686 / 1689 loss=3.708, nll_loss=2.174, ppl=4.51, wps=488514, ups=0.98, wpb=496002, bsz=16479.2, num_updates=36100, lr=0.000332871, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=33439 epoch 022: 686 / 1689 loss=3.708, nll_loss=2.174, ppl=4.51, wps=488514, ups=0.98, wpb=496002, bsz=16479.2, num_updates=36100, lr=0.000332871, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=33439 epoch 022: 686 / 1689 loss=3.708, nll_loss=2.174, ppl=4.51, wps=488514, ups=0.98, wpb=496002, bsz=16479.2, num_updates=36100, lr=0.000332871, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=33439 epoch 022: 686 / 1689 loss=3.708, nll_loss=2.174, ppl=4.51, wps=488514, ups=0.98, wpb=496002, bsz=16479.2, num_updates=36100, lr=0.000332871, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=33439 epoch 022: 686 / 1689 loss=3.708, nll_loss=2.174, ppl=4.51, wps=488514, ups=0.98, wpb=496002, bsz=16479.2, num_updates=36100, lr=0.000332871, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=33439 epoch 022: 686 / 1689 loss=3.708, nll_loss=2.174, ppl=4.51, wps=488514, ups=0.98, wpb=496002, bsz=16479.2, num_updates=36100, lr=0.000332871, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=33439 epoch 022: 686 / 1689 loss=3.708, nll_loss=2.174, ppl=4.51, wps=488514, ups=0.98, wpb=496002, bsz=16479.2, num_updates=36100, lr=0.000332871, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=33439 epoch 022: 686 / 1689 loss=3.708, nll_loss=2.174, ppl=4.51, wps=488514, ups=0.98, wpb=496002, bsz=16479.2, num_updates=36100, lr=0.000332871, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=33439 epoch 022: 686 / 1689 loss=3.708, nll_loss=2.174, ppl=4.51, wps=488514, ups=0.98, wpb=496002, bsz=16479.2, num_updates=36100, lr=0.000332871, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=33439 epoch 022: 686 / 1689 loss=3.708, nll_loss=2.174, ppl=4.51, wps=488514, ups=0.98, wpb=496002, bsz=16479.2, num_updates=36100, lr=0.000332871, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=33439 epoch 022: 686 / 1689 loss=3.708, nll_loss=2.174, ppl=4.51, wps=488514, ups=0.98, wpb=496002, bsz=16479.2, num_updates=36100, lr=0.000332871, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=33439 epoch 022: 686 / 1689 loss=3.708, nll_loss=2.174, ppl=4.51, wps=488514, ups=0.98, wpb=496002, bsz=16479.2, num_updates=36100, lr=0.000332871, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=33439 epoch 022: 686 / 1689 loss=3.708, nll_loss=2.174, ppl=4.51, wps=488514, ups=0.98, wpb=496002, bsz=16479.2, num_updates=36100, lr=0.000332871, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=33439 epoch 022: 686 / 1689 loss=3.708, nll_loss=2.174, ppl=4.51, wps=488514, ups=0.98, wpb=496002, bsz=16479.2, num_updates=36100, lr=0.000332871, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=33439 epoch 022: 686 / 1689 loss=3.708, nll_loss=2.174, ppl=4.51, wps=488514, ups=0.98, wpb=496002, bsz=16479.2, num_updates=36100, lr=0.000332871, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=33439 epoch 022: 786 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=554522, ups=1.12, wpb=494467, bsz=16245.6, num_updates=36200, lr=0.000332411, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=33529 epoch 022: 786 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=554522, ups=1.12, wpb=494467, bsz=16245.6, num_updates=36200, lr=0.000332411, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=33529 epoch 022: 786 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=554522, ups=1.12, wpb=494467, bsz=16245.6, num_updates=36200, lr=0.000332411, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=33529 epoch 022: 786 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=554522, ups=1.12, wpb=494467, bsz=16245.6, num_updates=36200, lr=0.000332411, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=33529 epoch 022: 786 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=554522, ups=1.12, wpb=494467, bsz=16245.6, num_updates=36200, lr=0.000332411, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=33529 epoch 022: 786 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=554522, ups=1.12, wpb=494467, bsz=16245.6, num_updates=36200, lr=0.000332411, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=33529 epoch 022: 786 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=554522, ups=1.12, wpb=494467, bsz=16245.6, num_updates=36200, lr=0.000332411, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=33529 epoch 022: 786 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=554522, ups=1.12, wpb=494467, bsz=16245.6, num_updates=36200, lr=0.000332411, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=33529 epoch 022: 786 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=554522, ups=1.12, wpb=494467, bsz=16245.6, num_updates=36200, lr=0.000332411, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=33529 epoch 022: 786 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=554522, ups=1.12, wpb=494467, bsz=16245.6, num_updates=36200, lr=0.000332411, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=33529 epoch 022: 786 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=554522, ups=1.12, wpb=494467, bsz=16245.6, num_updates=36200, lr=0.000332411, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=33529 epoch 022: 786 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=554522, ups=1.12, wpb=494467, bsz=16245.6, num_updates=36200, lr=0.000332411, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=33529 epoch 022: 786 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=554522, ups=1.12, wpb=494467, bsz=16245.6, num_updates=36200, lr=0.000332411, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=33529 epoch 022: 786 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=554522, ups=1.12, wpb=494467, bsz=16245.6, num_updates=36200, lr=0.000332411, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=33529 epoch 022: 786 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=554522, ups=1.12, wpb=494467, bsz=16245.6, num_updates=36200, lr=0.000332411, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=33529 epoch 022: 786 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=554522, ups=1.12, wpb=494467, bsz=16245.6, num_updates=36200, lr=0.000332411, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=33529 epoch 022: 786 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=554522, ups=1.12, wpb=494467, bsz=16245.6, num_updates=36200, lr=0.000332411, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=33529 epoch 022: 786 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=554522, ups=1.12, wpb=494467, bsz=16245.6, num_updates=36200, lr=0.000332411, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=33529 epoch 022: 786 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=554522, ups=1.12, wpb=494467, bsz=16245.6, num_updates=36200, lr=0.000332411, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=33529 epoch 022: 786 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=554522, ups=1.12, wpb=494467, bsz=16245.6, num_updates=36200, lr=0.000332411, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=33529 epoch 022: 786 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=554522, ups=1.12, wpb=494467, bsz=16245.6, num_updates=36200, lr=0.000332411, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=33529 epoch 022: 786 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=554522, ups=1.12, wpb=494467, bsz=16245.6, num_updates=36200, lr=0.000332411, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=33529 epoch 022: 886 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=545540, ups=1.1, wpb=494522, bsz=16716.6, num_updates=36300, lr=0.000331953, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=33619 epoch 022: 886 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=545540, ups=1.1, wpb=494522, bsz=16716.6, num_updates=36300, lr=0.000331953, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=33619 epoch 022: 886 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=545540, ups=1.1, wpb=494522, bsz=16716.6, num_updates=36300, lr=0.000331953, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=33619 epoch 022: 886 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=545540, ups=1.1, wpb=494522, bsz=16716.6, num_updates=36300, lr=0.000331953, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=33619 epoch 022: 886 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=545540, ups=1.1, wpb=494522, bsz=16716.6, num_updates=36300, lr=0.000331953, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=33619 epoch 022: 886 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=545540, ups=1.1, wpb=494522, bsz=16716.6, num_updates=36300, lr=0.000331953, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=33619 epoch 022: 886 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=545540, ups=1.1, wpb=494522, bsz=16716.6, num_updates=36300, lr=0.000331953, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=33619 epoch 022: 886 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=545540, ups=1.1, wpb=494522, bsz=16716.6, num_updates=36300, lr=0.000331953, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=33619 epoch 022: 886 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=545540, ups=1.1, wpb=494522, bsz=16716.6, num_updates=36300, lr=0.000331953, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=33619 epoch 022: 886 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=545540, ups=1.1, wpb=494522, bsz=16716.6, num_updates=36300, lr=0.000331953, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=33619 epoch 022: 886 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=545540, ups=1.1, wpb=494522, bsz=16716.6, num_updates=36300, lr=0.000331953, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=33619 epoch 022: 886 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=545540, ups=1.1, wpb=494522, bsz=16716.6, num_updates=36300, lr=0.000331953, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=33619 epoch 022: 886 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=545540, ups=1.1, wpb=494522, bsz=16716.6, num_updates=36300, lr=0.000331953, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=33619 epoch 022: 886 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=545540, ups=1.1, wpb=494522, bsz=16716.6, num_updates=36300, lr=0.000331953, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=33619 epoch 022: 886 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=545540, ups=1.1, wpb=494522, bsz=16716.6, num_updates=36300, lr=0.000331953, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=33619 epoch 022: 886 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=545540, ups=1.1, wpb=494522, bsz=16716.6, num_updates=36300, lr=0.000331953, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=33619 epoch 022: 886 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=545540, ups=1.1, wpb=494522, bsz=16716.6, num_updates=36300, lr=0.000331953, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=33619 epoch 022: 886 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=545540, ups=1.1, wpb=494522, bsz=16716.6, num_updates=36300, lr=0.000331953, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=33619 epoch 022: 886 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=545540, ups=1.1, wpb=494522, bsz=16716.6, num_updates=36300, lr=0.000331953, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=33619 epoch 022: 886 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=545540, ups=1.1, wpb=494522, bsz=16716.6, num_updates=36300, lr=0.000331953, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=33619 epoch 022: 886 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=545540, ups=1.1, wpb=494522, bsz=16716.6, num_updates=36300, lr=0.000331953, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=33619 epoch 022: 886 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=545540, ups=1.1, wpb=494522, bsz=16716.6, num_updates=36300, lr=0.000331953, gnorm=0.177, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=33619 epoch 022: 986 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=554564, ups=1.12, wpb=496327, bsz=16623, num_updates=36400, lr=0.000331497, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=33709 epoch 022: 986 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=554564, ups=1.12, wpb=496327, bsz=16623, num_updates=36400, lr=0.000331497, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=33709 epoch 022: 986 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=554564, ups=1.12, wpb=496327, bsz=16623, num_updates=36400, lr=0.000331497, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=33709 epoch 022: 986 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=554564, ups=1.12, wpb=496327, bsz=16623, num_updates=36400, lr=0.000331497, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=33709 epoch 022: 986 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=554564, ups=1.12, wpb=496327, bsz=16623, num_updates=36400, lr=0.000331497, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=33709 epoch 022: 986 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=554564, ups=1.12, wpb=496327, bsz=16623, num_updates=36400, lr=0.000331497, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=33709 epoch 022: 986 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=554564, ups=1.12, wpb=496327, bsz=16623, num_updates=36400, lr=0.000331497, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=33709 epoch 022: 986 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=554564, ups=1.12, wpb=496327, bsz=16623, num_updates=36400, lr=0.000331497, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=33709 epoch 022: 986 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=554564, ups=1.12, wpb=496327, bsz=16623, num_updates=36400, lr=0.000331497, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=33709 epoch 022: 986 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=554564, ups=1.12, wpb=496327, bsz=16623, num_updates=36400, lr=0.000331497, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=33709 epoch 022: 986 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=554564, ups=1.12, wpb=496327, bsz=16623, num_updates=36400, lr=0.000331497, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=33709 epoch 022: 986 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=554564, ups=1.12, wpb=496327, bsz=16623, num_updates=36400, lr=0.000331497, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=33709 epoch 022: 986 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=554564, ups=1.12, wpb=496327, bsz=16623, num_updates=36400, lr=0.000331497, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=33709 epoch 022: 986 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=554564, ups=1.12, wpb=496327, bsz=16623, num_updates=36400, lr=0.000331497, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=33709 epoch 022: 986 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=554564, ups=1.12, wpb=496327, bsz=16623, num_updates=36400, lr=0.000331497, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=33709 epoch 022: 986 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=554564, ups=1.12, wpb=496327, bsz=16623, num_updates=36400, lr=0.000331497, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=33709 epoch 022: 986 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=554564, ups=1.12, wpb=496327, bsz=16623, num_updates=36400, lr=0.000331497, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=33709 epoch 022: 986 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=554564, ups=1.12, wpb=496327, bsz=16623, num_updates=36400, lr=0.000331497, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=33709 epoch 022: 986 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=554564, ups=1.12, wpb=496327, bsz=16623, num_updates=36400, lr=0.000331497, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=33709 epoch 022: 986 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=554564, ups=1.12, wpb=496327, bsz=16623, num_updates=36400, lr=0.000331497, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=33709 epoch 022: 986 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=554564, ups=1.12, wpb=496327, bsz=16623, num_updates=36400, lr=0.000331497, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=33709 epoch 022: 986 / 1689 loss=3.713, nll_loss=2.18, ppl=4.53, wps=554564, ups=1.12, wpb=496327, bsz=16623, num_updates=36400, lr=0.000331497, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=33709 epoch 022: 1086 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=550383, ups=1.11, wpb=495100, bsz=16589.9, num_updates=36500, lr=0.000331042, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=33799 epoch 022: 1086 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=550383, ups=1.11, wpb=495100, bsz=16589.9, num_updates=36500, lr=0.000331042, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=33799 epoch 022: 1086 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=550383, ups=1.11, wpb=495100, bsz=16589.9, num_updates=36500, lr=0.000331042, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=33799 epoch 022: 1086 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=550383, ups=1.11, wpb=495100, bsz=16589.9, num_updates=36500, lr=0.000331042, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=33799 epoch 022: 1086 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=550383, ups=1.11, wpb=495100, bsz=16589.9, num_updates=36500, lr=0.000331042, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=33799 epoch 022: 1086 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=550383, ups=1.11, wpb=495100, bsz=16589.9, num_updates=36500, lr=0.000331042, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=33799 epoch 022: 1086 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=550383, ups=1.11, wpb=495100, bsz=16589.9, num_updates=36500, lr=0.000331042, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=33799 epoch 022: 1086 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=550383, ups=1.11, wpb=495100, bsz=16589.9, num_updates=36500, lr=0.000331042, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=33799 epoch 022: 1086 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=550383, ups=1.11, wpb=495100, bsz=16589.9, num_updates=36500, lr=0.000331042, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=33799 epoch 022: 1086 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=550383, ups=1.11, wpb=495100, bsz=16589.9, num_updates=36500, lr=0.000331042, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=33799 epoch 022: 1086 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=550383, ups=1.11, wpb=495100, bsz=16589.9, num_updates=36500, lr=0.000331042, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=33799 epoch 022: 1086 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=550383, ups=1.11, wpb=495100, bsz=16589.9, num_updates=36500, lr=0.000331042, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=33799 epoch 022: 1086 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=550383, ups=1.11, wpb=495100, bsz=16589.9, num_updates=36500, lr=0.000331042, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=33799 epoch 022: 1086 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=550383, ups=1.11, wpb=495100, bsz=16589.9, num_updates=36500, lr=0.000331042, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=33799 epoch 022: 1086 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=550383, ups=1.11, wpb=495100, bsz=16589.9, num_updates=36500, lr=0.000331042, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=33799 epoch 022: 1086 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=550383, ups=1.11, wpb=495100, bsz=16589.9, num_updates=36500, lr=0.000331042, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=33799 epoch 022: 1086 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=550383, ups=1.11, wpb=495100, bsz=16589.9, num_updates=36500, lr=0.000331042, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=33799 epoch 022: 1086 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=550383, ups=1.11, wpb=495100, bsz=16589.9, num_updates=36500, lr=0.000331042, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=33799 epoch 022: 1086 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=550383, ups=1.11, wpb=495100, bsz=16589.9, num_updates=36500, lr=0.000331042, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=33799 epoch 022: 1086 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=550383, ups=1.11, wpb=495100, bsz=16589.9, num_updates=36500, lr=0.000331042, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=33799 epoch 022: 1086 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=550383, ups=1.11, wpb=495100, bsz=16589.9, num_updates=36500, lr=0.000331042, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=33799 epoch 022: 1086 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=550383, ups=1.11, wpb=495100, bsz=16589.9, num_updates=36500, lr=0.000331042, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=20.9, wall=33799 epoch 022: 1186 / 1689 loss=3.714, nll_loss=2.182, ppl=4.54, wps=559616, ups=1.13, wpb=495800, bsz=16306, num_updates=36600, lr=0.00033059, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=33887 epoch 022: 1186 / 1689 loss=3.714, nll_loss=2.182, ppl=4.54, wps=559616, ups=1.13, wpb=495800, bsz=16306, num_updates=36600, lr=0.00033059, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=33887 epoch 022: 1186 / 1689 loss=3.714, nll_loss=2.182, ppl=4.54, wps=559616, ups=1.13, wpb=495800, bsz=16306, num_updates=36600, lr=0.00033059, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=33887 epoch 022: 1186 / 1689 loss=3.714, nll_loss=2.182, ppl=4.54, wps=559616, ups=1.13, wpb=495800, bsz=16306, num_updates=36600, lr=0.00033059, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=33887 epoch 022: 1186 / 1689 loss=3.714, nll_loss=2.182, ppl=4.54, wps=559616, ups=1.13, wpb=495800, bsz=16306, num_updates=36600, lr=0.00033059, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=33887 epoch 022: 1186 / 1689 loss=3.714, nll_loss=2.182, ppl=4.54, wps=559616, ups=1.13, wpb=495800, bsz=16306, num_updates=36600, lr=0.00033059, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=33887 epoch 022: 1186 / 1689 loss=3.714, nll_loss=2.182, ppl=4.54, wps=559616, ups=1.13, wpb=495800, bsz=16306, num_updates=36600, lr=0.00033059, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=33887 epoch 022: 1186 / 1689 loss=3.714, nll_loss=2.182, ppl=4.54, wps=559616, ups=1.13, wpb=495800, bsz=16306, num_updates=36600, lr=0.00033059, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=33887 epoch 022: 1186 / 1689 loss=3.714, nll_loss=2.182, ppl=4.54, wps=559616, ups=1.13, wpb=495800, bsz=16306, num_updates=36600, lr=0.00033059, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=33887 epoch 022: 1186 / 1689 loss=3.714, nll_loss=2.182, ppl=4.54, wps=559616, ups=1.13, wpb=495800, bsz=16306, num_updates=36600, lr=0.00033059, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=33887 epoch 022: 1186 / 1689 loss=3.714, nll_loss=2.182, ppl=4.54, wps=559616, ups=1.13, wpb=495800, bsz=16306, num_updates=36600, lr=0.00033059, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=33887 epoch 022: 1186 / 1689 loss=3.714, nll_loss=2.182, ppl=4.54, wps=559616, ups=1.13, wpb=495800, bsz=16306, num_updates=36600, lr=0.00033059, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=33887 epoch 022: 1186 / 1689 loss=3.714, nll_loss=2.182, ppl=4.54, wps=559616, ups=1.13, wpb=495800, bsz=16306, num_updates=36600, lr=0.00033059, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=33887 epoch 022: 1186 / 1689 loss=3.714, nll_loss=2.182, ppl=4.54, wps=559616, ups=1.13, wpb=495800, bsz=16306, num_updates=36600, lr=0.00033059, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=33887 epoch 022: 1186 / 1689 loss=3.714, nll_loss=2.182, ppl=4.54, wps=559616, ups=1.13, wpb=495800, bsz=16306, num_updates=36600, lr=0.00033059, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=33887 epoch 022: 1186 / 1689 loss=3.714, nll_loss=2.182, ppl=4.54, wps=559616, ups=1.13, wpb=495800, bsz=16306, num_updates=36600, lr=0.00033059, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=33887 epoch 022: 1186 / 1689 loss=3.714, nll_loss=2.182, ppl=4.54, wps=559616, ups=1.13, wpb=495800, bsz=16306, num_updates=36600, lr=0.00033059, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=33887 epoch 022: 1186 / 1689 loss=3.714, nll_loss=2.182, ppl=4.54, wps=559616, ups=1.13, wpb=495800, bsz=16306, num_updates=36600, lr=0.00033059, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=33887 epoch 022: 1186 / 1689 loss=3.714, nll_loss=2.182, ppl=4.54, wps=559616, ups=1.13, wpb=495800, bsz=16306, num_updates=36600, lr=0.00033059, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=33887 epoch 022: 1186 / 1689 loss=3.714, nll_loss=2.182, ppl=4.54, wps=559616, ups=1.13, wpb=495800, bsz=16306, num_updates=36600, lr=0.00033059, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=33887 epoch 022: 1186 / 1689 loss=3.714, nll_loss=2.182, ppl=4.54, wps=559616, ups=1.13, wpb=495800, bsz=16306, num_updates=36600, lr=0.00033059, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=33887 epoch 022: 1186 / 1689 loss=3.714, nll_loss=2.182, ppl=4.54, wps=559616, ups=1.13, wpb=495800, bsz=16306, num_updates=36600, lr=0.00033059, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=33887 epoch 022: 1286 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=558369, ups=1.12, wpb=497215, bsz=16490.8, num_updates=36700, lr=0.000330139, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=33976 epoch 022: 1286 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=558369, ups=1.12, wpb=497215, bsz=16490.8, num_updates=36700, lr=0.000330139, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=33976 epoch 022: 1286 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=558369, ups=1.12, wpb=497215, bsz=16490.8, num_updates=36700, lr=0.000330139, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=33976 epoch 022: 1286 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=558369, ups=1.12, wpb=497215, bsz=16490.8, num_updates=36700, lr=0.000330139, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=33976 epoch 022: 1286 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=558369, ups=1.12, wpb=497215, bsz=16490.8, num_updates=36700, lr=0.000330139, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=33976 epoch 022: 1286 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=558369, ups=1.12, wpb=497215, bsz=16490.8, num_updates=36700, lr=0.000330139, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=33976 epoch 022: 1286 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=558369, ups=1.12, wpb=497215, bsz=16490.8, num_updates=36700, lr=0.000330139, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=33976 epoch 022: 1286 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=558369, ups=1.12, wpb=497215, bsz=16490.8, num_updates=36700, lr=0.000330139, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=33976 epoch 022: 1286 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=558369, ups=1.12, wpb=497215, bsz=16490.8, num_updates=36700, lr=0.000330139, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=33976 epoch 022: 1286 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=558369, ups=1.12, wpb=497215, bsz=16490.8, num_updates=36700, lr=0.000330139, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=33976 epoch 022: 1286 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=558369, ups=1.12, wpb=497215, bsz=16490.8, num_updates=36700, lr=0.000330139, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=33976 epoch 022: 1286 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=558369, ups=1.12, wpb=497215, bsz=16490.8, num_updates=36700, lr=0.000330139, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=33976 epoch 022: 1286 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=558369, ups=1.12, wpb=497215, bsz=16490.8, num_updates=36700, lr=0.000330139, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=33976 epoch 022: 1286 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=558369, ups=1.12, wpb=497215, bsz=16490.8, num_updates=36700, lr=0.000330139, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=33976 epoch 022: 1286 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=558369, ups=1.12, wpb=497215, bsz=16490.8, num_updates=36700, lr=0.000330139, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=33976 epoch 022: 1286 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=558369, ups=1.12, wpb=497215, bsz=16490.8, num_updates=36700, lr=0.000330139, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=33976 epoch 022: 1286 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=558369, ups=1.12, wpb=497215, bsz=16490.8, num_updates=36700, lr=0.000330139, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=33976 epoch 022: 1286 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=558369, ups=1.12, wpb=497215, bsz=16490.8, num_updates=36700, lr=0.000330139, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=33976 epoch 022: 1286 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=558369, ups=1.12, wpb=497215, bsz=16490.8, num_updates=36700, lr=0.000330139, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=33976 epoch 022: 1286 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=558369, ups=1.12, wpb=497215, bsz=16490.8, num_updates=36700, lr=0.000330139, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=33976 epoch 022: 1286 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=558369, ups=1.12, wpb=497215, bsz=16490.8, num_updates=36700, lr=0.000330139, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=33976 epoch 022: 1286 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=558369, ups=1.12, wpb=497215, bsz=16490.8, num_updates=36700, lr=0.000330139, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=33976 epoch 022: 1386 / 1689 loss=3.716, nll_loss=2.184, ppl=4.54, wps=553671, ups=1.12, wpb=495756, bsz=16610.7, num_updates=36800, lr=0.00032969, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=34066 epoch 022: 1386 / 1689 loss=3.716, nll_loss=2.184, ppl=4.54, wps=553671, ups=1.12, wpb=495756, bsz=16610.7, num_updates=36800, lr=0.00032969, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=34066 epoch 022: 1386 / 1689 loss=3.716, nll_loss=2.184, ppl=4.54, wps=553671, ups=1.12, wpb=495756, bsz=16610.7, num_updates=36800, lr=0.00032969, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=34066 epoch 022: 1386 / 1689 loss=3.716, nll_loss=2.184, ppl=4.54, wps=553671, ups=1.12, wpb=495756, bsz=16610.7, num_updates=36800, lr=0.00032969, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=34066 epoch 022: 1386 / 1689 loss=3.716, nll_loss=2.184, ppl=4.54, wps=553671, ups=1.12, wpb=495756, bsz=16610.7, num_updates=36800, lr=0.00032969, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=34066 epoch 022: 1386 / 1689 loss=3.716, nll_loss=2.184, ppl=4.54, wps=553671, ups=1.12, wpb=495756, bsz=16610.7, num_updates=36800, lr=0.00032969, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=34066 epoch 022: 1386 / 1689 loss=3.716, nll_loss=2.184, ppl=4.54, wps=553671, ups=1.12, wpb=495756, bsz=16610.7, num_updates=36800, lr=0.00032969, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=34066 epoch 022: 1386 / 1689 loss=3.716, nll_loss=2.184, ppl=4.54, wps=553671, ups=1.12, wpb=495756, bsz=16610.7, num_updates=36800, lr=0.00032969, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=34066 epoch 022: 1386 / 1689 loss=3.716, nll_loss=2.184, ppl=4.54, wps=553671, ups=1.12, wpb=495756, bsz=16610.7, num_updates=36800, lr=0.00032969, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=34066 epoch 022: 1386 / 1689 loss=3.716, nll_loss=2.184, ppl=4.54, wps=553671, ups=1.12, wpb=495756, bsz=16610.7, num_updates=36800, lr=0.00032969, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=34066 epoch 022: 1386 / 1689 loss=3.716, nll_loss=2.184, ppl=4.54, wps=553671, ups=1.12, wpb=495756, bsz=16610.7, num_updates=36800, lr=0.00032969, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=34066 epoch 022: 1386 / 1689 loss=3.716, nll_loss=2.184, ppl=4.54, wps=553671, ups=1.12, wpb=495756, bsz=16610.7, num_updates=36800, lr=0.00032969, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=34066 epoch 022: 1386 / 1689 loss=3.716, nll_loss=2.184, ppl=4.54, wps=553671, ups=1.12, wpb=495756, bsz=16610.7, num_updates=36800, lr=0.00032969, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=34066 epoch 022: 1386 / 1689 loss=3.716, nll_loss=2.184, ppl=4.54, wps=553671, ups=1.12, wpb=495756, bsz=16610.7, num_updates=36800, lr=0.00032969, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=34066 epoch 022: 1386 / 1689 loss=3.716, nll_loss=2.184, ppl=4.54, wps=553671, ups=1.12, wpb=495756, bsz=16610.7, num_updates=36800, lr=0.00032969, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=34066 epoch 022: 1386 / 1689 loss=3.716, nll_loss=2.184, ppl=4.54, wps=553671, ups=1.12, wpb=495756, bsz=16610.7, num_updates=36800, lr=0.00032969, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=34066 epoch 022: 1386 / 1689 loss=3.716, nll_loss=2.184, ppl=4.54, wps=553671, ups=1.12, wpb=495756, bsz=16610.7, num_updates=36800, lr=0.00032969, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=34066 epoch 022: 1386 / 1689 loss=3.716, nll_loss=2.184, ppl=4.54, wps=553671, ups=1.12, wpb=495756, bsz=16610.7, num_updates=36800, lr=0.00032969, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=34066 epoch 022: 1386 / 1689 loss=3.716, nll_loss=2.184, ppl=4.54, wps=553671, ups=1.12, wpb=495756, bsz=16610.7, num_updates=36800, lr=0.00032969, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=34066 epoch 022: 1386 / 1689 loss=3.716, nll_loss=2.184, ppl=4.54, wps=553671, ups=1.12, wpb=495756, bsz=16610.7, num_updates=36800, lr=0.00032969, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=34066 epoch 022: 1386 / 1689 loss=3.716, nll_loss=2.184, ppl=4.54, wps=553671, ups=1.12, wpb=495756, bsz=16610.7, num_updates=36800, lr=0.00032969, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=34066 epoch 022: 1386 / 1689 loss=3.716, nll_loss=2.184, ppl=4.54, wps=553671, ups=1.12, wpb=495756, bsz=16610.7, num_updates=36800, lr=0.00032969, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=34066 epoch 022: 1486 / 1689 loss=3.714, nll_loss=2.181, ppl=4.54, wps=555495, ups=1.12, wpb=495107, bsz=16317.4, num_updates=36900, lr=0.000329243, gnorm=0.176, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=34155 epoch 022: 1486 / 1689 loss=3.714, nll_loss=2.181, ppl=4.54, wps=555495, ups=1.12, wpb=495107, bsz=16317.4, num_updates=36900, lr=0.000329243, gnorm=0.176, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=34155 epoch 022: 1486 / 1689 loss=3.714, nll_loss=2.181, ppl=4.54, wps=555495, ups=1.12, wpb=495107, bsz=16317.4, num_updates=36900, lr=0.000329243, gnorm=0.176, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=34155 epoch 022: 1486 / 1689 loss=3.714, nll_loss=2.181, ppl=4.54, wps=555495, ups=1.12, wpb=495107, bsz=16317.4, num_updates=36900, lr=0.000329243, gnorm=0.176, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=34155 epoch 022: 1486 / 1689 loss=3.714, nll_loss=2.181, ppl=4.54, wps=555495, ups=1.12, wpb=495107, bsz=16317.4, num_updates=36900, lr=0.000329243, gnorm=0.176, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=34155 epoch 022: 1486 / 1689 loss=3.714, nll_loss=2.181, ppl=4.54, wps=555495, ups=1.12, wpb=495107, bsz=16317.4, num_updates=36900, lr=0.000329243, gnorm=0.176, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=34155 epoch 022: 1486 / 1689 loss=3.714, nll_loss=2.181, ppl=4.54, wps=555495, ups=1.12, wpb=495107, bsz=16317.4, num_updates=36900, lr=0.000329243, gnorm=0.176, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=34155 epoch 022: 1486 / 1689 loss=3.714, nll_loss=2.181, ppl=4.54, wps=555495, ups=1.12, wpb=495107, bsz=16317.4, num_updates=36900, lr=0.000329243, gnorm=0.176, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=34155 epoch 022: 1486 / 1689 loss=3.714, nll_loss=2.181, ppl=4.54, wps=555495, ups=1.12, wpb=495107, bsz=16317.4, num_updates=36900, lr=0.000329243, gnorm=0.176, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=34155 epoch 022: 1486 / 1689 loss=3.714, nll_loss=2.181, ppl=4.54, wps=555495, ups=1.12, wpb=495107, bsz=16317.4, num_updates=36900, lr=0.000329243, gnorm=0.176, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=34155 epoch 022: 1486 / 1689 loss=3.714, nll_loss=2.181, ppl=4.54, wps=555495, ups=1.12, wpb=495107, bsz=16317.4, num_updates=36900, lr=0.000329243, gnorm=0.176, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=34155 epoch 022: 1486 / 1689 loss=3.714, nll_loss=2.181, ppl=4.54, wps=555495, ups=1.12, wpb=495107, bsz=16317.4, num_updates=36900, lr=0.000329243, gnorm=0.176, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=34155 epoch 022: 1486 / 1689 loss=3.714, nll_loss=2.181, ppl=4.54, wps=555495, ups=1.12, wpb=495107, bsz=16317.4, num_updates=36900, lr=0.000329243, gnorm=0.176, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=34155 epoch 022: 1486 / 1689 loss=3.714, nll_loss=2.181, ppl=4.54, wps=555495, ups=1.12, wpb=495107, bsz=16317.4, num_updates=36900, lr=0.000329243, gnorm=0.176, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=34155 epoch 022: 1486 / 1689 loss=3.714, nll_loss=2.181, ppl=4.54, wps=555495, ups=1.12, wpb=495107, bsz=16317.4, num_updates=36900, lr=0.000329243, gnorm=0.176, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=34155 epoch 022: 1486 / 1689 loss=3.714, nll_loss=2.181, ppl=4.54, wps=555495, ups=1.12, wpb=495107, bsz=16317.4, num_updates=36900, lr=0.000329243, gnorm=0.176, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=34155 epoch 022: 1486 / 1689 loss=3.714, nll_loss=2.181, ppl=4.54, wps=555495, ups=1.12, wpb=495107, bsz=16317.4, num_updates=36900, lr=0.000329243, gnorm=0.176, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=34155 epoch 022: 1486 / 1689 loss=3.714, nll_loss=2.181, ppl=4.54, wps=555495, ups=1.12, wpb=495107, bsz=16317.4, num_updates=36900, lr=0.000329243, gnorm=0.176, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=34155 epoch 022: 1486 / 1689 loss=3.714, nll_loss=2.181, ppl=4.54, wps=555495, ups=1.12, wpb=495107, bsz=16317.4, num_updates=36900, lr=0.000329243, gnorm=0.176, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=34155 epoch 022: 1486 / 1689 loss=3.714, nll_loss=2.181, ppl=4.54, wps=555495, ups=1.12, wpb=495107, bsz=16317.4, num_updates=36900, lr=0.000329243, gnorm=0.176, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=34155 epoch 022: 1486 / 1689 loss=3.714, nll_loss=2.181, ppl=4.54, wps=555495, ups=1.12, wpb=495107, bsz=16317.4, num_updates=36900, lr=0.000329243, gnorm=0.176, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=34155 epoch 022: 1486 / 1689 loss=3.714, nll_loss=2.181, ppl=4.54, wps=555495, ups=1.12, wpb=495107, bsz=16317.4, num_updates=36900, lr=0.000329243, gnorm=0.176, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=34155 epoch 022: 1588 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=540786, ups=1.09, wpb=495531, bsz=16584.2, num_updates=37000, lr=0.000328798, gnorm=0.172, clip=0, loss_scale=1, train_wall=91, gb_free=22.1, wall=34247 epoch 022: 1588 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=540786, ups=1.09, wpb=495531, bsz=16584.2, num_updates=37000, lr=0.000328798, gnorm=0.172, clip=0, loss_scale=1, train_wall=91, gb_free=22.1, wall=34247 epoch 022: 1588 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=540786, ups=1.09, wpb=495531, bsz=16584.2, num_updates=37000, lr=0.000328798, gnorm=0.172, clip=0, loss_scale=1, train_wall=91, gb_free=22.1, wall=34247 epoch 022: 1588 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=540786, ups=1.09, wpb=495531, bsz=16584.2, num_updates=37000, lr=0.000328798, gnorm=0.172, clip=0, loss_scale=1, train_wall=91, gb_free=22.1, wall=34247 epoch 022: 1588 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=540786, ups=1.09, wpb=495531, bsz=16584.2, num_updates=37000, lr=0.000328798, gnorm=0.172, clip=0, loss_scale=1, train_wall=91, gb_free=22.1, wall=34247 epoch 022: 1588 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=540786, ups=1.09, wpb=495531, bsz=16584.2, num_updates=37000, lr=0.000328798, gnorm=0.172, clip=0, loss_scale=1, train_wall=91, gb_free=22.1, wall=34247 epoch 022: 1588 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=540786, ups=1.09, wpb=495531, bsz=16584.2, num_updates=37000, lr=0.000328798, gnorm=0.172, clip=0, loss_scale=1, train_wall=91, gb_free=22.1, wall=34247 epoch 022: 1588 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=540786, ups=1.09, wpb=495531, bsz=16584.2, num_updates=37000, lr=0.000328798, gnorm=0.172, clip=0, loss_scale=1, train_wall=91, gb_free=22.1, wall=34247 epoch 022: 1588 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=540786, ups=1.09, wpb=495531, bsz=16584.2, num_updates=37000, lr=0.000328798, gnorm=0.172, clip=0, loss_scale=1, train_wall=91, gb_free=22.1, wall=34247 epoch 022: 1588 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=540786, ups=1.09, wpb=495531, bsz=16584.2, num_updates=37000, lr=0.000328798, gnorm=0.172, clip=0, loss_scale=1, train_wall=91, gb_free=22.1, wall=34247 epoch 022: 1588 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=540786, ups=1.09, wpb=495531, bsz=16584.2, num_updates=37000, lr=0.000328798, gnorm=0.172, clip=0, loss_scale=1, train_wall=91, gb_free=22.1, wall=34247 epoch 022: 1588 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=540786, ups=1.09, wpb=495531, bsz=16584.2, num_updates=37000, lr=0.000328798, gnorm=0.172, clip=0, loss_scale=1, train_wall=91, gb_free=22.1, wall=34247 epoch 022: 1588 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=540786, ups=1.09, wpb=495531, bsz=16584.2, num_updates=37000, lr=0.000328798, gnorm=0.172, clip=0, loss_scale=1, train_wall=91, gb_free=22.1, wall=34247 epoch 022: 1588 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=540786, ups=1.09, wpb=495531, bsz=16584.2, num_updates=37000, lr=0.000328798, gnorm=0.172, clip=0, loss_scale=1, train_wall=91, gb_free=22.1, wall=34247 epoch 022: 1588 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=540786, ups=1.09, wpb=495531, bsz=16584.2, num_updates=37000, lr=0.000328798, gnorm=0.172, clip=0, loss_scale=1, train_wall=91, gb_free=22.1, wall=34247 epoch 022: 1588 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=540786, ups=1.09, wpb=495531, bsz=16584.2, num_updates=37000, lr=0.000328798, gnorm=0.172, clip=0, loss_scale=1, train_wall=91, gb_free=22.1, wall=34247 epoch 022: 1588 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=540786, ups=1.09, wpb=495531, bsz=16584.2, num_updates=37000, lr=0.000328798, gnorm=0.172, clip=0, loss_scale=1, train_wall=91, gb_free=22.1, wall=34247 epoch 022: 1588 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=540786, ups=1.09, wpb=495531, bsz=16584.2, num_updates=37000, lr=0.000328798, gnorm=0.172, clip=0, loss_scale=1, train_wall=91, gb_free=22.1, wall=34247 epoch 022: 1588 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=540786, ups=1.09, wpb=495531, bsz=16584.2, num_updates=37000, lr=0.000328798, gnorm=0.172, clip=0, loss_scale=1, train_wall=91, gb_free=22.1, wall=34247 epoch 022: 1588 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=540786, ups=1.09, wpb=495531, bsz=16584.2, num_updates=37000, lr=0.000328798, gnorm=0.172, clip=0, loss_scale=1, train_wall=91, gb_free=22.1, wall=34247 epoch 022: 1588 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=540786, ups=1.09, wpb=495531, bsz=16584.2, num_updates=37000, lr=0.000328798, gnorm=0.172, clip=0, loss_scale=1, train_wall=91, gb_free=22.1, wall=34247 epoch 022: 1588 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=540786, ups=1.09, wpb=495531, bsz=16584.2, num_updates=37000, lr=0.000328798, gnorm=0.172, clip=0, loss_scale=1, train_wall=91, gb_free=22.1, wall=34247 begin validation on "valid" subset epoch 022 | valid on 'valid' subset | loss 3.723 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.723 epoch 022 | valid on 'valid' subset | loss 3.723 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.723 epoch 022 | valid on 'valid' subset | loss 3.723 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.723 epoch 022 | valid on 'valid' subset | loss 3.723 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.723 epoch 022 | valid on 'valid' subset | loss 3.723 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.723 epoch 022 | valid on 'valid' subset | loss 3.723 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.723 epoch 022 | valid on 'valid' subset | loss 3.723 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.723 epoch 022 | valid on 'valid' subset | loss 3.723 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.723 epoch 022 | valid on 'valid' subset | loss 3.723 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.723 epoch 022 | valid on 'valid' subset | loss 3.723 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.723 epoch 022 | valid on 'valid' subset | loss 3.723 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.723 epoch 022 | valid on 'valid' subset | loss 3.723 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.723 epoch 022 | valid on 'valid' subset | loss 3.723 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.723 epoch 022 | valid on 'valid' subset | loss 3.723 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.723 epoch 022 | valid on 'valid' subset | loss 3.723 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.723 epoch 022 | valid on 'valid' subset | loss 3.723 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.723 epoch 022 | valid on 'valid' subset | loss 3.723 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.723 epoch 022 | valid on 'valid' subset | loss 3.723 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.723 epoch 022 | valid on 'valid' subset | loss 3.723 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.723 epoch 022 | valid on 'valid' subset | loss 3.723 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.723 epoch 022 | valid on 'valid' subset | loss 3.723 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.723 epoch 022 | valid on 'valid' subset | loss 3.723 | nll_loss 2.16 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 37000 | best_loss 3.723 epoch 022: 1688 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=456271, ups=0.92, wpb=495407, bsz=16520.2, num_updates=37100, lr=0.000328355, gnorm=0.178, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=34355 epoch 022: 1688 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=456271, ups=0.92, wpb=495407, bsz=16520.2, num_updates=37100, lr=0.000328355, gnorm=0.178, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=34355 epoch 022: 1688 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=456271, ups=0.92, wpb=495407, bsz=16520.2, num_updates=37100, lr=0.000328355, gnorm=0.178, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=34355 epoch 022: 1688 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=456271, ups=0.92, wpb=495407, bsz=16520.2, num_updates=37100, lr=0.000328355, gnorm=0.178, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=34355 epoch 022: 1688 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=456271, ups=0.92, wpb=495407, bsz=16520.2, num_updates=37100, lr=0.000328355, gnorm=0.178, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=34355 epoch 022: 1688 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=456271, ups=0.92, wpb=495407, bsz=16520.2, num_updates=37100, lr=0.000328355, gnorm=0.178, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=34355 epoch 022: 1688 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=456271, ups=0.92, wpb=495407, bsz=16520.2, num_updates=37100, lr=0.000328355, gnorm=0.178, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=34355 epoch 022: 1688 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=456271, ups=0.92, wpb=495407, bsz=16520.2, num_updates=37100, lr=0.000328355, gnorm=0.178, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=34355 epoch 022: 1688 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=456271, ups=0.92, wpb=495407, bsz=16520.2, num_updates=37100, lr=0.000328355, gnorm=0.178, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=34355 epoch 022: 1688 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=456271, ups=0.92, wpb=495407, bsz=16520.2, num_updates=37100, lr=0.000328355, gnorm=0.178, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=34355 epoch 022: 1688 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=456271, ups=0.92, wpb=495407, bsz=16520.2, num_updates=37100, lr=0.000328355, gnorm=0.178, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=34355 epoch 022: 1688 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=456271, ups=0.92, wpb=495407, bsz=16520.2, num_updates=37100, lr=0.000328355, gnorm=0.178, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=34355 epoch 022: 1688 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=456271, ups=0.92, wpb=495407, bsz=16520.2, num_updates=37100, lr=0.000328355, gnorm=0.178, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=34355 epoch 022: 1688 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=456271, ups=0.92, wpb=495407, bsz=16520.2, num_updates=37100, lr=0.000328355, gnorm=0.178, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=34355 epoch 022: 1688 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=456271, ups=0.92, wpb=495407, bsz=16520.2, num_updates=37100, lr=0.000328355, gnorm=0.178, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=34355 epoch 022: 1688 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=456271, ups=0.92, wpb=495407, bsz=16520.2, num_updates=37100, lr=0.000328355, gnorm=0.178, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=34355 epoch 022: 1688 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=456271, ups=0.92, wpb=495407, bsz=16520.2, num_updates=37100, lr=0.000328355, gnorm=0.178, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=34355 epoch 022: 1688 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=456271, ups=0.92, wpb=495407, bsz=16520.2, num_updates=37100, lr=0.000328355, gnorm=0.178, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=34355 epoch 022: 1688 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=456271, ups=0.92, wpb=495407, bsz=16520.2, num_updates=37100, lr=0.000328355, gnorm=0.178, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=34355 epoch 022: 1688 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=456271, ups=0.92, wpb=495407, bsz=16520.2, num_updates=37100, lr=0.000328355, gnorm=0.178, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=34355 epoch 022: 1688 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=456271, ups=0.92, wpb=495407, bsz=16520.2, num_updates=37100, lr=0.000328355, gnorm=0.178, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=34355 epoch 022: 1688 / 1689 loss=3.711, nll_loss=2.178, ppl=4.53, wps=456271, ups=0.92, wpb=495407, bsz=16520.2, num_updates=37100, lr=0.000328355, gnorm=0.178, clip=0, loss_scale=1, train_wall=89, gb_free=22, wall=34355 end of epoch 22 (average epoch stats below) epoch 022 | loss 3.708 | nll_loss 2.174 | ppl 4.51 | wps 539958 | ups 1.09 | wpb 495111 | bsz 16503.5 | num_updates 37101 | lr 0.00032835 | gnorm 0.174 | clip 0 | loss_scale 1 | train_wall 1495 | gb_free 21.9 | wall 34355 epoch 022 | loss 3.708 | nll_loss 2.174 | ppl 4.51 | wps 539958 | ups 1.09 | wpb 495111 | bsz 16503.5 | num_updates 37101 | lr 0.00032835 | gnorm 0.174 | clip 0 | loss_scale 1 | train_wall 1495 | gb_free 21.9 | wall 34355 epoch 022 | loss 3.708 | nll_loss 2.174 | ppl 4.51 | wps 539958 | ups 1.09 | wpb 495111 | bsz 16503.5 | num_updates 37101 | lr 0.00032835 | gnorm 0.174 | clip 0 | loss_scale 1 | train_wall 1495 | gb_free 21.9 | wall 34355 epoch 022 | loss 3.708 | nll_loss 2.174 | ppl 4.51 | wps 539958 | ups 1.09 | wpb 495111 | bsz 16503.5 | num_updates 37101 | lr 0.00032835 | gnorm 0.174 | clip 0 | loss_scale 1 | train_wall 1495 | gb_free 21.9 | wall 34355 epoch 022 | loss 3.708 | nll_loss 2.174 | ppl 4.51 | wps 539958 | ups 1.09 | wpb 495111 | bsz 16503.5 | num_updates 37101 | lr 0.00032835 | gnorm 0.174 | clip 0 | loss_scale 1 | train_wall 1495 | gb_free 21.9 | wall 34355 epoch 022 | loss 3.708 | nll_loss 2.174 | ppl 4.51 | wps 539958 | ups 1.09 | wpb 495111 | bsz 16503.5 | num_updates 37101 | lr 0.00032835 | gnorm 0.174 | clip 0 | loss_scale 1 | train_wall 1495 | gb_free 21.9 | wall 34355 epoch 022 | loss 3.708 | nll_loss 2.174 | ppl 4.51 | wps 539958 | ups 1.09 | wpb 495111 | bsz 16503.5 | num_updates 37101 | lr 0.00032835 | gnorm 0.174 | clip 0 | loss_scale 1 | train_wall 1495 | gb_free 21.9 | wall 34355 epoch 022 | loss 3.708 | nll_loss 2.174 | ppl 4.51 | wps 539958 | ups 1.09 | wpb 495111 | bsz 16503.5 | num_updates 37101 | lr 0.00032835 | gnorm 0.174 | clip 0 | loss_scale 1 | train_wall 1495 | gb_free 21.9 | wall 34355 epoch 022 | loss 3.708 | nll_loss 2.174 | ppl 4.51 | wps 539958 | ups 1.09 | wpb 495111 | bsz 16503.5 | num_updates 37101 | lr 0.00032835 | gnorm 0.174 | clip 0 | loss_scale 1 | train_wall 1495 | gb_free 21.9 | wall 34355 epoch 022 | loss 3.708 | nll_loss 2.174 | ppl 4.51 | wps 539958 | ups 1.09 | wpb 495111 | bsz 16503.5 | num_updates 37101 | lr 0.00032835 | gnorm 0.174 | clip 0 | loss_scale 1 | train_wall 1495 | gb_free 21.9 | wall 34355 epoch 022 | loss 3.708 | nll_loss 2.174 | ppl 4.51 | wps 539958 | ups 1.09 | wpb 495111 | bsz 16503.5 | num_updates 37101 | lr 0.00032835 | gnorm 0.174 | clip 0 | loss_scale 1 | train_wall 1495 | gb_free 21.9 | wall 34355 epoch 022 | loss 3.708 | nll_loss 2.174 | ppl 4.51 | wps 539958 | ups 1.09 | wpb 495111 | bsz 16503.5 | num_updates 37101 | lr 0.00032835 | gnorm 0.174 | clip 0 | loss_scale 1 | train_wall 1495 | gb_free 21.9 | wall 34355 epoch 022 | loss 3.708 | nll_loss 2.174 | ppl 4.51 | wps 539958 | ups 1.09 | wpb 495111 | bsz 16503.5 | num_updates 37101 | lr 0.00032835 | gnorm 0.174 | clip 0 | loss_scale 1 | train_wall 1495 | gb_free 21.9 | wall 34355 epoch 022 | loss 3.708 | nll_loss 2.174 | ppl 4.51 | wps 539958 | ups 1.09 | wpb 495111 | bsz 16503.5 | num_updates 37101 | lr 0.00032835 | gnorm 0.174 | clip 0 | loss_scale 1 | train_wall 1495 | gb_free 21.9 | wall 34355 epoch 022 | loss 3.708 | nll_loss 2.174 | ppl 4.51 | wps 539958 | ups 1.09 | wpb 495111 | bsz 16503.5 | num_updates 37101 | lr 0.00032835 | gnorm 0.174 | clip 0 | loss_scale 1 | train_wall 1495 | gb_free 21.9 | wall 34355 epoch 022 | loss 3.708 | nll_loss 2.174 | ppl 4.51 | wps 539958 | ups 1.09 | wpb 495111 | bsz 16503.5 | num_updates 37101 | lr 0.00032835 | gnorm 0.174 | clip 0 | loss_scale 1 | train_wall 1495 | gb_free 21.9 | wall 34355 epoch 022 | loss 3.708 | nll_loss 2.174 | ppl 4.51 | wps 539958 | ups 1.09 | wpb 495111 | bsz 16503.5 | num_updates 37101 | lr 0.00032835 | gnorm 0.174 | clip 0 | loss_scale 1 | train_wall 1495 | gb_free 21.9 | wall 34355 epoch 022 | loss 3.708 | nll_loss 2.174 | ppl 4.51 | wps 539958 | ups 1.09 | wpb 495111 | bsz 16503.5 | num_updates 37101 | lr 0.00032835 | gnorm 0.174 | clip 0 | loss_scale 1 | train_wall 1495 | gb_free 21.9 | wall 34355 epoch 022 | loss 3.708 | nll_loss 2.174 | ppl 4.51 | wps 539958 | ups 1.09 | wpb 495111 | bsz 16503.5 | num_updates 37101 | lr 0.00032835 | gnorm 0.174 | clip 0 | loss_scale 1 | train_wall 1495 | gb_free 21.9 | wall 34355 epoch 022 | loss 3.708 | nll_loss 2.174 | ppl 4.51 | wps 539958 | ups 1.09 | wpb 495111 | bsz 16503.5 | num_updates 37101 | lr 0.00032835 | gnorm 0.174 | clip 0 | loss_scale 1 | train_wall 1495 | gb_free 21.9 | wall 34355 epoch 022 | loss 3.708 | nll_loss 2.174 | ppl 4.51 | wps 539958 | ups 1.09 | wpb 495111 | bsz 16503.5 | num_updates 37101 | lr 0.00032835 | gnorm 0.174 | clip 0 | loss_scale 1 | train_wall 1495 | gb_free 21.9 | wall 34355 epoch 022 | loss 3.708 | nll_loss 2.174 | ppl 4.51 | wps 539958 | ups 1.09 | wpb 495111 | bsz 16503.5 | num_updates 37101 | lr 0.00032835 | gnorm 0.174 | clip 0 | loss_scale 1 | train_wall 1495 | gb_free 21.9 | wall 34355 Start iterating over samples epoch 023: 99 / 1689 loss=3.69, nll_loss=2.154, ppl=4.45, wps=547546, ups=1.11, wpb=491664, bsz=16436.9, num_updates=37200, lr=0.000327913, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=34445 epoch 023: 99 / 1689 loss=3.69, nll_loss=2.154, ppl=4.45, wps=547546, ups=1.11, wpb=491664, bsz=16436.9, num_updates=37200, lr=0.000327913, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=34445 epoch 023: 99 / 1689 loss=3.69, nll_loss=2.154, ppl=4.45, wps=547546, ups=1.11, wpb=491664, bsz=16436.9, num_updates=37200, lr=0.000327913, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=34445 epoch 023: 99 / 1689 loss=3.69, nll_loss=2.154, ppl=4.45, wps=547546, ups=1.11, wpb=491664, bsz=16436.9, num_updates=37200, lr=0.000327913, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=34445 epoch 023: 99 / 1689 loss=3.69, nll_loss=2.154, ppl=4.45, wps=547546, ups=1.11, wpb=491664, bsz=16436.9, num_updates=37200, lr=0.000327913, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=34445 epoch 023: 99 / 1689 loss=3.69, nll_loss=2.154, ppl=4.45, wps=547546, ups=1.11, wpb=491664, bsz=16436.9, num_updates=37200, lr=0.000327913, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=34445 epoch 023: 99 / 1689 loss=3.69, nll_loss=2.154, ppl=4.45, wps=547546, ups=1.11, wpb=491664, bsz=16436.9, num_updates=37200, lr=0.000327913, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=34445 epoch 023: 99 / 1689 loss=3.69, nll_loss=2.154, ppl=4.45, wps=547546, ups=1.11, wpb=491664, bsz=16436.9, num_updates=37200, lr=0.000327913, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=34445 epoch 023: 99 / 1689 loss=3.69, nll_loss=2.154, ppl=4.45, wps=547546, ups=1.11, wpb=491664, bsz=16436.9, num_updates=37200, lr=0.000327913, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=34445 epoch 023: 99 / 1689 loss=3.69, nll_loss=2.154, ppl=4.45, wps=547546, ups=1.11, wpb=491664, bsz=16436.9, num_updates=37200, lr=0.000327913, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=34445 epoch 023: 99 / 1689 loss=3.69, nll_loss=2.154, ppl=4.45, wps=547546, ups=1.11, wpb=491664, bsz=16436.9, num_updates=37200, lr=0.000327913, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=34445 epoch 023: 99 / 1689 loss=3.69, nll_loss=2.154, ppl=4.45, wps=547546, ups=1.11, wpb=491664, bsz=16436.9, num_updates=37200, lr=0.000327913, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=34445 epoch 023: 99 / 1689 loss=3.69, nll_loss=2.154, ppl=4.45, wps=547546, ups=1.11, wpb=491664, bsz=16436.9, num_updates=37200, lr=0.000327913, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=34445 epoch 023: 99 / 1689 loss=3.69, nll_loss=2.154, ppl=4.45, wps=547546, ups=1.11, wpb=491664, bsz=16436.9, num_updates=37200, lr=0.000327913, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=34445 epoch 023: 99 / 1689 loss=3.69, nll_loss=2.154, ppl=4.45, wps=547546, ups=1.11, wpb=491664, bsz=16436.9, num_updates=37200, lr=0.000327913, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=34445 epoch 023: 99 / 1689 loss=3.69, nll_loss=2.154, ppl=4.45, wps=547546, ups=1.11, wpb=491664, bsz=16436.9, num_updates=37200, lr=0.000327913, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=34445 epoch 023: 99 / 1689 loss=3.69, nll_loss=2.154, ppl=4.45, wps=547546, ups=1.11, wpb=491664, bsz=16436.9, num_updates=37200, lr=0.000327913, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=34445 epoch 023: 99 / 1689 loss=3.69, nll_loss=2.154, ppl=4.45, wps=547546, ups=1.11, wpb=491664, bsz=16436.9, num_updates=37200, lr=0.000327913, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=34445 epoch 023: 99 / 1689 loss=3.69, nll_loss=2.154, ppl=4.45, wps=547546, ups=1.11, wpb=491664, bsz=16436.9, num_updates=37200, lr=0.000327913, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=34445 epoch 023: 99 / 1689 loss=3.69, nll_loss=2.154, ppl=4.45, wps=547546, ups=1.11, wpb=491664, bsz=16436.9, num_updates=37200, lr=0.000327913, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=34445 epoch 023: 99 / 1689 loss=3.69, nll_loss=2.154, ppl=4.45, wps=547546, ups=1.11, wpb=491664, bsz=16436.9, num_updates=37200, lr=0.000327913, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=34445 epoch 023: 99 / 1689 loss=3.69, nll_loss=2.154, ppl=4.45, wps=547546, ups=1.11, wpb=491664, bsz=16436.9, num_updates=37200, lr=0.000327913, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=34445 epoch 023: 99 / 1689 loss=3.69, nll_loss=2.154, ppl=4.45, wps=547546, ups=1.11, wpb=491664, bsz=16436.9, num_updates=37200, lr=0.000327913, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.8, wall=34445 epoch 023: 199 / 1689 loss=3.693, nll_loss=2.157, ppl=4.46, wps=543713, ups=1.1, wpb=495041, bsz=17084.8, num_updates=37300, lr=0.000327473, gnorm=0.182, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=34536 epoch 023: 199 / 1689 loss=3.693, nll_loss=2.157, ppl=4.46, wps=543713, ups=1.1, wpb=495041, bsz=17084.8, num_updates=37300, lr=0.000327473, gnorm=0.182, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=34536 epoch 023: 199 / 1689 loss=3.693, nll_loss=2.157, ppl=4.46, wps=543713, ups=1.1, wpb=495041, bsz=17084.8, num_updates=37300, lr=0.000327473, gnorm=0.182, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=34536 epoch 023: 199 / 1689 loss=3.693, nll_loss=2.157, ppl=4.46, wps=543713, ups=1.1, wpb=495041, bsz=17084.8, num_updates=37300, lr=0.000327473, gnorm=0.182, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=34536 epoch 023: 199 / 1689 loss=3.693, nll_loss=2.157, ppl=4.46, wps=543713, ups=1.1, wpb=495041, bsz=17084.8, num_updates=37300, lr=0.000327473, gnorm=0.182, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=34536 epoch 023: 199 / 1689 loss=3.693, nll_loss=2.157, ppl=4.46, wps=543713, ups=1.1, wpb=495041, bsz=17084.8, num_updates=37300, lr=0.000327473, gnorm=0.182, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=34536 epoch 023: 199 / 1689 loss=3.693, nll_loss=2.157, ppl=4.46, wps=543713, ups=1.1, wpb=495041, bsz=17084.8, num_updates=37300, lr=0.000327473, gnorm=0.182, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=34536 epoch 023: 199 / 1689 loss=3.693, nll_loss=2.157, ppl=4.46, wps=543713, ups=1.1, wpb=495041, bsz=17084.8, num_updates=37300, lr=0.000327473, gnorm=0.182, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=34536 epoch 023: 199 / 1689 loss=3.693, nll_loss=2.157, ppl=4.46, wps=543713, ups=1.1, wpb=495041, bsz=17084.8, num_updates=37300, lr=0.000327473, gnorm=0.182, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=34536 epoch 023: 199 / 1689 loss=3.693, nll_loss=2.157, ppl=4.46, wps=543713, ups=1.1, wpb=495041, bsz=17084.8, num_updates=37300, lr=0.000327473, gnorm=0.182, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=34536 epoch 023: 199 / 1689 loss=3.693, nll_loss=2.157, ppl=4.46, wps=543713, ups=1.1, wpb=495041, bsz=17084.8, num_updates=37300, lr=0.000327473, gnorm=0.182, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=34536 epoch 023: 199 / 1689 loss=3.693, nll_loss=2.157, ppl=4.46, wps=543713, ups=1.1, wpb=495041, bsz=17084.8, num_updates=37300, lr=0.000327473, gnorm=0.182, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=34536 epoch 023: 199 / 1689 loss=3.693, nll_loss=2.157, ppl=4.46, wps=543713, ups=1.1, wpb=495041, bsz=17084.8, num_updates=37300, lr=0.000327473, gnorm=0.182, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=34536 epoch 023: 199 / 1689 loss=3.693, nll_loss=2.157, ppl=4.46, wps=543713, ups=1.1, wpb=495041, bsz=17084.8, num_updates=37300, lr=0.000327473, gnorm=0.182, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=34536 epoch 023: 199 / 1689 loss=3.693, nll_loss=2.157, ppl=4.46, wps=543713, ups=1.1, wpb=495041, bsz=17084.8, num_updates=37300, lr=0.000327473, gnorm=0.182, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=34536 epoch 023: 199 / 1689 loss=3.693, nll_loss=2.157, ppl=4.46, wps=543713, ups=1.1, wpb=495041, bsz=17084.8, num_updates=37300, lr=0.000327473, gnorm=0.182, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=34536 epoch 023: 199 / 1689 loss=3.693, nll_loss=2.157, ppl=4.46, wps=543713, ups=1.1, wpb=495041, bsz=17084.8, num_updates=37300, lr=0.000327473, gnorm=0.182, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=34536 epoch 023: 199 / 1689 loss=3.693, nll_loss=2.157, ppl=4.46, wps=543713, ups=1.1, wpb=495041, bsz=17084.8, num_updates=37300, lr=0.000327473, gnorm=0.182, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=34536 epoch 023: 199 / 1689 loss=3.693, nll_loss=2.157, ppl=4.46, wps=543713, ups=1.1, wpb=495041, bsz=17084.8, num_updates=37300, lr=0.000327473, gnorm=0.182, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=34536 epoch 023: 199 / 1689 loss=3.693, nll_loss=2.157, ppl=4.46, wps=543713, ups=1.1, wpb=495041, bsz=17084.8, num_updates=37300, lr=0.000327473, gnorm=0.182, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=34536 epoch 023: 199 / 1689 loss=3.693, nll_loss=2.157, ppl=4.46, wps=543713, ups=1.1, wpb=495041, bsz=17084.8, num_updates=37300, lr=0.000327473, gnorm=0.182, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=34536 epoch 023: 199 / 1689 loss=3.693, nll_loss=2.157, ppl=4.46, wps=543713, ups=1.1, wpb=495041, bsz=17084.8, num_updates=37300, lr=0.000327473, gnorm=0.182, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=34536 epoch 023: 199 / 1689 loss=3.693, nll_loss=2.157, ppl=4.46, wps=543713, ups=1.1, wpb=495041, bsz=17084.8, num_updates=37300, lr=0.000327473, gnorm=0.182, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=34536 epoch 023: 299 / 1689 loss=3.7, nll_loss=2.165, ppl=4.49, wps=546358, ups=1.1, wpb=496656, bsz=16754.7, num_updates=37400, lr=0.000327035, gnorm=0.169, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=34627 epoch 023: 299 / 1689 loss=3.7, nll_loss=2.165, ppl=4.49, wps=546358, ups=1.1, wpb=496656, bsz=16754.7, num_updates=37400, lr=0.000327035, gnorm=0.169, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=34627 epoch 023: 299 / 1689 loss=3.7, nll_loss=2.165, ppl=4.49, wps=546358, ups=1.1, wpb=496656, bsz=16754.7, num_updates=37400, lr=0.000327035, gnorm=0.169, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=34627 epoch 023: 299 / 1689 loss=3.7, nll_loss=2.165, ppl=4.49, wps=546358, ups=1.1, wpb=496656, bsz=16754.7, num_updates=37400, lr=0.000327035, gnorm=0.169, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=34627 epoch 023: 299 / 1689 loss=3.7, nll_loss=2.165, ppl=4.49, wps=546358, ups=1.1, wpb=496656, bsz=16754.7, num_updates=37400, lr=0.000327035, gnorm=0.169, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=34627 epoch 023: 299 / 1689 loss=3.7, nll_loss=2.165, ppl=4.49, wps=546358, ups=1.1, wpb=496656, bsz=16754.7, num_updates=37400, lr=0.000327035, gnorm=0.169, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=34627 epoch 023: 299 / 1689 loss=3.7, nll_loss=2.165, ppl=4.49, wps=546358, ups=1.1, wpb=496656, bsz=16754.7, num_updates=37400, lr=0.000327035, gnorm=0.169, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=34627 epoch 023: 299 / 1689 loss=3.7, nll_loss=2.165, ppl=4.49, wps=546358, ups=1.1, wpb=496656, bsz=16754.7, num_updates=37400, lr=0.000327035, gnorm=0.169, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=34627 epoch 023: 299 / 1689 loss=3.7, nll_loss=2.165, ppl=4.49, wps=546358, ups=1.1, wpb=496656, bsz=16754.7, num_updates=37400, lr=0.000327035, gnorm=0.169, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=34627 epoch 023: 299 / 1689 loss=3.7, nll_loss=2.165, ppl=4.49, wps=546358, ups=1.1, wpb=496656, bsz=16754.7, num_updates=37400, lr=0.000327035, gnorm=0.169, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=34627 epoch 023: 299 / 1689 loss=3.7, nll_loss=2.165, ppl=4.49, wps=546358, ups=1.1, wpb=496656, bsz=16754.7, num_updates=37400, lr=0.000327035, gnorm=0.169, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=34627 epoch 023: 299 / 1689 loss=3.7, nll_loss=2.165, ppl=4.49, wps=546358, ups=1.1, wpb=496656, bsz=16754.7, num_updates=37400, lr=0.000327035, gnorm=0.169, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=34627 epoch 023: 299 / 1689 loss=3.7, nll_loss=2.165, ppl=4.49, wps=546358, ups=1.1, wpb=496656, bsz=16754.7, num_updates=37400, lr=0.000327035, gnorm=0.169, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=34627 epoch 023: 299 / 1689 loss=3.7, nll_loss=2.165, ppl=4.49, wps=546358, ups=1.1, wpb=496656, bsz=16754.7, num_updates=37400, lr=0.000327035, gnorm=0.169, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=34627 epoch 023: 299 / 1689 loss=3.7, nll_loss=2.165, ppl=4.49, wps=546358, ups=1.1, wpb=496656, bsz=16754.7, num_updates=37400, lr=0.000327035, gnorm=0.169, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=34627 epoch 023: 299 / 1689 loss=3.7, nll_loss=2.165, ppl=4.49, wps=546358, ups=1.1, wpb=496656, bsz=16754.7, num_updates=37400, lr=0.000327035, gnorm=0.169, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=34627 epoch 023: 299 / 1689 loss=3.7, nll_loss=2.165, ppl=4.49, wps=546358, ups=1.1, wpb=496656, bsz=16754.7, num_updates=37400, lr=0.000327035, gnorm=0.169, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=34627 epoch 023: 299 / 1689 loss=3.7, nll_loss=2.165, ppl=4.49, wps=546358, ups=1.1, wpb=496656, bsz=16754.7, num_updates=37400, lr=0.000327035, gnorm=0.169, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=34627 epoch 023: 299 / 1689 loss=3.7, nll_loss=2.165, ppl=4.49, wps=546358, ups=1.1, wpb=496656, bsz=16754.7, num_updates=37400, lr=0.000327035, gnorm=0.169, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=34627 epoch 023: 299 / 1689 loss=3.7, nll_loss=2.165, ppl=4.49, wps=546358, ups=1.1, wpb=496656, bsz=16754.7, num_updates=37400, lr=0.000327035, gnorm=0.169, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=34627 epoch 023: 299 / 1689 loss=3.7, nll_loss=2.165, ppl=4.49, wps=546358, ups=1.1, wpb=496656, bsz=16754.7, num_updates=37400, lr=0.000327035, gnorm=0.169, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=34627 epoch 023: 299 / 1689 loss=3.7, nll_loss=2.165, ppl=4.49, wps=546358, ups=1.1, wpb=496656, bsz=16754.7, num_updates=37400, lr=0.000327035, gnorm=0.169, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=34627 epoch 023: 299 / 1689 loss=3.7, nll_loss=2.165, ppl=4.49, wps=546358, ups=1.1, wpb=496656, bsz=16754.7, num_updates=37400, lr=0.000327035, gnorm=0.169, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=34627 epoch 023: 399 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=546515, ups=1.1, wpb=495958, bsz=16549, num_updates=37500, lr=0.000326599, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=34718 epoch 023: 399 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=546515, ups=1.1, wpb=495958, bsz=16549, num_updates=37500, lr=0.000326599, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=34718 epoch 023: 399 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=546515, ups=1.1, wpb=495958, bsz=16549, num_updates=37500, lr=0.000326599, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=34718 epoch 023: 399 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=546515, ups=1.1, wpb=495958, bsz=16549, num_updates=37500, lr=0.000326599, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=34718 epoch 023: 399 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=546515, ups=1.1, wpb=495958, bsz=16549, num_updates=37500, lr=0.000326599, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=34718 epoch 023: 399 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=546515, ups=1.1, wpb=495958, bsz=16549, num_updates=37500, lr=0.000326599, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=34718 epoch 023: 399 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=546515, ups=1.1, wpb=495958, bsz=16549, num_updates=37500, lr=0.000326599, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=34718 epoch 023: 399 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=546515, ups=1.1, wpb=495958, bsz=16549, num_updates=37500, lr=0.000326599, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=34718 epoch 023: 399 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=546515, ups=1.1, wpb=495958, bsz=16549, num_updates=37500, lr=0.000326599, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=34718 epoch 023: 399 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=546515, ups=1.1, wpb=495958, bsz=16549, num_updates=37500, lr=0.000326599, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=34718 epoch 023: 399 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=546515, ups=1.1, wpb=495958, bsz=16549, num_updates=37500, lr=0.000326599, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=34718 epoch 023: 399 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=546515, ups=1.1, wpb=495958, bsz=16549, num_updates=37500, lr=0.000326599, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=34718 epoch 023: 399 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=546515, ups=1.1, wpb=495958, bsz=16549, num_updates=37500, lr=0.000326599, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=34718 epoch 023: 399 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=546515, ups=1.1, wpb=495958, bsz=16549, num_updates=37500, lr=0.000326599, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=34718 epoch 023: 399 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=546515, ups=1.1, wpb=495958, bsz=16549, num_updates=37500, lr=0.000326599, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=34718 epoch 023: 399 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=546515, ups=1.1, wpb=495958, bsz=16549, num_updates=37500, lr=0.000326599, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=34718 epoch 023: 399 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=546515, ups=1.1, wpb=495958, bsz=16549, num_updates=37500, lr=0.000326599, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=34718 epoch 023: 399 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=546515, ups=1.1, wpb=495958, bsz=16549, num_updates=37500, lr=0.000326599, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=34718 epoch 023: 399 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=546515, ups=1.1, wpb=495958, bsz=16549, num_updates=37500, lr=0.000326599, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=34718 epoch 023: 399 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=546515, ups=1.1, wpb=495958, bsz=16549, num_updates=37500, lr=0.000326599, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=34718 epoch 023: 399 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=546515, ups=1.1, wpb=495958, bsz=16549, num_updates=37500, lr=0.000326599, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=34718 epoch 023: 399 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=546515, ups=1.1, wpb=495958, bsz=16549, num_updates=37500, lr=0.000326599, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=34718 epoch 023: 399 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=546515, ups=1.1, wpb=495958, bsz=16549, num_updates=37500, lr=0.000326599, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=34718 epoch 023: 499 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=549001, ups=1.11, wpb=495956, bsz=16815.4, num_updates=37600, lr=0.000326164, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=34808 epoch 023: 499 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=549001, ups=1.11, wpb=495956, bsz=16815.4, num_updates=37600, lr=0.000326164, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=34808 epoch 023: 499 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=549001, ups=1.11, wpb=495956, bsz=16815.4, num_updates=37600, lr=0.000326164, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=34808 epoch 023: 499 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=549001, ups=1.11, wpb=495956, bsz=16815.4, num_updates=37600, lr=0.000326164, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=34808 epoch 023: 499 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=549001, ups=1.11, wpb=495956, bsz=16815.4, num_updates=37600, lr=0.000326164, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=34808 epoch 023: 499 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=549001, ups=1.11, wpb=495956, bsz=16815.4, num_updates=37600, lr=0.000326164, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=34808 epoch 023: 499 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=549001, ups=1.11, wpb=495956, bsz=16815.4, num_updates=37600, lr=0.000326164, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=34808 epoch 023: 499 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=549001, ups=1.11, wpb=495956, bsz=16815.4, num_updates=37600, lr=0.000326164, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=34808 epoch 023: 499 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=549001, ups=1.11, wpb=495956, bsz=16815.4, num_updates=37600, lr=0.000326164, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=34808 epoch 023: 499 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=549001, ups=1.11, wpb=495956, bsz=16815.4, num_updates=37600, lr=0.000326164, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=34808 epoch 023: 499 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=549001, ups=1.11, wpb=495956, bsz=16815.4, num_updates=37600, lr=0.000326164, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=34808 epoch 023: 499 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=549001, ups=1.11, wpb=495956, bsz=16815.4, num_updates=37600, lr=0.000326164, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=34808 epoch 023: 499 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=549001, ups=1.11, wpb=495956, bsz=16815.4, num_updates=37600, lr=0.000326164, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=34808 epoch 023: 499 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=549001, ups=1.11, wpb=495956, bsz=16815.4, num_updates=37600, lr=0.000326164, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=34808 epoch 023: 499 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=549001, ups=1.11, wpb=495956, bsz=16815.4, num_updates=37600, lr=0.000326164, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=34808 epoch 023: 499 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=549001, ups=1.11, wpb=495956, bsz=16815.4, num_updates=37600, lr=0.000326164, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=34808 epoch 023: 499 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=549001, ups=1.11, wpb=495956, bsz=16815.4, num_updates=37600, lr=0.000326164, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=34808 epoch 023: 499 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=549001, ups=1.11, wpb=495956, bsz=16815.4, num_updates=37600, lr=0.000326164, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=34808 epoch 023: 499 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=549001, ups=1.11, wpb=495956, bsz=16815.4, num_updates=37600, lr=0.000326164, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=34808 epoch 023: 499 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=549001, ups=1.11, wpb=495956, bsz=16815.4, num_updates=37600, lr=0.000326164, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=34808 epoch 023: 499 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=549001, ups=1.11, wpb=495956, bsz=16815.4, num_updates=37600, lr=0.000326164, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=34808 epoch 023: 499 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=549001, ups=1.11, wpb=495956, bsz=16815.4, num_updates=37600, lr=0.000326164, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=34808 epoch 023: 499 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=549001, ups=1.11, wpb=495956, bsz=16815.4, num_updates=37600, lr=0.000326164, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=34808 epoch 023: 599 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=545511, ups=1.1, wpb=494195, bsz=16167.1, num_updates=37700, lr=0.000325731, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=34899 epoch 023: 599 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=545511, ups=1.1, wpb=494195, bsz=16167.1, num_updates=37700, lr=0.000325731, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=34899 epoch 023: 599 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=545511, ups=1.1, wpb=494195, bsz=16167.1, num_updates=37700, lr=0.000325731, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=34899 epoch 023: 599 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=545511, ups=1.1, wpb=494195, bsz=16167.1, num_updates=37700, lr=0.000325731, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=34899 epoch 023: 599 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=545511, ups=1.1, wpb=494195, bsz=16167.1, num_updates=37700, lr=0.000325731, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=34899 epoch 023: 599 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=545511, ups=1.1, wpb=494195, bsz=16167.1, num_updates=37700, lr=0.000325731, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=34899 epoch 023: 599 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=545511, ups=1.1, wpb=494195, bsz=16167.1, num_updates=37700, lr=0.000325731, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=34899 epoch 023: 599 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=545511, ups=1.1, wpb=494195, bsz=16167.1, num_updates=37700, lr=0.000325731, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=34899 epoch 023: 599 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=545511, ups=1.1, wpb=494195, bsz=16167.1, num_updates=37700, lr=0.000325731, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=34899 epoch 023: 599 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=545511, ups=1.1, wpb=494195, bsz=16167.1, num_updates=37700, lr=0.000325731, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=34899 epoch 023: 599 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=545511, ups=1.1, wpb=494195, bsz=16167.1, num_updates=37700, lr=0.000325731, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=34899 epoch 023: 599 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=545511, ups=1.1, wpb=494195, bsz=16167.1, num_updates=37700, lr=0.000325731, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=34899 epoch 023: 599 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=545511, ups=1.1, wpb=494195, bsz=16167.1, num_updates=37700, lr=0.000325731, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=34899 epoch 023: 599 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=545511, ups=1.1, wpb=494195, bsz=16167.1, num_updates=37700, lr=0.000325731, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=34899 epoch 023: 599 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=545511, ups=1.1, wpb=494195, bsz=16167.1, num_updates=37700, lr=0.000325731, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=34899 epoch 023: 599 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=545511, ups=1.1, wpb=494195, bsz=16167.1, num_updates=37700, lr=0.000325731, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=34899 epoch 023: 599 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=545511, ups=1.1, wpb=494195, bsz=16167.1, num_updates=37700, lr=0.000325731, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=34899 epoch 023: 599 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=545511, ups=1.1, wpb=494195, bsz=16167.1, num_updates=37700, lr=0.000325731, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=34899 epoch 023: 599 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=545511, ups=1.1, wpb=494195, bsz=16167.1, num_updates=37700, lr=0.000325731, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=34899 epoch 023: 599 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=545511, ups=1.1, wpb=494195, bsz=16167.1, num_updates=37700, lr=0.000325731, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=34899 epoch 023: 599 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=545511, ups=1.1, wpb=494195, bsz=16167.1, num_updates=37700, lr=0.000325731, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=34899 epoch 023: 599 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=545511, ups=1.1, wpb=494195, bsz=16167.1, num_updates=37700, lr=0.000325731, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=34899 epoch 023: 599 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=545511, ups=1.1, wpb=494195, bsz=16167.1, num_updates=37700, lr=0.000325731, gnorm=0.178, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=34899 epoch 023: 699 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=554026, ups=1.12, wpb=494300, bsz=16509.4, num_updates=37800, lr=0.0003253, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=34988 epoch 023: 699 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=554026, ups=1.12, wpb=494300, bsz=16509.4, num_updates=37800, lr=0.0003253, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=34988 epoch 023: 699 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=554026, ups=1.12, wpb=494300, bsz=16509.4, num_updates=37800, lr=0.0003253, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=34988 epoch 023: 699 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=554026, ups=1.12, wpb=494300, bsz=16509.4, num_updates=37800, lr=0.0003253, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=34988 epoch 023: 699 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=554026, ups=1.12, wpb=494300, bsz=16509.4, num_updates=37800, lr=0.0003253, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=34988 epoch 023: 699 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=554026, ups=1.12, wpb=494300, bsz=16509.4, num_updates=37800, lr=0.0003253, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=34988 epoch 023: 699 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=554026, ups=1.12, wpb=494300, bsz=16509.4, num_updates=37800, lr=0.0003253, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=34988 epoch 023: 699 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=554026, ups=1.12, wpb=494300, bsz=16509.4, num_updates=37800, lr=0.0003253, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=34988 epoch 023: 699 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=554026, ups=1.12, wpb=494300, bsz=16509.4, num_updates=37800, lr=0.0003253, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=34988 epoch 023: 699 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=554026, ups=1.12, wpb=494300, bsz=16509.4, num_updates=37800, lr=0.0003253, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=34988 epoch 023: 699 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=554026, ups=1.12, wpb=494300, bsz=16509.4, num_updates=37800, lr=0.0003253, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=34988 epoch 023: 699 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=554026, ups=1.12, wpb=494300, bsz=16509.4, num_updates=37800, lr=0.0003253, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=34988 epoch 023: 699 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=554026, ups=1.12, wpb=494300, bsz=16509.4, num_updates=37800, lr=0.0003253, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=34988 epoch 023: 699 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=554026, ups=1.12, wpb=494300, bsz=16509.4, num_updates=37800, lr=0.0003253, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=34988 epoch 023: 699 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=554026, ups=1.12, wpb=494300, bsz=16509.4, num_updates=37800, lr=0.0003253, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=34988 epoch 023: 699 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=554026, ups=1.12, wpb=494300, bsz=16509.4, num_updates=37800, lr=0.0003253, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=34988 epoch 023: 699 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=554026, ups=1.12, wpb=494300, bsz=16509.4, num_updates=37800, lr=0.0003253, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=34988 epoch 023: 699 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=554026, ups=1.12, wpb=494300, bsz=16509.4, num_updates=37800, lr=0.0003253, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=34988 epoch 023: 699 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=554026, ups=1.12, wpb=494300, bsz=16509.4, num_updates=37800, lr=0.0003253, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=34988 epoch 023: 699 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=554026, ups=1.12, wpb=494300, bsz=16509.4, num_updates=37800, lr=0.0003253, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=34988 epoch 023: 699 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=554026, ups=1.12, wpb=494300, bsz=16509.4, num_updates=37800, lr=0.0003253, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=34988 epoch 023: 699 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=554026, ups=1.12, wpb=494300, bsz=16509.4, num_updates=37800, lr=0.0003253, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=34988 epoch 023: 699 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=554026, ups=1.12, wpb=494300, bsz=16509.4, num_updates=37800, lr=0.0003253, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=34988 epoch 023: 799 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=550802, ups=1.11, wpb=496251, bsz=16442.1, num_updates=37900, lr=0.000324871, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=35078 epoch 023: 799 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=550802, ups=1.11, wpb=496251, bsz=16442.1, num_updates=37900, lr=0.000324871, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=35078 epoch 023: 799 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=550802, ups=1.11, wpb=496251, bsz=16442.1, num_updates=37900, lr=0.000324871, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=35078 epoch 023: 799 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=550802, ups=1.11, wpb=496251, bsz=16442.1, num_updates=37900, lr=0.000324871, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=35078 epoch 023: 799 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=550802, ups=1.11, wpb=496251, bsz=16442.1, num_updates=37900, lr=0.000324871, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=35078 epoch 023: 799 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=550802, ups=1.11, wpb=496251, bsz=16442.1, num_updates=37900, lr=0.000324871, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=35078 epoch 023: 799 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=550802, ups=1.11, wpb=496251, bsz=16442.1, num_updates=37900, lr=0.000324871, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=35078 epoch 023: 799 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=550802, ups=1.11, wpb=496251, bsz=16442.1, num_updates=37900, lr=0.000324871, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=35078 epoch 023: 799 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=550802, ups=1.11, wpb=496251, bsz=16442.1, num_updates=37900, lr=0.000324871, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=35078 epoch 023: 799 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=550802, ups=1.11, wpb=496251, bsz=16442.1, num_updates=37900, lr=0.000324871, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=35078 epoch 023: 799 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=550802, ups=1.11, wpb=496251, bsz=16442.1, num_updates=37900, lr=0.000324871, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=35078 epoch 023: 799 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=550802, ups=1.11, wpb=496251, bsz=16442.1, num_updates=37900, lr=0.000324871, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=35078 epoch 023: 799 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=550802, ups=1.11, wpb=496251, bsz=16442.1, num_updates=37900, lr=0.000324871, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=35078 epoch 023: 799 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=550802, ups=1.11, wpb=496251, bsz=16442.1, num_updates=37900, lr=0.000324871, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=35078 epoch 023: 799 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=550802, ups=1.11, wpb=496251, bsz=16442.1, num_updates=37900, lr=0.000324871, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=35078 epoch 023: 799 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=550802, ups=1.11, wpb=496251, bsz=16442.1, num_updates=37900, lr=0.000324871, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=35078 epoch 023: 799 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=550802, ups=1.11, wpb=496251, bsz=16442.1, num_updates=37900, lr=0.000324871, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=35078 epoch 023: 799 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=550802, ups=1.11, wpb=496251, bsz=16442.1, num_updates=37900, lr=0.000324871, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=35078 epoch 023: 799 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=550802, ups=1.11, wpb=496251, bsz=16442.1, num_updates=37900, lr=0.000324871, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=35078 epoch 023: 799 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=550802, ups=1.11, wpb=496251, bsz=16442.1, num_updates=37900, lr=0.000324871, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=35078 epoch 023: 799 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=550802, ups=1.11, wpb=496251, bsz=16442.1, num_updates=37900, lr=0.000324871, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=35078 epoch 023: 799 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=550802, ups=1.11, wpb=496251, bsz=16442.1, num_updates=37900, lr=0.000324871, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=35078 epoch 023: 799 / 1689 loss=3.705, nll_loss=2.171, ppl=4.5, wps=550802, ups=1.11, wpb=496251, bsz=16442.1, num_updates=37900, lr=0.000324871, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=35078 epoch 023: 899 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=550504, ups=1.11, wpb=495077, bsz=16920.9, num_updates=38000, lr=0.000324443, gnorm=0.178, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=35168 epoch 023: 899 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=550504, ups=1.11, wpb=495077, bsz=16920.9, num_updates=38000, lr=0.000324443, gnorm=0.178, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=35168 epoch 023: 899 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=550504, ups=1.11, wpb=495077, bsz=16920.9, num_updates=38000, lr=0.000324443, gnorm=0.178, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=35168 epoch 023: 899 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=550504, ups=1.11, wpb=495077, bsz=16920.9, num_updates=38000, lr=0.000324443, gnorm=0.178, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=35168 epoch 023: 899 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=550504, ups=1.11, wpb=495077, bsz=16920.9, num_updates=38000, lr=0.000324443, gnorm=0.178, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=35168 epoch 023: 899 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=550504, ups=1.11, wpb=495077, bsz=16920.9, num_updates=38000, lr=0.000324443, gnorm=0.178, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=35168 epoch 023: 899 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=550504, ups=1.11, wpb=495077, bsz=16920.9, num_updates=38000, lr=0.000324443, gnorm=0.178, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=35168 epoch 023: 899 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=550504, ups=1.11, wpb=495077, bsz=16920.9, num_updates=38000, lr=0.000324443, gnorm=0.178, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=35168 epoch 023: 899 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=550504, ups=1.11, wpb=495077, bsz=16920.9, num_updates=38000, lr=0.000324443, gnorm=0.178, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=35168 epoch 023: 899 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=550504, ups=1.11, wpb=495077, bsz=16920.9, num_updates=38000, lr=0.000324443, gnorm=0.178, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=35168 epoch 023: 899 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=550504, ups=1.11, wpb=495077, bsz=16920.9, num_updates=38000, lr=0.000324443, gnorm=0.178, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=35168 epoch 023: 899 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=550504, ups=1.11, wpb=495077, bsz=16920.9, num_updates=38000, lr=0.000324443, gnorm=0.178, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=35168 epoch 023: 899 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=550504, ups=1.11, wpb=495077, bsz=16920.9, num_updates=38000, lr=0.000324443, gnorm=0.178, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=35168 epoch 023: 899 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=550504, ups=1.11, wpb=495077, bsz=16920.9, num_updates=38000, lr=0.000324443, gnorm=0.178, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=35168 epoch 023: 899 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=550504, ups=1.11, wpb=495077, bsz=16920.9, num_updates=38000, lr=0.000324443, gnorm=0.178, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=35168 epoch 023: 899 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=550504, ups=1.11, wpb=495077, bsz=16920.9, num_updates=38000, lr=0.000324443, gnorm=0.178, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=35168 epoch 023: 899 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=550504, ups=1.11, wpb=495077, bsz=16920.9, num_updates=38000, lr=0.000324443, gnorm=0.178, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=35168 epoch 023: 899 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=550504, ups=1.11, wpb=495077, bsz=16920.9, num_updates=38000, lr=0.000324443, gnorm=0.178, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=35168 epoch 023: 899 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=550504, ups=1.11, wpb=495077, bsz=16920.9, num_updates=38000, lr=0.000324443, gnorm=0.178, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=35168 epoch 023: 899 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=550504, ups=1.11, wpb=495077, bsz=16920.9, num_updates=38000, lr=0.000324443, gnorm=0.178, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=35168 epoch 023: 899 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=550504, ups=1.11, wpb=495077, bsz=16920.9, num_updates=38000, lr=0.000324443, gnorm=0.178, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=35168 epoch 023: 899 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=550504, ups=1.11, wpb=495077, bsz=16920.9, num_updates=38000, lr=0.000324443, gnorm=0.178, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=35168 epoch 023: 899 / 1689 loss=3.709, nll_loss=2.176, ppl=4.52, wps=550504, ups=1.11, wpb=495077, bsz=16920.9, num_updates=38000, lr=0.000324443, gnorm=0.178, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=35168 begin validation on "valid" subset epoch 023 | valid on 'valid' subset | loss 3.72 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.72 epoch 023 | valid on 'valid' subset | loss 3.72 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.72 epoch 023 | valid on 'valid' subset | loss 3.72 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.72 epoch 023 | valid on 'valid' subset | loss 3.72 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.72 epoch 023 | valid on 'valid' subset | loss 3.72 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.72 epoch 023 | valid on 'valid' subset | loss 3.72 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.72 epoch 023 | valid on 'valid' subset | loss 3.72 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.72 epoch 023 | valid on 'valid' subset | loss 3.72 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.72 epoch 023 | valid on 'valid' subset | loss 3.72 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.72 epoch 023 | valid on 'valid' subset | loss 3.72 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.72 epoch 023 | valid on 'valid' subset | loss 3.72 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.72 epoch 023 | valid on 'valid' subset | loss 3.72 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.72 epoch 023 | valid on 'valid' subset | loss 3.72 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.72 epoch 023 | valid on 'valid' subset | loss 3.72 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.72 epoch 023 | valid on 'valid' subset | loss 3.72 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.72 epoch 023 | valid on 'valid' subset | loss 3.72 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.72 epoch 023 | valid on 'valid' subset | loss 3.72 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.72 epoch 023 | valid on 'valid' subset | loss 3.72 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.72 epoch 023 | valid on 'valid' subset | loss 3.72 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.72 epoch 023 | valid on 'valid' subset | loss 3.72 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.72 epoch 023 | valid on 'valid' subset | loss 3.72 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.72 epoch 023 | valid on 'valid' subset | loss 3.72 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.72 epoch 023 | valid on 'valid' subset | loss 3.72 | nll_loss 2.157 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 38000 | best_loss 3.72 epoch 023: 999 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=454015, ups=0.92, wpb=495196, bsz=16232.3, num_updates=38100, lr=0.000324017, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=35277 epoch 023: 999 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=454015, ups=0.92, wpb=495196, bsz=16232.3, num_updates=38100, lr=0.000324017, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=35277 epoch 023: 999 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=454015, ups=0.92, wpb=495196, bsz=16232.3, num_updates=38100, lr=0.000324017, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=35277 epoch 023: 999 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=454015, ups=0.92, wpb=495196, bsz=16232.3, num_updates=38100, lr=0.000324017, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=35277 epoch 023: 999 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=454015, ups=0.92, wpb=495196, bsz=16232.3, num_updates=38100, lr=0.000324017, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=35277 epoch 023: 999 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=454015, ups=0.92, wpb=495196, bsz=16232.3, num_updates=38100, lr=0.000324017, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=35277 epoch 023: 999 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=454015, ups=0.92, wpb=495196, bsz=16232.3, num_updates=38100, lr=0.000324017, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=35277 epoch 023: 999 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=454015, ups=0.92, wpb=495196, bsz=16232.3, num_updates=38100, lr=0.000324017, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=35277 epoch 023: 999 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=454015, ups=0.92, wpb=495196, bsz=16232.3, num_updates=38100, lr=0.000324017, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=35277 epoch 023: 999 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=454015, ups=0.92, wpb=495196, bsz=16232.3, num_updates=38100, lr=0.000324017, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=35277 epoch 023: 999 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=454015, ups=0.92, wpb=495196, bsz=16232.3, num_updates=38100, lr=0.000324017, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=35277 epoch 023: 999 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=454015, ups=0.92, wpb=495196, bsz=16232.3, num_updates=38100, lr=0.000324017, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=35277 epoch 023: 999 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=454015, ups=0.92, wpb=495196, bsz=16232.3, num_updates=38100, lr=0.000324017, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=35277 epoch 023: 999 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=454015, ups=0.92, wpb=495196, bsz=16232.3, num_updates=38100, lr=0.000324017, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=35277 epoch 023: 999 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=454015, ups=0.92, wpb=495196, bsz=16232.3, num_updates=38100, lr=0.000324017, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=35277 epoch 023: 999 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=454015, ups=0.92, wpb=495196, bsz=16232.3, num_updates=38100, lr=0.000324017, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=35277 epoch 023: 999 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=454015, ups=0.92, wpb=495196, bsz=16232.3, num_updates=38100, lr=0.000324017, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=35277 epoch 023: 999 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=454015, ups=0.92, wpb=495196, bsz=16232.3, num_updates=38100, lr=0.000324017, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=35277 epoch 023: 999 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=454015, ups=0.92, wpb=495196, bsz=16232.3, num_updates=38100, lr=0.000324017, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=35277 epoch 023: 999 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=454015, ups=0.92, wpb=495196, bsz=16232.3, num_updates=38100, lr=0.000324017, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=35277 epoch 023: 999 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=454015, ups=0.92, wpb=495196, bsz=16232.3, num_updates=38100, lr=0.000324017, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=35277 epoch 023: 999 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=454015, ups=0.92, wpb=495196, bsz=16232.3, num_updates=38100, lr=0.000324017, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=35277 epoch 023: 999 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=454015, ups=0.92, wpb=495196, bsz=16232.3, num_updates=38100, lr=0.000324017, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=35277 epoch 023: 1099 / 1689 loss=3.71, nll_loss=2.177, ppl=4.52, wps=555616, ups=1.12, wpb=494502, bsz=16338.2, num_updates=38200, lr=0.000323592, gnorm=0.181, clip=0, loss_scale=4, train_wall=87, gb_free=22.7, wall=35366 epoch 023: 1099 / 1689 loss=3.71, nll_loss=2.177, ppl=4.52, wps=555616, ups=1.12, wpb=494502, bsz=16338.2, num_updates=38200, lr=0.000323592, gnorm=0.181, clip=0, loss_scale=4, train_wall=87, gb_free=22.7, wall=35366 epoch 023: 1099 / 1689 loss=3.71, nll_loss=2.177, ppl=4.52, wps=555616, ups=1.12, wpb=494502, bsz=16338.2, num_updates=38200, lr=0.000323592, gnorm=0.181, clip=0, loss_scale=4, train_wall=87, gb_free=22.7, wall=35366 epoch 023: 1099 / 1689 loss=3.71, nll_loss=2.177, ppl=4.52, wps=555616, ups=1.12, wpb=494502, bsz=16338.2, num_updates=38200, lr=0.000323592, gnorm=0.181, clip=0, loss_scale=4, train_wall=87, gb_free=22.7, wall=35366 epoch 023: 1099 / 1689 loss=3.71, nll_loss=2.177, ppl=4.52, wps=555616, ups=1.12, wpb=494502, bsz=16338.2, num_updates=38200, lr=0.000323592, gnorm=0.181, clip=0, loss_scale=4, train_wall=87, gb_free=22.7, wall=35366 epoch 023: 1099 / 1689 loss=3.71, nll_loss=2.177, ppl=4.52, wps=555616, ups=1.12, wpb=494502, bsz=16338.2, num_updates=38200, lr=0.000323592, gnorm=0.181, clip=0, loss_scale=4, train_wall=87, gb_free=22.7, wall=35366 epoch 023: 1099 / 1689 loss=3.71, nll_loss=2.177, ppl=4.52, wps=555616, ups=1.12, wpb=494502, bsz=16338.2, num_updates=38200, lr=0.000323592, gnorm=0.181, clip=0, loss_scale=4, train_wall=87, gb_free=22.7, wall=35366 epoch 023: 1099 / 1689 loss=3.71, nll_loss=2.177, ppl=4.52, wps=555616, ups=1.12, wpb=494502, bsz=16338.2, num_updates=38200, lr=0.000323592, gnorm=0.181, clip=0, loss_scale=4, train_wall=87, gb_free=22.7, wall=35366 epoch 023: 1099 / 1689 loss=3.71, nll_loss=2.177, ppl=4.52, wps=555616, ups=1.12, wpb=494502, bsz=16338.2, num_updates=38200, lr=0.000323592, gnorm=0.181, clip=0, loss_scale=4, train_wall=87, gb_free=22.7, wall=35366 epoch 023: 1099 / 1689 loss=3.71, nll_loss=2.177, ppl=4.52, wps=555616, ups=1.12, wpb=494502, bsz=16338.2, num_updates=38200, lr=0.000323592, gnorm=0.181, clip=0, loss_scale=4, train_wall=87, gb_free=22.7, wall=35366 epoch 023: 1099 / 1689 loss=3.71, nll_loss=2.177, ppl=4.52, wps=555616, ups=1.12, wpb=494502, bsz=16338.2, num_updates=38200, lr=0.000323592, gnorm=0.181, clip=0, loss_scale=4, train_wall=87, gb_free=22.7, wall=35366 epoch 023: 1099 / 1689 loss=3.71, nll_loss=2.177, ppl=4.52, wps=555616, ups=1.12, wpb=494502, bsz=16338.2, num_updates=38200, lr=0.000323592, gnorm=0.181, clip=0, loss_scale=4, train_wall=87, gb_free=22.7, wall=35366 epoch 023: 1099 / 1689 loss=3.71, nll_loss=2.177, ppl=4.52, wps=555616, ups=1.12, wpb=494502, bsz=16338.2, num_updates=38200, lr=0.000323592, gnorm=0.181, clip=0, loss_scale=4, train_wall=87, gb_free=22.7, wall=35366 epoch 023: 1099 / 1689 loss=3.71, nll_loss=2.177, ppl=4.52, wps=555616, ups=1.12, wpb=494502, bsz=16338.2, num_updates=38200, lr=0.000323592, gnorm=0.181, clip=0, loss_scale=4, train_wall=87, gb_free=22.7, wall=35366 epoch 023: 1099 / 1689 loss=3.71, nll_loss=2.177, ppl=4.52, wps=555616, ups=1.12, wpb=494502, bsz=16338.2, num_updates=38200, lr=0.000323592, gnorm=0.181, clip=0, loss_scale=4, train_wall=87, gb_free=22.7, wall=35366 epoch 023: 1099 / 1689 loss=3.71, nll_loss=2.177, ppl=4.52, wps=555616, ups=1.12, wpb=494502, bsz=16338.2, num_updates=38200, lr=0.000323592, gnorm=0.181, clip=0, loss_scale=4, train_wall=87, gb_free=22.7, wall=35366 epoch 023: 1099 / 1689 loss=3.71, nll_loss=2.177, ppl=4.52, wps=555616, ups=1.12, wpb=494502, bsz=16338.2, num_updates=38200, lr=0.000323592, gnorm=0.181, clip=0, loss_scale=4, train_wall=87, gb_free=22.7, wall=35366 epoch 023: 1099 / 1689 loss=3.71, nll_loss=2.177, ppl=4.52, wps=555616, ups=1.12, wpb=494502, bsz=16338.2, num_updates=38200, lr=0.000323592, gnorm=0.181, clip=0, loss_scale=4, train_wall=87, gb_free=22.7, wall=35366 epoch 023: 1099 / 1689 loss=3.71, nll_loss=2.177, ppl=4.52, wps=555616, ups=1.12, wpb=494502, bsz=16338.2, num_updates=38200, lr=0.000323592, gnorm=0.181, clip=0, loss_scale=4, train_wall=87, gb_free=22.7, wall=35366 epoch 023: 1099 / 1689 loss=3.71, nll_loss=2.177, ppl=4.52, wps=555616, ups=1.12, wpb=494502, bsz=16338.2, num_updates=38200, lr=0.000323592, gnorm=0.181, clip=0, loss_scale=4, train_wall=87, gb_free=22.7, wall=35366 epoch 023: 1099 / 1689 loss=3.71, nll_loss=2.177, ppl=4.52, wps=555616, ups=1.12, wpb=494502, bsz=16338.2, num_updates=38200, lr=0.000323592, gnorm=0.181, clip=0, loss_scale=4, train_wall=87, gb_free=22.7, wall=35366 epoch 023: 1099 / 1689 loss=3.71, nll_loss=2.177, ppl=4.52, wps=555616, ups=1.12, wpb=494502, bsz=16338.2, num_updates=38200, lr=0.000323592, gnorm=0.181, clip=0, loss_scale=4, train_wall=87, gb_free=22.7, wall=35366 epoch 023: 1099 / 1689 loss=3.71, nll_loss=2.177, ppl=4.52, wps=555616, ups=1.12, wpb=494502, bsz=16338.2, num_updates=38200, lr=0.000323592, gnorm=0.181, clip=0, loss_scale=4, train_wall=87, gb_free=22.7, wall=35366 epoch 023: 1200 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=539857, ups=1.09, wpb=495512, bsz=16238.6, num_updates=38300, lr=0.00032317, gnorm=0.177, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=35458 epoch 023: 1200 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=539857, ups=1.09, wpb=495512, bsz=16238.6, num_updates=38300, lr=0.00032317, gnorm=0.177, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=35458 epoch 023: 1200 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=539857, ups=1.09, wpb=495512, bsz=16238.6, num_updates=38300, lr=0.00032317, gnorm=0.177, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=35458 epoch 023: 1200 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=539857, ups=1.09, wpb=495512, bsz=16238.6, num_updates=38300, lr=0.00032317, gnorm=0.177, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=35458 epoch 023: 1200 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=539857, ups=1.09, wpb=495512, bsz=16238.6, num_updates=38300, lr=0.00032317, gnorm=0.177, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=35458 epoch 023: 1200 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=539857, ups=1.09, wpb=495512, bsz=16238.6, num_updates=38300, lr=0.00032317, gnorm=0.177, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=35458 epoch 023: 1200 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=539857, ups=1.09, wpb=495512, bsz=16238.6, num_updates=38300, lr=0.00032317, gnorm=0.177, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=35458 epoch 023: 1200 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=539857, ups=1.09, wpb=495512, bsz=16238.6, num_updates=38300, lr=0.00032317, gnorm=0.177, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=35458 epoch 023: 1200 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=539857, ups=1.09, wpb=495512, bsz=16238.6, num_updates=38300, lr=0.00032317, gnorm=0.177, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=35458 epoch 023: 1200 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=539857, ups=1.09, wpb=495512, bsz=16238.6, num_updates=38300, lr=0.00032317, gnorm=0.177, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=35458 epoch 023: 1200 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=539857, ups=1.09, wpb=495512, bsz=16238.6, num_updates=38300, lr=0.00032317, gnorm=0.177, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=35458 epoch 023: 1200 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=539857, ups=1.09, wpb=495512, bsz=16238.6, num_updates=38300, lr=0.00032317, gnorm=0.177, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=35458 epoch 023: 1200 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=539857, ups=1.09, wpb=495512, bsz=16238.6, num_updates=38300, lr=0.00032317, gnorm=0.177, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=35458 epoch 023: 1200 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=539857, ups=1.09, wpb=495512, bsz=16238.6, num_updates=38300, lr=0.00032317, gnorm=0.177, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=35458 epoch 023: 1200 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=539857, ups=1.09, wpb=495512, bsz=16238.6, num_updates=38300, lr=0.00032317, gnorm=0.177, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=35458 epoch 023: 1200 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=539857, ups=1.09, wpb=495512, bsz=16238.6, num_updates=38300, lr=0.00032317, gnorm=0.177, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=35458 epoch 023: 1200 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=539857, ups=1.09, wpb=495512, bsz=16238.6, num_updates=38300, lr=0.00032317, gnorm=0.177, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=35458 epoch 023: 1200 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=539857, ups=1.09, wpb=495512, bsz=16238.6, num_updates=38300, lr=0.00032317, gnorm=0.177, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=35458 epoch 023: 1200 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=539857, ups=1.09, wpb=495512, bsz=16238.6, num_updates=38300, lr=0.00032317, gnorm=0.177, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=35458 epoch 023: 1200 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=539857, ups=1.09, wpb=495512, bsz=16238.6, num_updates=38300, lr=0.00032317, gnorm=0.177, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=35458 epoch 023: 1200 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=539857, ups=1.09, wpb=495512, bsz=16238.6, num_updates=38300, lr=0.00032317, gnorm=0.177, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=35458 epoch 023: 1200 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=539857, ups=1.09, wpb=495512, bsz=16238.6, num_updates=38300, lr=0.00032317, gnorm=0.177, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=35458 epoch 023: 1200 / 1689 loss=3.704, nll_loss=2.17, ppl=4.5, wps=539857, ups=1.09, wpb=495512, bsz=16238.6, num_updates=38300, lr=0.00032317, gnorm=0.177, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=35458 epoch 023: 1300 / 1689 loss=3.708, nll_loss=2.175, ppl=4.52, wps=552927, ups=1.12, wpb=495296, bsz=16465.5, num_updates=38400, lr=0.000322749, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=35547 epoch 023: 1300 / 1689 loss=3.708, nll_loss=2.175, ppl=4.52, wps=552927, ups=1.12, wpb=495296, bsz=16465.5, num_updates=38400, lr=0.000322749, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=35547 epoch 023: 1300 / 1689 loss=3.708, nll_loss=2.175, ppl=4.52, wps=552927, ups=1.12, wpb=495296, bsz=16465.5, num_updates=38400, lr=0.000322749, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=35547 epoch 023: 1300 / 1689 loss=3.708, nll_loss=2.175, ppl=4.52, wps=552927, ups=1.12, wpb=495296, bsz=16465.5, num_updates=38400, lr=0.000322749, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=35547 epoch 023: 1300 / 1689 loss=3.708, nll_loss=2.175, ppl=4.52, wps=552927, ups=1.12, wpb=495296, bsz=16465.5, num_updates=38400, lr=0.000322749, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=35547 epoch 023: 1300 / 1689 loss=3.708, nll_loss=2.175, ppl=4.52, wps=552927, ups=1.12, wpb=495296, bsz=16465.5, num_updates=38400, lr=0.000322749, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=35547 epoch 023: 1300 / 1689 loss=3.708, nll_loss=2.175, ppl=4.52, wps=552927, ups=1.12, wpb=495296, bsz=16465.5, num_updates=38400, lr=0.000322749, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=35547 epoch 023: 1300 / 1689 loss=3.708, nll_loss=2.175, ppl=4.52, wps=552927, ups=1.12, wpb=495296, bsz=16465.5, num_updates=38400, lr=0.000322749, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=35547 epoch 023: 1300 / 1689 loss=3.708, nll_loss=2.175, ppl=4.52, wps=552927, ups=1.12, wpb=495296, bsz=16465.5, num_updates=38400, lr=0.000322749, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=35547 epoch 023: 1300 / 1689 loss=3.708, nll_loss=2.175, ppl=4.52, wps=552927, ups=1.12, wpb=495296, bsz=16465.5, num_updates=38400, lr=0.000322749, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=35547 epoch 023: 1300 / 1689 loss=3.708, nll_loss=2.175, ppl=4.52, wps=552927, ups=1.12, wpb=495296, bsz=16465.5, num_updates=38400, lr=0.000322749, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=35547 epoch 023: 1300 / 1689 loss=3.708, nll_loss=2.175, ppl=4.52, wps=552927, ups=1.12, wpb=495296, bsz=16465.5, num_updates=38400, lr=0.000322749, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=35547 epoch 023: 1300 / 1689 loss=3.708, nll_loss=2.175, ppl=4.52, wps=552927, ups=1.12, wpb=495296, bsz=16465.5, num_updates=38400, lr=0.000322749, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=35547 epoch 023: 1300 / 1689 loss=3.708, nll_loss=2.175, ppl=4.52, wps=552927, ups=1.12, wpb=495296, bsz=16465.5, num_updates=38400, lr=0.000322749, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=35547 epoch 023: 1300 / 1689 loss=3.708, nll_loss=2.175, ppl=4.52, wps=552927, ups=1.12, wpb=495296, bsz=16465.5, num_updates=38400, lr=0.000322749, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=35547 epoch 023: 1300 / 1689 loss=3.708, nll_loss=2.175, ppl=4.52, wps=552927, ups=1.12, wpb=495296, bsz=16465.5, num_updates=38400, lr=0.000322749, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=35547 epoch 023: 1300 / 1689 loss=3.708, nll_loss=2.175, ppl=4.52, wps=552927, ups=1.12, wpb=495296, bsz=16465.5, num_updates=38400, lr=0.000322749, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=35547 epoch 023: 1300 / 1689 loss=3.708, nll_loss=2.175, ppl=4.52, wps=552927, ups=1.12, wpb=495296, bsz=16465.5, num_updates=38400, lr=0.000322749, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=35547 epoch 023: 1300 / 1689 loss=3.708, nll_loss=2.175, ppl=4.52, wps=552927, ups=1.12, wpb=495296, bsz=16465.5, num_updates=38400, lr=0.000322749, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=35547 epoch 023: 1300 / 1689 loss=3.708, nll_loss=2.175, ppl=4.52, wps=552927, ups=1.12, wpb=495296, bsz=16465.5, num_updates=38400, lr=0.000322749, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=35547 epoch 023: 1300 / 1689 loss=3.708, nll_loss=2.175, ppl=4.52, wps=552927, ups=1.12, wpb=495296, bsz=16465.5, num_updates=38400, lr=0.000322749, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=35547 epoch 023: 1300 / 1689 loss=3.708, nll_loss=2.175, ppl=4.52, wps=552927, ups=1.12, wpb=495296, bsz=16465.5, num_updates=38400, lr=0.000322749, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=35547 epoch 023: 1300 / 1689 loss=3.708, nll_loss=2.175, ppl=4.52, wps=552927, ups=1.12, wpb=495296, bsz=16465.5, num_updates=38400, lr=0.000322749, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=35547 epoch 023: 1400 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=551567, ups=1.12, wpb=494491, bsz=16462.1, num_updates=38500, lr=0.000322329, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=35637 epoch 023: 1400 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=551567, ups=1.12, wpb=494491, bsz=16462.1, num_updates=38500, lr=0.000322329, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=35637 epoch 023: 1400 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=551567, ups=1.12, wpb=494491, bsz=16462.1, num_updates=38500, lr=0.000322329, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=35637 epoch 023: 1400 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=551567, ups=1.12, wpb=494491, bsz=16462.1, num_updates=38500, lr=0.000322329, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=35637 epoch 023: 1400 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=551567, ups=1.12, wpb=494491, bsz=16462.1, num_updates=38500, lr=0.000322329, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=35637 epoch 023: 1400 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=551567, ups=1.12, wpb=494491, bsz=16462.1, num_updates=38500, lr=0.000322329, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=35637 epoch 023: 1400 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=551567, ups=1.12, wpb=494491, bsz=16462.1, num_updates=38500, lr=0.000322329, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=35637 epoch 023: 1400 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=551567, ups=1.12, wpb=494491, bsz=16462.1, num_updates=38500, lr=0.000322329, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=35637 epoch 023: 1400 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=551567, ups=1.12, wpb=494491, bsz=16462.1, num_updates=38500, lr=0.000322329, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=35637 epoch 023: 1400 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=551567, ups=1.12, wpb=494491, bsz=16462.1, num_updates=38500, lr=0.000322329, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=35637 epoch 023: 1400 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=551567, ups=1.12, wpb=494491, bsz=16462.1, num_updates=38500, lr=0.000322329, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=35637 epoch 023: 1400 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=551567, ups=1.12, wpb=494491, bsz=16462.1, num_updates=38500, lr=0.000322329, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=35637 epoch 023: 1400 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=551567, ups=1.12, wpb=494491, bsz=16462.1, num_updates=38500, lr=0.000322329, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=35637 epoch 023: 1400 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=551567, ups=1.12, wpb=494491, bsz=16462.1, num_updates=38500, lr=0.000322329, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=35637 epoch 023: 1400 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=551567, ups=1.12, wpb=494491, bsz=16462.1, num_updates=38500, lr=0.000322329, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=35637 epoch 023: 1400 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=551567, ups=1.12, wpb=494491, bsz=16462.1, num_updates=38500, lr=0.000322329, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=35637 epoch 023: 1400 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=551567, ups=1.12, wpb=494491, bsz=16462.1, num_updates=38500, lr=0.000322329, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=35637 epoch 023: 1400 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=551567, ups=1.12, wpb=494491, bsz=16462.1, num_updates=38500, lr=0.000322329, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=35637 epoch 023: 1400 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=551567, ups=1.12, wpb=494491, bsz=16462.1, num_updates=38500, lr=0.000322329, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=35637 epoch 023: 1400 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=551567, ups=1.12, wpb=494491, bsz=16462.1, num_updates=38500, lr=0.000322329, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=35637 epoch 023: 1400 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=551567, ups=1.12, wpb=494491, bsz=16462.1, num_updates=38500, lr=0.000322329, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=35637 epoch 023: 1400 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=551567, ups=1.12, wpb=494491, bsz=16462.1, num_updates=38500, lr=0.000322329, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=35637 epoch 023: 1400 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=551567, ups=1.12, wpb=494491, bsz=16462.1, num_updates=38500, lr=0.000322329, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=22.6, wall=35637 epoch 023: 1500 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=543070, ups=1.1, wpb=494947, bsz=16514.9, num_updates=38600, lr=0.000321911, gnorm=0.172, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=35728 epoch 023: 1500 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=543070, ups=1.1, wpb=494947, bsz=16514.9, num_updates=38600, lr=0.000321911, gnorm=0.172, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=35728 epoch 023: 1500 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=543070, ups=1.1, wpb=494947, bsz=16514.9, num_updates=38600, lr=0.000321911, gnorm=0.172, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=35728 epoch 023: 1500 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=543070, ups=1.1, wpb=494947, bsz=16514.9, num_updates=38600, lr=0.000321911, gnorm=0.172, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=35728 epoch 023: 1500 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=543070, ups=1.1, wpb=494947, bsz=16514.9, num_updates=38600, lr=0.000321911, gnorm=0.172, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=35728 epoch 023: 1500 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=543070, ups=1.1, wpb=494947, bsz=16514.9, num_updates=38600, lr=0.000321911, gnorm=0.172, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=35728 epoch 023: 1500 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=543070, ups=1.1, wpb=494947, bsz=16514.9, num_updates=38600, lr=0.000321911, gnorm=0.172, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=35728 epoch 023: 1500 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=543070, ups=1.1, wpb=494947, bsz=16514.9, num_updates=38600, lr=0.000321911, gnorm=0.172, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=35728 epoch 023: 1500 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=543070, ups=1.1, wpb=494947, bsz=16514.9, num_updates=38600, lr=0.000321911, gnorm=0.172, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=35728 epoch 023: 1500 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=543070, ups=1.1, wpb=494947, bsz=16514.9, num_updates=38600, lr=0.000321911, gnorm=0.172, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=35728 epoch 023: 1500 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=543070, ups=1.1, wpb=494947, bsz=16514.9, num_updates=38600, lr=0.000321911, gnorm=0.172, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=35728 epoch 023: 1500 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=543070, ups=1.1, wpb=494947, bsz=16514.9, num_updates=38600, lr=0.000321911, gnorm=0.172, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=35728 epoch 023: 1500 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=543070, ups=1.1, wpb=494947, bsz=16514.9, num_updates=38600, lr=0.000321911, gnorm=0.172, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=35728 epoch 023: 1500 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=543070, ups=1.1, wpb=494947, bsz=16514.9, num_updates=38600, lr=0.000321911, gnorm=0.172, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=35728 epoch 023: 1500 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=543070, ups=1.1, wpb=494947, bsz=16514.9, num_updates=38600, lr=0.000321911, gnorm=0.172, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=35728 epoch 023: 1500 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=543070, ups=1.1, wpb=494947, bsz=16514.9, num_updates=38600, lr=0.000321911, gnorm=0.172, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=35728 epoch 023: 1500 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=543070, ups=1.1, wpb=494947, bsz=16514.9, num_updates=38600, lr=0.000321911, gnorm=0.172, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=35728 epoch 023: 1500 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=543070, ups=1.1, wpb=494947, bsz=16514.9, num_updates=38600, lr=0.000321911, gnorm=0.172, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=35728 epoch 023: 1500 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=543070, ups=1.1, wpb=494947, bsz=16514.9, num_updates=38600, lr=0.000321911, gnorm=0.172, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=35728 epoch 023: 1500 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=543070, ups=1.1, wpb=494947, bsz=16514.9, num_updates=38600, lr=0.000321911, gnorm=0.172, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=35728 epoch 023: 1500 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=543070, ups=1.1, wpb=494947, bsz=16514.9, num_updates=38600, lr=0.000321911, gnorm=0.172, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=35728 epoch 023: 1500 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=543070, ups=1.1, wpb=494947, bsz=16514.9, num_updates=38600, lr=0.000321911, gnorm=0.172, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=35728 epoch 023: 1500 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=543070, ups=1.1, wpb=494947, bsz=16514.9, num_updates=38600, lr=0.000321911, gnorm=0.172, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=35728 epoch 023: 1600 / 1689 loss=3.703, nll_loss=2.169, ppl=4.5, wps=551390, ups=1.11, wpb=496410, bsz=16341, num_updates=38700, lr=0.000321495, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=35818 epoch 023: 1600 / 1689 loss=3.703, nll_loss=2.169, ppl=4.5, wps=551390, ups=1.11, wpb=496410, bsz=16341, num_updates=38700, lr=0.000321495, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=35818 epoch 023: 1600 / 1689 loss=3.703, nll_loss=2.169, ppl=4.5, wps=551390, ups=1.11, wpb=496410, bsz=16341, num_updates=38700, lr=0.000321495, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=35818 epoch 023: 1600 / 1689 loss=3.703, nll_loss=2.169, ppl=4.5, wps=551390, ups=1.11, wpb=496410, bsz=16341, num_updates=38700, lr=0.000321495, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=35818 epoch 023: 1600 / 1689 loss=3.703, nll_loss=2.169, ppl=4.5, wps=551390, ups=1.11, wpb=496410, bsz=16341, num_updates=38700, lr=0.000321495, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=35818 epoch 023: 1600 / 1689 loss=3.703, nll_loss=2.169, ppl=4.5, wps=551390, ups=1.11, wpb=496410, bsz=16341, num_updates=38700, lr=0.000321495, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=35818 epoch 023: 1600 / 1689 loss=3.703, nll_loss=2.169, ppl=4.5, wps=551390, ups=1.11, wpb=496410, bsz=16341, num_updates=38700, lr=0.000321495, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=35818 epoch 023: 1600 / 1689 loss=3.703, nll_loss=2.169, ppl=4.5, wps=551390, ups=1.11, wpb=496410, bsz=16341, num_updates=38700, lr=0.000321495, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=35818 epoch 023: 1600 / 1689 loss=3.703, nll_loss=2.169, ppl=4.5, wps=551390, ups=1.11, wpb=496410, bsz=16341, num_updates=38700, lr=0.000321495, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=35818 epoch 023: 1600 / 1689 loss=3.703, nll_loss=2.169, ppl=4.5, wps=551390, ups=1.11, wpb=496410, bsz=16341, num_updates=38700, lr=0.000321495, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=35818 epoch 023: 1600 / 1689 loss=3.703, nll_loss=2.169, ppl=4.5, wps=551390, ups=1.11, wpb=496410, bsz=16341, num_updates=38700, lr=0.000321495, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=35818 epoch 023: 1600 / 1689 loss=3.703, nll_loss=2.169, ppl=4.5, wps=551390, ups=1.11, wpb=496410, bsz=16341, num_updates=38700, lr=0.000321495, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=35818 epoch 023: 1600 / 1689 loss=3.703, nll_loss=2.169, ppl=4.5, wps=551390, ups=1.11, wpb=496410, bsz=16341, num_updates=38700, lr=0.000321495, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=35818 epoch 023: 1600 / 1689 loss=3.703, nll_loss=2.169, ppl=4.5, wps=551390, ups=1.11, wpb=496410, bsz=16341, num_updates=38700, lr=0.000321495, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=35818 epoch 023: 1600 / 1689 loss=3.703, nll_loss=2.169, ppl=4.5, wps=551390, ups=1.11, wpb=496410, bsz=16341, num_updates=38700, lr=0.000321495, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=35818 epoch 023: 1600 / 1689 loss=3.703, nll_loss=2.169, ppl=4.5, wps=551390, ups=1.11, wpb=496410, bsz=16341, num_updates=38700, lr=0.000321495, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=35818 epoch 023: 1600 / 1689 loss=3.703, nll_loss=2.169, ppl=4.5, wps=551390, ups=1.11, wpb=496410, bsz=16341, num_updates=38700, lr=0.000321495, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=35818 epoch 023: 1600 / 1689 loss=3.703, nll_loss=2.169, ppl=4.5, wps=551390, ups=1.11, wpb=496410, bsz=16341, num_updates=38700, lr=0.000321495, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=35818 epoch 023: 1600 / 1689 loss=3.703, nll_loss=2.169, ppl=4.5, wps=551390, ups=1.11, wpb=496410, bsz=16341, num_updates=38700, lr=0.000321495, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=35818 epoch 023: 1600 / 1689 loss=3.703, nll_loss=2.169, ppl=4.5, wps=551390, ups=1.11, wpb=496410, bsz=16341, num_updates=38700, lr=0.000321495, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=35818 epoch 023: 1600 / 1689 loss=3.703, nll_loss=2.169, ppl=4.5, wps=551390, ups=1.11, wpb=496410, bsz=16341, num_updates=38700, lr=0.000321495, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=35818 epoch 023: 1600 / 1689 loss=3.703, nll_loss=2.169, ppl=4.5, wps=551390, ups=1.11, wpb=496410, bsz=16341, num_updates=38700, lr=0.000321495, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=35818 epoch 023: 1600 / 1689 loss=3.703, nll_loss=2.169, ppl=4.5, wps=551390, ups=1.11, wpb=496410, bsz=16341, num_updates=38700, lr=0.000321495, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=35818 end of epoch 23 (average epoch stats below) epoch 023 | loss 3.703 | nll_loss 2.168 | ppl 4.5 | wps 541859 | ups 1.09 | wpb 495116 | bsz 16507 | num_updates 38789 | lr 0.000321126 | gnorm 0.173 | clip 0 | loss_scale 4 | train_wall 1494 | gb_free 23.4 | wall 35898 epoch 023 | loss 3.703 | nll_loss 2.168 | ppl 4.5 | wps 541859 | ups 1.09 | wpb 495116 | bsz 16507 | num_updates 38789 | lr 0.000321126 | gnorm 0.173 | clip 0 | loss_scale 4 | train_wall 1494 | gb_free 23.4 | wall 35898 epoch 023 | loss 3.703 | nll_loss 2.168 | ppl 4.5 | wps 541859 | ups 1.09 | wpb 495116 | bsz 16507 | num_updates 38789 | lr 0.000321126 | gnorm 0.173 | clip 0 | loss_scale 4 | train_wall 1494 | gb_free 23.4 | wall 35898 epoch 023 | loss 3.703 | nll_loss 2.168 | ppl 4.5 | wps 541859 | ups 1.09 | wpb 495116 | bsz 16507 | num_updates 38789 | lr 0.000321126 | gnorm 0.173 | clip 0 | loss_scale 4 | train_wall 1494 | gb_free 23.4 | wall 35898 epoch 023 | loss 3.703 | nll_loss 2.168 | ppl 4.5 | wps 541859 | ups 1.09 | wpb 495116 | bsz 16507 | num_updates 38789 | lr 0.000321126 | gnorm 0.173 | clip 0 | loss_scale 4 | train_wall 1494 | gb_free 23.4 | wall 35898 epoch 023 | loss 3.703 | nll_loss 2.168 | ppl 4.5 | wps 541859 | ups 1.09 | wpb 495116 | bsz 16507 | num_updates 38789 | lr 0.000321126 | gnorm 0.173 | clip 0 | loss_scale 4 | train_wall 1494 | gb_free 23.4 | wall 35898 epoch 023 | loss 3.703 | nll_loss 2.168 | ppl 4.5 | wps 541859 | ups 1.09 | wpb 495116 | bsz 16507 | num_updates 38789 | lr 0.000321126 | gnorm 0.173 | clip 0 | loss_scale 4 | train_wall 1494 | gb_free 23.4 | wall 35898 epoch 023 | loss 3.703 | nll_loss 2.168 | ppl 4.5 | wps 541859 | ups 1.09 | wpb 495116 | bsz 16507 | num_updates 38789 | lr 0.000321126 | gnorm 0.173 | clip 0 | loss_scale 4 | train_wall 1494 | gb_free 23.4 | wall 35898 epoch 023 | loss 3.703 | nll_loss 2.168 | ppl 4.5 | wps 541859 | ups 1.09 | wpb 495116 | bsz 16507 | num_updates 38789 | lr 0.000321126 | gnorm 0.173 | clip 0 | loss_scale 4 | train_wall 1494 | gb_free 23.4 | wall 35898 epoch 023 | loss 3.703 | nll_loss 2.168 | ppl 4.5 | wps 541859 | ups 1.09 | wpb 495116 | bsz 16507 | num_updates 38789 | lr 0.000321126 | gnorm 0.173 | clip 0 | loss_scale 4 | train_wall 1494 | gb_free 23.4 | wall 35898 epoch 023 | loss 3.703 | nll_loss 2.168 | ppl 4.5 | wps 541859 | ups 1.09 | wpb 495116 | bsz 16507 | num_updates 38789 | lr 0.000321126 | gnorm 0.173 | clip 0 | loss_scale 4 | train_wall 1494 | gb_free 23.4 | wall 35898 epoch 023 | loss 3.703 | nll_loss 2.168 | ppl 4.5 | wps 541859 | ups 1.09 | wpb 495116 | bsz 16507 | num_updates 38789 | lr 0.000321126 | gnorm 0.173 | clip 0 | loss_scale 4 | train_wall 1494 | gb_free 23.4 | wall 35898 epoch 023 | loss 3.703 | nll_loss 2.168 | ppl 4.5 | wps 541859 | ups 1.09 | wpb 495116 | bsz 16507 | num_updates 38789 | lr 0.000321126 | gnorm 0.173 | clip 0 | loss_scale 4 | train_wall 1494 | gb_free 23.4 | wall 35898 epoch 023 | loss 3.703 | nll_loss 2.168 | ppl 4.5 | wps 541859 | ups 1.09 | wpb 495116 | bsz 16507 | num_updates 38789 | lr 0.000321126 | gnorm 0.173 | clip 0 | loss_scale 4 | train_wall 1494 | gb_free 23.4 | wall 35898 epoch 023 | loss 3.703 | nll_loss 2.168 | ppl 4.5 | wps 541859 | ups 1.09 | wpb 495116 | bsz 16507 | num_updates 38789 | lr 0.000321126 | gnorm 0.173 | clip 0 | loss_scale 4 | train_wall 1494 | gb_free 23.4 | wall 35898 epoch 023 | loss 3.703 | nll_loss 2.168 | ppl 4.5 | wps 541859 | ups 1.09 | wpb 495116 | bsz 16507 | num_updates 38789 | lr 0.000321126 | gnorm 0.173 | clip 0 | loss_scale 4 | train_wall 1494 | gb_free 23.4 | wall 35898 epoch 023 | loss 3.703 | nll_loss 2.168 | ppl 4.5 | wps 541859 | ups 1.09 | wpb 495116 | bsz 16507 | num_updates 38789 | lr 0.000321126 | gnorm 0.173 | clip 0 | loss_scale 4 | train_wall 1494 | gb_free 23.4 | wall 35898 epoch 023 | loss 3.703 | nll_loss 2.168 | ppl 4.5 | wps 541859 | ups 1.09 | wpb 495116 | bsz 16507 | num_updates 38789 | lr 0.000321126 | gnorm 0.173 | clip 0 | loss_scale 4 | train_wall 1494 | gb_free 23.4 | wall 35898 epoch 023 | loss 3.703 | nll_loss 2.168 | ppl 4.5 | wps 541859 | ups 1.09 | wpb 495116 | bsz 16507 | num_updates 38789 | lr 0.000321126 | gnorm 0.173 | clip 0 | loss_scale 4 | train_wall 1494 | gb_free 23.4 | wall 35898 epoch 023 | loss 3.703 | nll_loss 2.168 | ppl 4.5 | wps 541859 | ups 1.09 | wpb 495116 | bsz 16507 | num_updates 38789 | lr 0.000321126 | gnorm 0.173 | clip 0 | loss_scale 4 | train_wall 1494 | gb_free 23.4 | wall 35898 epoch 023 | loss 3.703 | nll_loss 2.168 | ppl 4.5 | wps 541859 | ups 1.09 | wpb 495116 | bsz 16507 | num_updates 38789 | lr 0.000321126 | gnorm 0.173 | clip 0 | loss_scale 4 | train_wall 1494 | gb_free 23.4 | wall 35898 epoch 023 | loss 3.703 | nll_loss 2.168 | ppl 4.5 | wps 541859 | ups 1.09 | wpb 495116 | bsz 16507 | num_updates 38789 | lr 0.000321126 | gnorm 0.173 | clip 0 | loss_scale 4 | train_wall 1494 | gb_free 23.4 | wall 35898 epoch 023 | loss 3.703 | nll_loss 2.168 | ppl 4.5 | wps 541859 | ups 1.09 | wpb 495116 | bsz 16507 | num_updates 38789 | lr 0.000321126 | gnorm 0.173 | clip 0 | loss_scale 4 | train_wall 1494 | gb_free 23.4 | wall 35898 Start iterating over samples epoch 024: 11 / 1689 loss=3.706, nll_loss=2.173, ppl=4.51, wps=534859, ups=1.09, wpb=491547, bsz=16222.3, num_updates=38800, lr=0.000321081, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=35910 epoch 024: 11 / 1689 loss=3.706, nll_loss=2.173, ppl=4.51, wps=534859, ups=1.09, wpb=491547, bsz=16222.3, num_updates=38800, lr=0.000321081, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=35910 epoch 024: 11 / 1689 loss=3.706, nll_loss=2.173, ppl=4.51, wps=534859, ups=1.09, wpb=491547, bsz=16222.3, num_updates=38800, lr=0.000321081, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=35910 epoch 024: 11 / 1689 loss=3.706, nll_loss=2.173, ppl=4.51, wps=534859, ups=1.09, wpb=491547, bsz=16222.3, num_updates=38800, lr=0.000321081, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=35910 epoch 024: 11 / 1689 loss=3.706, nll_loss=2.173, ppl=4.51, wps=534859, ups=1.09, wpb=491547, bsz=16222.3, num_updates=38800, lr=0.000321081, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=35910 epoch 024: 11 / 1689 loss=3.706, nll_loss=2.173, ppl=4.51, wps=534859, ups=1.09, wpb=491547, bsz=16222.3, num_updates=38800, lr=0.000321081, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=35910 epoch 024: 11 / 1689 loss=3.706, nll_loss=2.173, ppl=4.51, wps=534859, ups=1.09, wpb=491547, bsz=16222.3, num_updates=38800, lr=0.000321081, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=35910 epoch 024: 11 / 1689 loss=3.706, nll_loss=2.173, ppl=4.51, wps=534859, ups=1.09, wpb=491547, bsz=16222.3, num_updates=38800, lr=0.000321081, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=35910 epoch 024: 11 / 1689 loss=3.706, nll_loss=2.173, ppl=4.51, wps=534859, ups=1.09, wpb=491547, bsz=16222.3, num_updates=38800, lr=0.000321081, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=35910 epoch 024: 11 / 1689 loss=3.706, nll_loss=2.173, ppl=4.51, wps=534859, ups=1.09, wpb=491547, bsz=16222.3, num_updates=38800, lr=0.000321081, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=35910 epoch 024: 11 / 1689 loss=3.706, nll_loss=2.173, ppl=4.51, wps=534859, ups=1.09, wpb=491547, bsz=16222.3, num_updates=38800, lr=0.000321081, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=35910 epoch 024: 11 / 1689 loss=3.706, nll_loss=2.173, ppl=4.51, wps=534859, ups=1.09, wpb=491547, bsz=16222.3, num_updates=38800, lr=0.000321081, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=35910 epoch 024: 11 / 1689 loss=3.706, nll_loss=2.173, ppl=4.51, wps=534859, ups=1.09, wpb=491547, bsz=16222.3, num_updates=38800, lr=0.000321081, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=35910 epoch 024: 11 / 1689 loss=3.706, nll_loss=2.173, ppl=4.51, wps=534859, ups=1.09, wpb=491547, bsz=16222.3, num_updates=38800, lr=0.000321081, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=35910 epoch 024: 11 / 1689 loss=3.706, nll_loss=2.173, ppl=4.51, wps=534859, ups=1.09, wpb=491547, bsz=16222.3, num_updates=38800, lr=0.000321081, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=35910 epoch 024: 11 / 1689 loss=3.706, nll_loss=2.173, ppl=4.51, wps=534859, ups=1.09, wpb=491547, bsz=16222.3, num_updates=38800, lr=0.000321081, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=35910 epoch 024: 11 / 1689 loss=3.706, nll_loss=2.173, ppl=4.51, wps=534859, ups=1.09, wpb=491547, bsz=16222.3, num_updates=38800, lr=0.000321081, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=35910 epoch 024: 11 / 1689 loss=3.706, nll_loss=2.173, ppl=4.51, wps=534859, ups=1.09, wpb=491547, bsz=16222.3, num_updates=38800, lr=0.000321081, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=35910 epoch 024: 11 / 1689 loss=3.706, nll_loss=2.173, ppl=4.51, wps=534859, ups=1.09, wpb=491547, bsz=16222.3, num_updates=38800, lr=0.000321081, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=35910 epoch 024: 11 / 1689 loss=3.706, nll_loss=2.173, ppl=4.51, wps=534859, ups=1.09, wpb=491547, bsz=16222.3, num_updates=38800, lr=0.000321081, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=35910 epoch 024: 11 / 1689 loss=3.706, nll_loss=2.173, ppl=4.51, wps=534859, ups=1.09, wpb=491547, bsz=16222.3, num_updates=38800, lr=0.000321081, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=35910 epoch 024: 11 / 1689 loss=3.706, nll_loss=2.173, ppl=4.51, wps=534859, ups=1.09, wpb=491547, bsz=16222.3, num_updates=38800, lr=0.000321081, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=35910 epoch 024: 11 / 1689 loss=3.706, nll_loss=2.173, ppl=4.51, wps=534859, ups=1.09, wpb=491547, bsz=16222.3, num_updates=38800, lr=0.000321081, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=35910 epoch 024: 11 / 1689 loss=3.706, nll_loss=2.173, ppl=4.51, wps=534859, ups=1.09, wpb=491547, bsz=16222.3, num_updates=38800, lr=0.000321081, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=35910 epoch 024: 111 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=546309, ups=1.1, wpb=496371, bsz=16716.5, num_updates=38900, lr=0.000320668, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=36001 epoch 024: 111 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=546309, ups=1.1, wpb=496371, bsz=16716.5, num_updates=38900, lr=0.000320668, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=36001 epoch 024: 111 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=546309, ups=1.1, wpb=496371, bsz=16716.5, num_updates=38900, lr=0.000320668, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=36001 epoch 024: 111 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=546309, ups=1.1, wpb=496371, bsz=16716.5, num_updates=38900, lr=0.000320668, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=36001 epoch 024: 111 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=546309, ups=1.1, wpb=496371, bsz=16716.5, num_updates=38900, lr=0.000320668, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=36001 epoch 024: 111 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=546309, ups=1.1, wpb=496371, bsz=16716.5, num_updates=38900, lr=0.000320668, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=36001 epoch 024: 111 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=546309, ups=1.1, wpb=496371, bsz=16716.5, num_updates=38900, lr=0.000320668, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=36001 epoch 024: 111 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=546309, ups=1.1, wpb=496371, bsz=16716.5, num_updates=38900, lr=0.000320668, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=36001 epoch 024: 111 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=546309, ups=1.1, wpb=496371, bsz=16716.5, num_updates=38900, lr=0.000320668, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=36001 epoch 024: 111 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=546309, ups=1.1, wpb=496371, bsz=16716.5, num_updates=38900, lr=0.000320668, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=36001 epoch 024: 111 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=546309, ups=1.1, wpb=496371, bsz=16716.5, num_updates=38900, lr=0.000320668, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=36001 epoch 024: 111 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=546309, ups=1.1, wpb=496371, bsz=16716.5, num_updates=38900, lr=0.000320668, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=36001 epoch 024: 111 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=546309, ups=1.1, wpb=496371, bsz=16716.5, num_updates=38900, lr=0.000320668, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=36001 epoch 024: 111 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=546309, ups=1.1, wpb=496371, bsz=16716.5, num_updates=38900, lr=0.000320668, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=36001 epoch 024: 111 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=546309, ups=1.1, wpb=496371, bsz=16716.5, num_updates=38900, lr=0.000320668, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=36001 epoch 024: 111 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=546309, ups=1.1, wpb=496371, bsz=16716.5, num_updates=38900, lr=0.000320668, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=36001 epoch 024: 111 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=546309, ups=1.1, wpb=496371, bsz=16716.5, num_updates=38900, lr=0.000320668, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=36001 epoch 024: 111 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=546309, ups=1.1, wpb=496371, bsz=16716.5, num_updates=38900, lr=0.000320668, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=36001 epoch 024: 111 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=546309, ups=1.1, wpb=496371, bsz=16716.5, num_updates=38900, lr=0.000320668, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=36001 epoch 024: 111 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=546309, ups=1.1, wpb=496371, bsz=16716.5, num_updates=38900, lr=0.000320668, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=36001 epoch 024: 111 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=546309, ups=1.1, wpb=496371, bsz=16716.5, num_updates=38900, lr=0.000320668, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=36001 epoch 024: 111 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=546309, ups=1.1, wpb=496371, bsz=16716.5, num_updates=38900, lr=0.000320668, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=36001 epoch 024: 111 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=546309, ups=1.1, wpb=496371, bsz=16716.5, num_updates=38900, lr=0.000320668, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=36001 epoch 024: 111 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=546309, ups=1.1, wpb=496371, bsz=16716.5, num_updates=38900, lr=0.000320668, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=36001 epoch 024: 211 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=543607, ups=1.1, wpb=496096, bsz=16803.7, num_updates=39000, lr=0.000320256, gnorm=0.183, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=36092 epoch 024: 211 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=543607, ups=1.1, wpb=496096, bsz=16803.7, num_updates=39000, lr=0.000320256, gnorm=0.183, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=36092 epoch 024: 211 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=543607, ups=1.1, wpb=496096, bsz=16803.7, num_updates=39000, lr=0.000320256, gnorm=0.183, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=36092 epoch 024: 211 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=543607, ups=1.1, wpb=496096, bsz=16803.7, num_updates=39000, lr=0.000320256, gnorm=0.183, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=36092 epoch 024: 211 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=543607, ups=1.1, wpb=496096, bsz=16803.7, num_updates=39000, lr=0.000320256, gnorm=0.183, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=36092 epoch 024: 211 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=543607, ups=1.1, wpb=496096, bsz=16803.7, num_updates=39000, lr=0.000320256, gnorm=0.183, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=36092 epoch 024: 211 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=543607, ups=1.1, wpb=496096, bsz=16803.7, num_updates=39000, lr=0.000320256, gnorm=0.183, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=36092 epoch 024: 211 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=543607, ups=1.1, wpb=496096, bsz=16803.7, num_updates=39000, lr=0.000320256, gnorm=0.183, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=36092 epoch 024: 211 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=543607, ups=1.1, wpb=496096, bsz=16803.7, num_updates=39000, lr=0.000320256, gnorm=0.183, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=36092 epoch 024: 211 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=543607, ups=1.1, wpb=496096, bsz=16803.7, num_updates=39000, lr=0.000320256, gnorm=0.183, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=36092 epoch 024: 211 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=543607, ups=1.1, wpb=496096, bsz=16803.7, num_updates=39000, lr=0.000320256, gnorm=0.183, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=36092 epoch 024: 211 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=543607, ups=1.1, wpb=496096, bsz=16803.7, num_updates=39000, lr=0.000320256, gnorm=0.183, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=36092 epoch 024: 211 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=543607, ups=1.1, wpb=496096, bsz=16803.7, num_updates=39000, lr=0.000320256, gnorm=0.183, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=36092 epoch 024: 211 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=543607, ups=1.1, wpb=496096, bsz=16803.7, num_updates=39000, lr=0.000320256, gnorm=0.183, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=36092 epoch 024: 211 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=543607, ups=1.1, wpb=496096, bsz=16803.7, num_updates=39000, lr=0.000320256, gnorm=0.183, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=36092 epoch 024: 211 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=543607, ups=1.1, wpb=496096, bsz=16803.7, num_updates=39000, lr=0.000320256, gnorm=0.183, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=36092 epoch 024: 211 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=543607, ups=1.1, wpb=496096, bsz=16803.7, num_updates=39000, lr=0.000320256, gnorm=0.183, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=36092 epoch 024: 211 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=543607, ups=1.1, wpb=496096, bsz=16803.7, num_updates=39000, lr=0.000320256, gnorm=0.183, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=36092 epoch 024: 211 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=543607, ups=1.1, wpb=496096, bsz=16803.7, num_updates=39000, lr=0.000320256, gnorm=0.183, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=36092 epoch 024: 211 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=543607, ups=1.1, wpb=496096, bsz=16803.7, num_updates=39000, lr=0.000320256, gnorm=0.183, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=36092 epoch 024: 211 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=543607, ups=1.1, wpb=496096, bsz=16803.7, num_updates=39000, lr=0.000320256, gnorm=0.183, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=36092 epoch 024: 211 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=543607, ups=1.1, wpb=496096, bsz=16803.7, num_updates=39000, lr=0.000320256, gnorm=0.183, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=36092 epoch 024: 211 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=543607, ups=1.1, wpb=496096, bsz=16803.7, num_updates=39000, lr=0.000320256, gnorm=0.183, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=36092 epoch 024: 211 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=543607, ups=1.1, wpb=496096, bsz=16803.7, num_updates=39000, lr=0.000320256, gnorm=0.183, clip=0, loss_scale=4, train_wall=89, gb_free=21.5, wall=36092 begin validation on "valid" subset epoch 024 | valid on 'valid' subset | loss 3.741 | nll_loss 2.183 | ppl 4.54 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.72 epoch 024 | valid on 'valid' subset | loss 3.741 | nll_loss 2.183 | ppl 4.54 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.72 epoch 024 | valid on 'valid' subset | loss 3.741 | nll_loss 2.183 | ppl 4.54 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.72 epoch 024 | valid on 'valid' subset | loss 3.741 | nll_loss 2.183 | ppl 4.54 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.72 epoch 024 | valid on 'valid' subset | loss 3.741 | nll_loss 2.183 | ppl 4.54 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.72 epoch 024 | valid on 'valid' subset | loss 3.741 | nll_loss 2.183 | ppl 4.54 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.72 epoch 024 | valid on 'valid' subset | loss 3.741 | nll_loss 2.183 | ppl 4.54 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.72 epoch 024 | valid on 'valid' subset | loss 3.741 | nll_loss 2.183 | ppl 4.54 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.72 epoch 024 | valid on 'valid' subset | loss 3.741 | nll_loss 2.183 | ppl 4.54 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.72 epoch 024 | valid on 'valid' subset | loss 3.741 | nll_loss 2.183 | ppl 4.54 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.72 epoch 024 | valid on 'valid' subset | loss 3.741 | nll_loss 2.183 | ppl 4.54 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.72 epoch 024 | valid on 'valid' subset | loss 3.741 | nll_loss 2.183 | ppl 4.54 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.72 epoch 024 | valid on 'valid' subset | loss 3.741 | nll_loss 2.183 | ppl 4.54 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.72 epoch 024 | valid on 'valid' subset | loss 3.741 | nll_loss 2.183 | ppl 4.54 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.72 epoch 024 | valid on 'valid' subset | loss 3.741 | nll_loss 2.183 | ppl 4.54 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.72 epoch 024 | valid on 'valid' subset | loss 3.741 | nll_loss 2.183 | ppl 4.54 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.72 epoch 024 | valid on 'valid' subset | loss 3.741 | nll_loss 2.183 | ppl 4.54 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.72 epoch 024 | valid on 'valid' subset | loss 3.741 | nll_loss 2.183 | ppl 4.54 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.72 epoch 024 | valid on 'valid' subset | loss 3.741 | nll_loss 2.183 | ppl 4.54 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.72 epoch 024 | valid on 'valid' subset | loss 3.741 | nll_loss 2.183 | ppl 4.54 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.72 epoch 024 | valid on 'valid' subset | loss 3.741 | nll_loss 2.183 | ppl 4.54 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.72 epoch 024 | valid on 'valid' subset | loss 3.741 | nll_loss 2.183 | ppl 4.54 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.72 epoch 024 | valid on 'valid' subset | loss 3.741 | nll_loss 2.183 | ppl 4.54 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.72 epoch 024 | valid on 'valid' subset | loss 3.741 | nll_loss 2.183 | ppl 4.54 | wps 0 | wpb 44526 | bsz 2008 | num_updates 39000 | best_loss 3.72 epoch 024: 311 / 1689 loss=3.692, nll_loss=2.156, ppl=4.46, wps=481989, ups=0.97, wpb=495325, bsz=16298, num_updates=39100, lr=0.000319847, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=36195 epoch 024: 311 / 1689 loss=3.692, nll_loss=2.156, ppl=4.46, wps=481989, ups=0.97, wpb=495325, bsz=16298, num_updates=39100, lr=0.000319847, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=36195 epoch 024: 311 / 1689 loss=3.692, nll_loss=2.156, ppl=4.46, wps=481989, ups=0.97, wpb=495325, bsz=16298, num_updates=39100, lr=0.000319847, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=36195 epoch 024: 311 / 1689 loss=3.692, nll_loss=2.156, ppl=4.46, wps=481989, ups=0.97, wpb=495325, bsz=16298, num_updates=39100, lr=0.000319847, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=36195 epoch 024: 311 / 1689 loss=3.692, nll_loss=2.156, ppl=4.46, wps=481989, ups=0.97, wpb=495325, bsz=16298, num_updates=39100, lr=0.000319847, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=36195 epoch 024: 311 / 1689 loss=3.692, nll_loss=2.156, ppl=4.46, wps=481989, ups=0.97, wpb=495325, bsz=16298, num_updates=39100, lr=0.000319847, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=36195 epoch 024: 311 / 1689 loss=3.692, nll_loss=2.156, ppl=4.46, wps=481989, ups=0.97, wpb=495325, bsz=16298, num_updates=39100, lr=0.000319847, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=36195 epoch 024: 311 / 1689 loss=3.692, nll_loss=2.156, ppl=4.46, wps=481989, ups=0.97, wpb=495325, bsz=16298, num_updates=39100, lr=0.000319847, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=36195 epoch 024: 311 / 1689 loss=3.692, nll_loss=2.156, ppl=4.46, wps=481989, ups=0.97, wpb=495325, bsz=16298, num_updates=39100, lr=0.000319847, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=36195 epoch 024: 311 / 1689 loss=3.692, nll_loss=2.156, ppl=4.46, wps=481989, ups=0.97, wpb=495325, bsz=16298, num_updates=39100, lr=0.000319847, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=36195 epoch 024: 311 / 1689 loss=3.692, nll_loss=2.156, ppl=4.46, wps=481989, ups=0.97, wpb=495325, bsz=16298, num_updates=39100, lr=0.000319847, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=36195 epoch 024: 311 / 1689 loss=3.692, nll_loss=2.156, ppl=4.46, wps=481989, ups=0.97, wpb=495325, bsz=16298, num_updates=39100, lr=0.000319847, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=36195 epoch 024: 311 / 1689 loss=3.692, nll_loss=2.156, ppl=4.46, wps=481989, ups=0.97, wpb=495325, bsz=16298, num_updates=39100, lr=0.000319847, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=36195 epoch 024: 311 / 1689 loss=3.692, nll_loss=2.156, ppl=4.46, wps=481989, ups=0.97, wpb=495325, bsz=16298, num_updates=39100, lr=0.000319847, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=36195 epoch 024: 311 / 1689 loss=3.692, nll_loss=2.156, ppl=4.46, wps=481989, ups=0.97, wpb=495325, bsz=16298, num_updates=39100, lr=0.000319847, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=36195 epoch 024: 311 / 1689 loss=3.692, nll_loss=2.156, ppl=4.46, wps=481989, ups=0.97, wpb=495325, bsz=16298, num_updates=39100, lr=0.000319847, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=36195 epoch 024: 311 / 1689 loss=3.692, nll_loss=2.156, ppl=4.46, wps=481989, ups=0.97, wpb=495325, bsz=16298, num_updates=39100, lr=0.000319847, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=36195 epoch 024: 311 / 1689 loss=3.692, nll_loss=2.156, ppl=4.46, wps=481989, ups=0.97, wpb=495325, bsz=16298, num_updates=39100, lr=0.000319847, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=36195 epoch 024: 311 / 1689 loss=3.692, nll_loss=2.156, ppl=4.46, wps=481989, ups=0.97, wpb=495325, bsz=16298, num_updates=39100, lr=0.000319847, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=36195 epoch 024: 311 / 1689 loss=3.692, nll_loss=2.156, ppl=4.46, wps=481989, ups=0.97, wpb=495325, bsz=16298, num_updates=39100, lr=0.000319847, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=36195 epoch 024: 311 / 1689 loss=3.692, nll_loss=2.156, ppl=4.46, wps=481989, ups=0.97, wpb=495325, bsz=16298, num_updates=39100, lr=0.000319847, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=36195 epoch 024: 311 / 1689 loss=3.692, nll_loss=2.156, ppl=4.46, wps=481989, ups=0.97, wpb=495325, bsz=16298, num_updates=39100, lr=0.000319847, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=36195 epoch 024: 311 / 1689 loss=3.692, nll_loss=2.156, ppl=4.46, wps=481989, ups=0.97, wpb=495325, bsz=16298, num_updates=39100, lr=0.000319847, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=36195 epoch 024: 311 / 1689 loss=3.692, nll_loss=2.156, ppl=4.46, wps=481989, ups=0.97, wpb=495325, bsz=16298, num_updates=39100, lr=0.000319847, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=36195 epoch 024: 411 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=549444, ups=1.11, wpb=494807, bsz=16314.2, num_updates=39200, lr=0.000319438, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=36285 epoch 024: 411 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=549444, ups=1.11, wpb=494807, bsz=16314.2, num_updates=39200, lr=0.000319438, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=36285 epoch 024: 411 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=549444, ups=1.11, wpb=494807, bsz=16314.2, num_updates=39200, lr=0.000319438, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=36285 epoch 024: 411 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=549444, ups=1.11, wpb=494807, bsz=16314.2, num_updates=39200, lr=0.000319438, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=36285 epoch 024: 411 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=549444, ups=1.11, wpb=494807, bsz=16314.2, num_updates=39200, lr=0.000319438, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=36285 epoch 024: 411 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=549444, ups=1.11, wpb=494807, bsz=16314.2, num_updates=39200, lr=0.000319438, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=36285 epoch 024: 411 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=549444, ups=1.11, wpb=494807, bsz=16314.2, num_updates=39200, lr=0.000319438, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=36285 epoch 024: 411 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=549444, ups=1.11, wpb=494807, bsz=16314.2, num_updates=39200, lr=0.000319438, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=36285 epoch 024: 411 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=549444, ups=1.11, wpb=494807, bsz=16314.2, num_updates=39200, lr=0.000319438, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=36285 epoch 024: 411 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=549444, ups=1.11, wpb=494807, bsz=16314.2, num_updates=39200, lr=0.000319438, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=36285 epoch 024: 411 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=549444, ups=1.11, wpb=494807, bsz=16314.2, num_updates=39200, lr=0.000319438, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=36285 epoch 024: 411 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=549444, ups=1.11, wpb=494807, bsz=16314.2, num_updates=39200, lr=0.000319438, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=36285 epoch 024: 411 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=549444, ups=1.11, wpb=494807, bsz=16314.2, num_updates=39200, lr=0.000319438, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=36285 epoch 024: 411 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=549444, ups=1.11, wpb=494807, bsz=16314.2, num_updates=39200, lr=0.000319438, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=36285 epoch 024: 411 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=549444, ups=1.11, wpb=494807, bsz=16314.2, num_updates=39200, lr=0.000319438, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=36285 epoch 024: 411 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=549444, ups=1.11, wpb=494807, bsz=16314.2, num_updates=39200, lr=0.000319438, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=36285 epoch 024: 411 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=549444, ups=1.11, wpb=494807, bsz=16314.2, num_updates=39200, lr=0.000319438, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=36285 epoch 024: 411 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=549444, ups=1.11, wpb=494807, bsz=16314.2, num_updates=39200, lr=0.000319438, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=36285 epoch 024: 411 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=549444, ups=1.11, wpb=494807, bsz=16314.2, num_updates=39200, lr=0.000319438, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=36285 epoch 024: 411 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=549444, ups=1.11, wpb=494807, bsz=16314.2, num_updates=39200, lr=0.000319438, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=36285 epoch 024: 411 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=549444, ups=1.11, wpb=494807, bsz=16314.2, num_updates=39200, lr=0.000319438, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=36285 epoch 024: 411 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=549444, ups=1.11, wpb=494807, bsz=16314.2, num_updates=39200, lr=0.000319438, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=36285 epoch 024: 411 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=549444, ups=1.11, wpb=494807, bsz=16314.2, num_updates=39200, lr=0.000319438, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=36285 epoch 024: 411 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=549444, ups=1.11, wpb=494807, bsz=16314.2, num_updates=39200, lr=0.000319438, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=36285 epoch 024: 512 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=543040, ups=1.09, wpb=496623, bsz=16663.4, num_updates=39300, lr=0.000319032, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=36377 epoch 024: 512 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=543040, ups=1.09, wpb=496623, bsz=16663.4, num_updates=39300, lr=0.000319032, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=36377 epoch 024: 512 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=543040, ups=1.09, wpb=496623, bsz=16663.4, num_updates=39300, lr=0.000319032, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=36377 epoch 024: 512 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=543040, ups=1.09, wpb=496623, bsz=16663.4, num_updates=39300, lr=0.000319032, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=36377 epoch 024: 512 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=543040, ups=1.09, wpb=496623, bsz=16663.4, num_updates=39300, lr=0.000319032, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=36377 epoch 024: 512 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=543040, ups=1.09, wpb=496623, bsz=16663.4, num_updates=39300, lr=0.000319032, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=36377 epoch 024: 512 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=543040, ups=1.09, wpb=496623, bsz=16663.4, num_updates=39300, lr=0.000319032, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=36377 epoch 024: 512 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=543040, ups=1.09, wpb=496623, bsz=16663.4, num_updates=39300, lr=0.000319032, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=36377 epoch 024: 512 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=543040, ups=1.09, wpb=496623, bsz=16663.4, num_updates=39300, lr=0.000319032, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=36377 epoch 024: 512 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=543040, ups=1.09, wpb=496623, bsz=16663.4, num_updates=39300, lr=0.000319032, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=36377 epoch 024: 512 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=543040, ups=1.09, wpb=496623, bsz=16663.4, num_updates=39300, lr=0.000319032, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=36377 epoch 024: 512 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=543040, ups=1.09, wpb=496623, bsz=16663.4, num_updates=39300, lr=0.000319032, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=36377 epoch 024: 512 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=543040, ups=1.09, wpb=496623, bsz=16663.4, num_updates=39300, lr=0.000319032, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=36377 epoch 024: 512 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=543040, ups=1.09, wpb=496623, bsz=16663.4, num_updates=39300, lr=0.000319032, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=36377 epoch 024: 512 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=543040, ups=1.09, wpb=496623, bsz=16663.4, num_updates=39300, lr=0.000319032, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=36377 epoch 024: 512 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=543040, ups=1.09, wpb=496623, bsz=16663.4, num_updates=39300, lr=0.000319032, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=36377 epoch 024: 512 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=543040, ups=1.09, wpb=496623, bsz=16663.4, num_updates=39300, lr=0.000319032, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=36377 epoch 024: 512 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=543040, ups=1.09, wpb=496623, bsz=16663.4, num_updates=39300, lr=0.000319032, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=36377 epoch 024: 512 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=543040, ups=1.09, wpb=496623, bsz=16663.4, num_updates=39300, lr=0.000319032, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=36377 epoch 024: 512 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=543040, ups=1.09, wpb=496623, bsz=16663.4, num_updates=39300, lr=0.000319032, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=36377 epoch 024: 512 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=543040, ups=1.09, wpb=496623, bsz=16663.4, num_updates=39300, lr=0.000319032, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=36377 epoch 024: 512 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=543040, ups=1.09, wpb=496623, bsz=16663.4, num_updates=39300, lr=0.000319032, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=36377 epoch 024: 512 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=543040, ups=1.09, wpb=496623, bsz=16663.4, num_updates=39300, lr=0.000319032, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=36377 epoch 024: 512 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=543040, ups=1.09, wpb=496623, bsz=16663.4, num_updates=39300, lr=0.000319032, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=36377 epoch 024: 612 / 1689 loss=3.699, nll_loss=2.164, ppl=4.48, wps=544973, ups=1.1, wpb=495216, bsz=16362.6, num_updates=39400, lr=0.000318626, gnorm=0.178, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=36467 epoch 024: 612 / 1689 loss=3.699, nll_loss=2.164, ppl=4.48, wps=544973, ups=1.1, wpb=495216, bsz=16362.6, num_updates=39400, lr=0.000318626, gnorm=0.178, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=36467 epoch 024: 612 / 1689 loss=3.699, nll_loss=2.164, ppl=4.48, wps=544973, ups=1.1, wpb=495216, bsz=16362.6, num_updates=39400, lr=0.000318626, gnorm=0.178, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=36467 epoch 024: 612 / 1689 loss=3.699, nll_loss=2.164, ppl=4.48, wps=544973, ups=1.1, wpb=495216, bsz=16362.6, num_updates=39400, lr=0.000318626, gnorm=0.178, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=36467 epoch 024: 612 / 1689 loss=3.699, nll_loss=2.164, ppl=4.48, wps=544973, ups=1.1, wpb=495216, bsz=16362.6, num_updates=39400, lr=0.000318626, gnorm=0.178, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=36467 epoch 024: 612 / 1689 loss=3.699, nll_loss=2.164, ppl=4.48, wps=544973, ups=1.1, wpb=495216, bsz=16362.6, num_updates=39400, lr=0.000318626, gnorm=0.178, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=36467 epoch 024: 612 / 1689 loss=3.699, nll_loss=2.164, ppl=4.48, wps=544973, ups=1.1, wpb=495216, bsz=16362.6, num_updates=39400, lr=0.000318626, gnorm=0.178, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=36467 epoch 024: 612 / 1689 loss=3.699, nll_loss=2.164, ppl=4.48, wps=544973, ups=1.1, wpb=495216, bsz=16362.6, num_updates=39400, lr=0.000318626, gnorm=0.178, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=36467 epoch 024: 612 / 1689 loss=3.699, nll_loss=2.164, ppl=4.48, wps=544973, ups=1.1, wpb=495216, bsz=16362.6, num_updates=39400, lr=0.000318626, gnorm=0.178, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=36467 epoch 024: 612 / 1689 loss=3.699, nll_loss=2.164, ppl=4.48, wps=544973, ups=1.1, wpb=495216, bsz=16362.6, num_updates=39400, lr=0.000318626, gnorm=0.178, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=36467 epoch 024: 612 / 1689 loss=3.699, nll_loss=2.164, ppl=4.48, wps=544973, ups=1.1, wpb=495216, bsz=16362.6, num_updates=39400, lr=0.000318626, gnorm=0.178, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=36467 epoch 024: 612 / 1689 loss=3.699, nll_loss=2.164, ppl=4.48, wps=544973, ups=1.1, wpb=495216, bsz=16362.6, num_updates=39400, lr=0.000318626, gnorm=0.178, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=36467 epoch 024: 612 / 1689 loss=3.699, nll_loss=2.164, ppl=4.48, wps=544973, ups=1.1, wpb=495216, bsz=16362.6, num_updates=39400, lr=0.000318626, gnorm=0.178, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=36467 epoch 024: 612 / 1689 loss=3.699, nll_loss=2.164, ppl=4.48, wps=544973, ups=1.1, wpb=495216, bsz=16362.6, num_updates=39400, lr=0.000318626, gnorm=0.178, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=36467 epoch 024: 612 / 1689 loss=3.699, nll_loss=2.164, ppl=4.48, wps=544973, ups=1.1, wpb=495216, bsz=16362.6, num_updates=39400, lr=0.000318626, gnorm=0.178, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=36467 epoch 024: 612 / 1689 loss=3.699, nll_loss=2.164, ppl=4.48, wps=544973, ups=1.1, wpb=495216, bsz=16362.6, num_updates=39400, lr=0.000318626, gnorm=0.178, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=36467 epoch 024: 612 / 1689 loss=3.699, nll_loss=2.164, ppl=4.48, wps=544973, ups=1.1, wpb=495216, bsz=16362.6, num_updates=39400, lr=0.000318626, gnorm=0.178, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=36467 epoch 024: 612 / 1689 loss=3.699, nll_loss=2.164, ppl=4.48, wps=544973, ups=1.1, wpb=495216, bsz=16362.6, num_updates=39400, lr=0.000318626, gnorm=0.178, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=36467 epoch 024: 612 / 1689 loss=3.699, nll_loss=2.164, ppl=4.48, wps=544973, ups=1.1, wpb=495216, bsz=16362.6, num_updates=39400, lr=0.000318626, gnorm=0.178, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=36467 epoch 024: 612 / 1689 loss=3.699, nll_loss=2.164, ppl=4.48, wps=544973, ups=1.1, wpb=495216, bsz=16362.6, num_updates=39400, lr=0.000318626, gnorm=0.178, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=36467 epoch 024: 612 / 1689 loss=3.699, nll_loss=2.164, ppl=4.48, wps=544973, ups=1.1, wpb=495216, bsz=16362.6, num_updates=39400, lr=0.000318626, gnorm=0.178, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=36467 epoch 024: 612 / 1689 loss=3.699, nll_loss=2.164, ppl=4.48, wps=544973, ups=1.1, wpb=495216, bsz=16362.6, num_updates=39400, lr=0.000318626, gnorm=0.178, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=36467 epoch 024: 612 / 1689 loss=3.699, nll_loss=2.164, ppl=4.48, wps=544973, ups=1.1, wpb=495216, bsz=16362.6, num_updates=39400, lr=0.000318626, gnorm=0.178, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=36467 epoch 024: 612 / 1689 loss=3.699, nll_loss=2.164, ppl=4.48, wps=544973, ups=1.1, wpb=495216, bsz=16362.6, num_updates=39400, lr=0.000318626, gnorm=0.178, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=36467 epoch 024: 712 / 1689 loss=3.692, nll_loss=2.156, ppl=4.46, wps=547999, ups=1.11, wpb=494088, bsz=16346.7, num_updates=39500, lr=0.000318223, gnorm=0.17, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=36558 epoch 024: 712 / 1689 loss=3.692, nll_loss=2.156, ppl=4.46, wps=547999, ups=1.11, wpb=494088, bsz=16346.7, num_updates=39500, lr=0.000318223, gnorm=0.17, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=36558 epoch 024: 712 / 1689 loss=3.692, nll_loss=2.156, ppl=4.46, wps=547999, ups=1.11, wpb=494088, bsz=16346.7, num_updates=39500, lr=0.000318223, gnorm=0.17, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=36558 epoch 024: 712 / 1689 loss=3.692, nll_loss=2.156, ppl=4.46, wps=547999, ups=1.11, wpb=494088, bsz=16346.7, num_updates=39500, lr=0.000318223, gnorm=0.17, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=36558 epoch 024: 712 / 1689 loss=3.692, nll_loss=2.156, ppl=4.46, wps=547999, ups=1.11, wpb=494088, bsz=16346.7, num_updates=39500, lr=0.000318223, gnorm=0.17, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=36558 epoch 024: 712 / 1689 loss=3.692, nll_loss=2.156, ppl=4.46, wps=547999, ups=1.11, wpb=494088, bsz=16346.7, num_updates=39500, lr=0.000318223, gnorm=0.17, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=36558 epoch 024: 712 / 1689 loss=3.692, nll_loss=2.156, ppl=4.46, wps=547999, ups=1.11, wpb=494088, bsz=16346.7, num_updates=39500, lr=0.000318223, gnorm=0.17, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=36558 epoch 024: 712 / 1689 loss=3.692, nll_loss=2.156, ppl=4.46, wps=547999, ups=1.11, wpb=494088, bsz=16346.7, num_updates=39500, lr=0.000318223, gnorm=0.17, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=36558 epoch 024: 712 / 1689 loss=3.692, nll_loss=2.156, ppl=4.46, wps=547999, ups=1.11, wpb=494088, bsz=16346.7, num_updates=39500, lr=0.000318223, gnorm=0.17, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=36558 epoch 024: 712 / 1689 loss=3.692, nll_loss=2.156, ppl=4.46, wps=547999, ups=1.11, wpb=494088, bsz=16346.7, num_updates=39500, lr=0.000318223, gnorm=0.17, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=36558 epoch 024: 712 / 1689 loss=3.692, nll_loss=2.156, ppl=4.46, wps=547999, ups=1.11, wpb=494088, bsz=16346.7, num_updates=39500, lr=0.000318223, gnorm=0.17, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=36558 epoch 024: 712 / 1689 loss=3.692, nll_loss=2.156, ppl=4.46, wps=547999, ups=1.11, wpb=494088, bsz=16346.7, num_updates=39500, lr=0.000318223, gnorm=0.17, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=36558 epoch 024: 712 / 1689 loss=3.692, nll_loss=2.156, ppl=4.46, wps=547999, ups=1.11, wpb=494088, bsz=16346.7, num_updates=39500, lr=0.000318223, gnorm=0.17, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=36558 epoch 024: 712 / 1689 loss=3.692, nll_loss=2.156, ppl=4.46, wps=547999, ups=1.11, wpb=494088, bsz=16346.7, num_updates=39500, lr=0.000318223, gnorm=0.17, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=36558 epoch 024: 712 / 1689 loss=3.692, nll_loss=2.156, ppl=4.46, wps=547999, ups=1.11, wpb=494088, bsz=16346.7, num_updates=39500, lr=0.000318223, gnorm=0.17, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=36558 epoch 024: 712 / 1689 loss=3.692, nll_loss=2.156, ppl=4.46, wps=547999, ups=1.11, wpb=494088, bsz=16346.7, num_updates=39500, lr=0.000318223, gnorm=0.17, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=36558 epoch 024: 712 / 1689 loss=3.692, nll_loss=2.156, ppl=4.46, wps=547999, ups=1.11, wpb=494088, bsz=16346.7, num_updates=39500, lr=0.000318223, gnorm=0.17, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=36558 epoch 024: 712 / 1689 loss=3.692, nll_loss=2.156, ppl=4.46, wps=547999, ups=1.11, wpb=494088, bsz=16346.7, num_updates=39500, lr=0.000318223, gnorm=0.17, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=36558 epoch 024: 712 / 1689 loss=3.692, nll_loss=2.156, ppl=4.46, wps=547999, ups=1.11, wpb=494088, bsz=16346.7, num_updates=39500, lr=0.000318223, gnorm=0.17, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=36558 epoch 024: 712 / 1689 loss=3.692, nll_loss=2.156, ppl=4.46, wps=547999, ups=1.11, wpb=494088, bsz=16346.7, num_updates=39500, lr=0.000318223, gnorm=0.17, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=36558 epoch 024: 712 / 1689 loss=3.692, nll_loss=2.156, ppl=4.46, wps=547999, ups=1.11, wpb=494088, bsz=16346.7, num_updates=39500, lr=0.000318223, gnorm=0.17, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=36558 epoch 024: 712 / 1689 loss=3.692, nll_loss=2.156, ppl=4.46, wps=547999, ups=1.11, wpb=494088, bsz=16346.7, num_updates=39500, lr=0.000318223, gnorm=0.17, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=36558 epoch 024: 712 / 1689 loss=3.692, nll_loss=2.156, ppl=4.46, wps=547999, ups=1.11, wpb=494088, bsz=16346.7, num_updates=39500, lr=0.000318223, gnorm=0.17, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=36558 epoch 024: 712 / 1689 loss=3.692, nll_loss=2.156, ppl=4.46, wps=547999, ups=1.11, wpb=494088, bsz=16346.7, num_updates=39500, lr=0.000318223, gnorm=0.17, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=36558 epoch 024: 812 / 1689 loss=3.698, nll_loss=2.163, ppl=4.48, wps=544210, ups=1.1, wpb=496505, bsz=16385.8, num_updates=39600, lr=0.000317821, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=22.8, wall=36649 epoch 024: 812 / 1689 loss=3.698, nll_loss=2.163, ppl=4.48, wps=544210, ups=1.1, wpb=496505, bsz=16385.8, num_updates=39600, lr=0.000317821, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=22.8, wall=36649 epoch 024: 812 / 1689 loss=3.698, nll_loss=2.163, ppl=4.48, wps=544210, ups=1.1, wpb=496505, bsz=16385.8, num_updates=39600, lr=0.000317821, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=22.8, wall=36649 epoch 024: 812 / 1689 loss=3.698, nll_loss=2.163, ppl=4.48, wps=544210, ups=1.1, wpb=496505, bsz=16385.8, num_updates=39600, lr=0.000317821, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=22.8, wall=36649 epoch 024: 812 / 1689 loss=3.698, nll_loss=2.163, ppl=4.48, wps=544210, ups=1.1, wpb=496505, bsz=16385.8, num_updates=39600, lr=0.000317821, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=22.8, wall=36649 epoch 024: 812 / 1689 loss=3.698, nll_loss=2.163, ppl=4.48, wps=544210, ups=1.1, wpb=496505, bsz=16385.8, num_updates=39600, lr=0.000317821, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=22.8, wall=36649 epoch 024: 812 / 1689 loss=3.698, nll_loss=2.163, ppl=4.48, wps=544210, ups=1.1, wpb=496505, bsz=16385.8, num_updates=39600, lr=0.000317821, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=22.8, wall=36649 epoch 024: 812 / 1689 loss=3.698, nll_loss=2.163, ppl=4.48, wps=544210, ups=1.1, wpb=496505, bsz=16385.8, num_updates=39600, lr=0.000317821, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=22.8, wall=36649 epoch 024: 812 / 1689 loss=3.698, nll_loss=2.163, ppl=4.48, wps=544210, ups=1.1, wpb=496505, bsz=16385.8, num_updates=39600, lr=0.000317821, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=22.8, wall=36649 epoch 024: 812 / 1689 loss=3.698, nll_loss=2.163, ppl=4.48, wps=544210, ups=1.1, wpb=496505, bsz=16385.8, num_updates=39600, lr=0.000317821, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=22.8, wall=36649 epoch 024: 812 / 1689 loss=3.698, nll_loss=2.163, ppl=4.48, wps=544210, ups=1.1, wpb=496505, bsz=16385.8, num_updates=39600, lr=0.000317821, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=22.8, wall=36649 epoch 024: 812 / 1689 loss=3.698, nll_loss=2.163, ppl=4.48, wps=544210, ups=1.1, wpb=496505, bsz=16385.8, num_updates=39600, lr=0.000317821, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=22.8, wall=36649 epoch 024: 812 / 1689 loss=3.698, nll_loss=2.163, ppl=4.48, wps=544210, ups=1.1, wpb=496505, bsz=16385.8, num_updates=39600, lr=0.000317821, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=22.8, wall=36649 epoch 024: 812 / 1689 loss=3.698, nll_loss=2.163, ppl=4.48, wps=544210, ups=1.1, wpb=496505, bsz=16385.8, num_updates=39600, lr=0.000317821, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=22.8, wall=36649 epoch 024: 812 / 1689 loss=3.698, nll_loss=2.163, ppl=4.48, wps=544210, ups=1.1, wpb=496505, bsz=16385.8, num_updates=39600, lr=0.000317821, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=22.8, wall=36649 epoch 024: 812 / 1689 loss=3.698, nll_loss=2.163, ppl=4.48, wps=544210, ups=1.1, wpb=496505, bsz=16385.8, num_updates=39600, lr=0.000317821, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=22.8, wall=36649 epoch 024: 812 / 1689 loss=3.698, nll_loss=2.163, ppl=4.48, wps=544210, ups=1.1, wpb=496505, bsz=16385.8, num_updates=39600, lr=0.000317821, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=22.8, wall=36649 epoch 024: 812 / 1689 loss=3.698, nll_loss=2.163, ppl=4.48, wps=544210, ups=1.1, wpb=496505, bsz=16385.8, num_updates=39600, lr=0.000317821, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=22.8, wall=36649 epoch 024: 812 / 1689 loss=3.698, nll_loss=2.163, ppl=4.48, wps=544210, ups=1.1, wpb=496505, bsz=16385.8, num_updates=39600, lr=0.000317821, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=22.8, wall=36649 epoch 024: 812 / 1689 loss=3.698, nll_loss=2.163, ppl=4.48, wps=544210, ups=1.1, wpb=496505, bsz=16385.8, num_updates=39600, lr=0.000317821, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=22.8, wall=36649 epoch 024: 812 / 1689 loss=3.698, nll_loss=2.163, ppl=4.48, wps=544210, ups=1.1, wpb=496505, bsz=16385.8, num_updates=39600, lr=0.000317821, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=22.8, wall=36649 epoch 024: 812 / 1689 loss=3.698, nll_loss=2.163, ppl=4.48, wps=544210, ups=1.1, wpb=496505, bsz=16385.8, num_updates=39600, lr=0.000317821, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=22.8, wall=36649 epoch 024: 812 / 1689 loss=3.698, nll_loss=2.163, ppl=4.48, wps=544210, ups=1.1, wpb=496505, bsz=16385.8, num_updates=39600, lr=0.000317821, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=22.8, wall=36649 epoch 024: 812 / 1689 loss=3.698, nll_loss=2.163, ppl=4.48, wps=544210, ups=1.1, wpb=496505, bsz=16385.8, num_updates=39600, lr=0.000317821, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=22.8, wall=36649 epoch 024: 914 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=532434, ups=1.07, wpb=495506, bsz=16517, num_updates=39700, lr=0.00031742, gnorm=0.17, clip=0, loss_scale=1, train_wall=92, gb_free=22.3, wall=36742 epoch 024: 914 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=532434, ups=1.07, wpb=495506, bsz=16517, num_updates=39700, lr=0.00031742, gnorm=0.17, clip=0, loss_scale=1, train_wall=92, gb_free=22.3, wall=36742 epoch 024: 914 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=532434, ups=1.07, wpb=495506, bsz=16517, num_updates=39700, lr=0.00031742, gnorm=0.17, clip=0, loss_scale=1, train_wall=92, gb_free=22.3, wall=36742 epoch 024: 914 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=532434, ups=1.07, wpb=495506, bsz=16517, num_updates=39700, lr=0.00031742, gnorm=0.17, clip=0, loss_scale=1, train_wall=92, gb_free=22.3, wall=36742 epoch 024: 914 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=532434, ups=1.07, wpb=495506, bsz=16517, num_updates=39700, lr=0.00031742, gnorm=0.17, clip=0, loss_scale=1, train_wall=92, gb_free=22.3, wall=36742 epoch 024: 914 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=532434, ups=1.07, wpb=495506, bsz=16517, num_updates=39700, lr=0.00031742, gnorm=0.17, clip=0, loss_scale=1, train_wall=92, gb_free=22.3, wall=36742 epoch 024: 914 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=532434, ups=1.07, wpb=495506, bsz=16517, num_updates=39700, lr=0.00031742, gnorm=0.17, clip=0, loss_scale=1, train_wall=92, gb_free=22.3, wall=36742 epoch 024: 914 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=532434, ups=1.07, wpb=495506, bsz=16517, num_updates=39700, lr=0.00031742, gnorm=0.17, clip=0, loss_scale=1, train_wall=92, gb_free=22.3, wall=36742 epoch 024: 914 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=532434, ups=1.07, wpb=495506, bsz=16517, num_updates=39700, lr=0.00031742, gnorm=0.17, clip=0, loss_scale=1, train_wall=92, gb_free=22.3, wall=36742 epoch 024: 914 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=532434, ups=1.07, wpb=495506, bsz=16517, num_updates=39700, lr=0.00031742, gnorm=0.17, clip=0, loss_scale=1, train_wall=92, gb_free=22.3, wall=36742 epoch 024: 914 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=532434, ups=1.07, wpb=495506, bsz=16517, num_updates=39700, lr=0.00031742, gnorm=0.17, clip=0, loss_scale=1, train_wall=92, gb_free=22.3, wall=36742 epoch 024: 914 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=532434, ups=1.07, wpb=495506, bsz=16517, num_updates=39700, lr=0.00031742, gnorm=0.17, clip=0, loss_scale=1, train_wall=92, gb_free=22.3, wall=36742 epoch 024: 914 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=532434, ups=1.07, wpb=495506, bsz=16517, num_updates=39700, lr=0.00031742, gnorm=0.17, clip=0, loss_scale=1, train_wall=92, gb_free=22.3, wall=36742 epoch 024: 914 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=532434, ups=1.07, wpb=495506, bsz=16517, num_updates=39700, lr=0.00031742, gnorm=0.17, clip=0, loss_scale=1, train_wall=92, gb_free=22.3, wall=36742 epoch 024: 914 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=532434, ups=1.07, wpb=495506, bsz=16517, num_updates=39700, lr=0.00031742, gnorm=0.17, clip=0, loss_scale=1, train_wall=92, gb_free=22.3, wall=36742 epoch 024: 914 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=532434, ups=1.07, wpb=495506, bsz=16517, num_updates=39700, lr=0.00031742, gnorm=0.17, clip=0, loss_scale=1, train_wall=92, gb_free=22.3, wall=36742 epoch 024: 914 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=532434, ups=1.07, wpb=495506, bsz=16517, num_updates=39700, lr=0.00031742, gnorm=0.17, clip=0, loss_scale=1, train_wall=92, gb_free=22.3, wall=36742 epoch 024: 914 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=532434, ups=1.07, wpb=495506, bsz=16517, num_updates=39700, lr=0.00031742, gnorm=0.17, clip=0, loss_scale=1, train_wall=92, gb_free=22.3, wall=36742 epoch 024: 914 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=532434, ups=1.07, wpb=495506, bsz=16517, num_updates=39700, lr=0.00031742, gnorm=0.17, clip=0, loss_scale=1, train_wall=92, gb_free=22.3, wall=36742 epoch 024: 914 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=532434, ups=1.07, wpb=495506, bsz=16517, num_updates=39700, lr=0.00031742, gnorm=0.17, clip=0, loss_scale=1, train_wall=92, gb_free=22.3, wall=36742 epoch 024: 914 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=532434, ups=1.07, wpb=495506, bsz=16517, num_updates=39700, lr=0.00031742, gnorm=0.17, clip=0, loss_scale=1, train_wall=92, gb_free=22.3, wall=36742 epoch 024: 914 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=532434, ups=1.07, wpb=495506, bsz=16517, num_updates=39700, lr=0.00031742, gnorm=0.17, clip=0, loss_scale=1, train_wall=92, gb_free=22.3, wall=36742 epoch 024: 914 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=532434, ups=1.07, wpb=495506, bsz=16517, num_updates=39700, lr=0.00031742, gnorm=0.17, clip=0, loss_scale=1, train_wall=92, gb_free=22.3, wall=36742 epoch 024: 914 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=532434, ups=1.07, wpb=495506, bsz=16517, num_updates=39700, lr=0.00031742, gnorm=0.17, clip=0, loss_scale=1, train_wall=92, gb_free=22.3, wall=36742 epoch 024: 1014 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=551424, ups=1.11, wpb=495421, bsz=16530.2, num_updates=39800, lr=0.000317021, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=36832 epoch 024: 1014 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=551424, ups=1.11, wpb=495421, bsz=16530.2, num_updates=39800, lr=0.000317021, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=36832 epoch 024: 1014 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=551424, ups=1.11, wpb=495421, bsz=16530.2, num_updates=39800, lr=0.000317021, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=36832 epoch 024: 1014 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=551424, ups=1.11, wpb=495421, bsz=16530.2, num_updates=39800, lr=0.000317021, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=36832 epoch 024: 1014 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=551424, ups=1.11, wpb=495421, bsz=16530.2, num_updates=39800, lr=0.000317021, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=36832 epoch 024: 1014 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=551424, ups=1.11, wpb=495421, bsz=16530.2, num_updates=39800, lr=0.000317021, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=36832 epoch 024: 1014 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=551424, ups=1.11, wpb=495421, bsz=16530.2, num_updates=39800, lr=0.000317021, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=36832 epoch 024: 1014 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=551424, ups=1.11, wpb=495421, bsz=16530.2, num_updates=39800, lr=0.000317021, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=36832 epoch 024: 1014 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=551424, ups=1.11, wpb=495421, bsz=16530.2, num_updates=39800, lr=0.000317021, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=36832 epoch 024: 1014 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=551424, ups=1.11, wpb=495421, bsz=16530.2, num_updates=39800, lr=0.000317021, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=36832 epoch 024: 1014 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=551424, ups=1.11, wpb=495421, bsz=16530.2, num_updates=39800, lr=0.000317021, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=36832 epoch 024: 1014 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=551424, ups=1.11, wpb=495421, bsz=16530.2, num_updates=39800, lr=0.000317021, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=36832 epoch 024: 1014 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=551424, ups=1.11, wpb=495421, bsz=16530.2, num_updates=39800, lr=0.000317021, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=36832 epoch 024: 1014 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=551424, ups=1.11, wpb=495421, bsz=16530.2, num_updates=39800, lr=0.000317021, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=36832 epoch 024: 1014 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=551424, ups=1.11, wpb=495421, bsz=16530.2, num_updates=39800, lr=0.000317021, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=36832 epoch 024: 1014 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=551424, ups=1.11, wpb=495421, bsz=16530.2, num_updates=39800, lr=0.000317021, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=36832 epoch 024: 1014 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=551424, ups=1.11, wpb=495421, bsz=16530.2, num_updates=39800, lr=0.000317021, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=36832 epoch 024: 1014 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=551424, ups=1.11, wpb=495421, bsz=16530.2, num_updates=39800, lr=0.000317021, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=36832 epoch 024: 1014 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=551424, ups=1.11, wpb=495421, bsz=16530.2, num_updates=39800, lr=0.000317021, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=36832 epoch 024: 1014 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=551424, ups=1.11, wpb=495421, bsz=16530.2, num_updates=39800, lr=0.000317021, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=36832 epoch 024: 1014 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=551424, ups=1.11, wpb=495421, bsz=16530.2, num_updates=39800, lr=0.000317021, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=36832 epoch 024: 1014 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=551424, ups=1.11, wpb=495421, bsz=16530.2, num_updates=39800, lr=0.000317021, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=36832 epoch 024: 1014 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=551424, ups=1.11, wpb=495421, bsz=16530.2, num_updates=39800, lr=0.000317021, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=36832 epoch 024: 1014 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=551424, ups=1.11, wpb=495421, bsz=16530.2, num_updates=39800, lr=0.000317021, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=21.4, wall=36832 epoch 024: 1114 / 1689 loss=3.699, nll_loss=2.165, ppl=4.48, wps=547299, ups=1.1, wpb=495865, bsz=16898.1, num_updates=39900, lr=0.000316624, gnorm=0.172, clip=0, loss_scale=1, train_wall=90, gb_free=21.6, wall=36922 epoch 024: 1114 / 1689 loss=3.699, nll_loss=2.165, ppl=4.48, wps=547299, ups=1.1, wpb=495865, bsz=16898.1, num_updates=39900, lr=0.000316624, gnorm=0.172, clip=0, loss_scale=1, train_wall=90, gb_free=21.6, wall=36922 epoch 024: 1114 / 1689 loss=3.699, nll_loss=2.165, ppl=4.48, wps=547299, ups=1.1, wpb=495865, bsz=16898.1, num_updates=39900, lr=0.000316624, gnorm=0.172, clip=0, loss_scale=1, train_wall=90, gb_free=21.6, wall=36922 epoch 024: 1114 / 1689 loss=3.699, nll_loss=2.165, ppl=4.48, wps=547299, ups=1.1, wpb=495865, bsz=16898.1, num_updates=39900, lr=0.000316624, gnorm=0.172, clip=0, loss_scale=1, train_wall=90, gb_free=21.6, wall=36922 epoch 024: 1114 / 1689 loss=3.699, nll_loss=2.165, ppl=4.48, wps=547299, ups=1.1, wpb=495865, bsz=16898.1, num_updates=39900, lr=0.000316624, gnorm=0.172, clip=0, loss_scale=1, train_wall=90, gb_free=21.6, wall=36922 epoch 024: 1114 / 1689 loss=3.699, nll_loss=2.165, ppl=4.48, wps=547299, ups=1.1, wpb=495865, bsz=16898.1, num_updates=39900, lr=0.000316624, gnorm=0.172, clip=0, loss_scale=1, train_wall=90, gb_free=21.6, wall=36922 epoch 024: 1114 / 1689 loss=3.699, nll_loss=2.165, ppl=4.48, wps=547299, ups=1.1, wpb=495865, bsz=16898.1, num_updates=39900, lr=0.000316624, gnorm=0.172, clip=0, loss_scale=1, train_wall=90, gb_free=21.6, wall=36922 epoch 024: 1114 / 1689 loss=3.699, nll_loss=2.165, ppl=4.48, wps=547299, ups=1.1, wpb=495865, bsz=16898.1, num_updates=39900, lr=0.000316624, gnorm=0.172, clip=0, loss_scale=1, train_wall=90, gb_free=21.6, wall=36922 epoch 024: 1114 / 1689 loss=3.699, nll_loss=2.165, ppl=4.48, wps=547299, ups=1.1, wpb=495865, bsz=16898.1, num_updates=39900, lr=0.000316624, gnorm=0.172, clip=0, loss_scale=1, train_wall=90, gb_free=21.6, wall=36922 epoch 024: 1114 / 1689 loss=3.699, nll_loss=2.165, ppl=4.48, wps=547299, ups=1.1, wpb=495865, bsz=16898.1, num_updates=39900, lr=0.000316624, gnorm=0.172, clip=0, loss_scale=1, train_wall=90, gb_free=21.6, wall=36922 epoch 024: 1114 / 1689 loss=3.699, nll_loss=2.165, ppl=4.48, wps=547299, ups=1.1, wpb=495865, bsz=16898.1, num_updates=39900, lr=0.000316624, gnorm=0.172, clip=0, loss_scale=1, train_wall=90, gb_free=21.6, wall=36922 epoch 024: 1114 / 1689 loss=3.699, nll_loss=2.165, ppl=4.48, wps=547299, ups=1.1, wpb=495865, bsz=16898.1, num_updates=39900, lr=0.000316624, gnorm=0.172, clip=0, loss_scale=1, train_wall=90, gb_free=21.6, wall=36922 epoch 024: 1114 / 1689 loss=3.699, nll_loss=2.165, ppl=4.48, wps=547299, ups=1.1, wpb=495865, bsz=16898.1, num_updates=39900, lr=0.000316624, gnorm=0.172, clip=0, loss_scale=1, train_wall=90, gb_free=21.6, wall=36922 epoch 024: 1114 / 1689 loss=3.699, nll_loss=2.165, ppl=4.48, wps=547299, ups=1.1, wpb=495865, bsz=16898.1, num_updates=39900, lr=0.000316624, gnorm=0.172, clip=0, loss_scale=1, train_wall=90, gb_free=21.6, wall=36922 epoch 024: 1114 / 1689 loss=3.699, nll_loss=2.165, ppl=4.48, wps=547299, ups=1.1, wpb=495865, bsz=16898.1, num_updates=39900, lr=0.000316624, gnorm=0.172, clip=0, loss_scale=1, train_wall=90, gb_free=21.6, wall=36922 epoch 024: 1114 / 1689 loss=3.699, nll_loss=2.165, ppl=4.48, wps=547299, ups=1.1, wpb=495865, bsz=16898.1, num_updates=39900, lr=0.000316624, gnorm=0.172, clip=0, loss_scale=1, train_wall=90, gb_free=21.6, wall=36922 epoch 024: 1114 / 1689 loss=3.699, nll_loss=2.165, ppl=4.48, wps=547299, ups=1.1, wpb=495865, bsz=16898.1, num_updates=39900, lr=0.000316624, gnorm=0.172, clip=0, loss_scale=1, train_wall=90, gb_free=21.6, wall=36922 epoch 024: 1114 / 1689 loss=3.699, nll_loss=2.165, ppl=4.48, wps=547299, ups=1.1, wpb=495865, bsz=16898.1, num_updates=39900, lr=0.000316624, gnorm=0.172, clip=0, loss_scale=1, train_wall=90, gb_free=21.6, wall=36922 epoch 024: 1114 / 1689 loss=3.699, nll_loss=2.165, ppl=4.48, wps=547299, ups=1.1, wpb=495865, bsz=16898.1, num_updates=39900, lr=0.000316624, gnorm=0.172, clip=0, loss_scale=1, train_wall=90, gb_free=21.6, wall=36922 epoch 024: 1114 / 1689 loss=3.699, nll_loss=2.165, ppl=4.48, wps=547299, ups=1.1, wpb=495865, bsz=16898.1, num_updates=39900, lr=0.000316624, gnorm=0.172, clip=0, loss_scale=1, train_wall=90, gb_free=21.6, wall=36922 epoch 024: 1114 / 1689 loss=3.699, nll_loss=2.165, ppl=4.48, wps=547299, ups=1.1, wpb=495865, bsz=16898.1, num_updates=39900, lr=0.000316624, gnorm=0.172, clip=0, loss_scale=1, train_wall=90, gb_free=21.6, wall=36922 epoch 024: 1114 / 1689 loss=3.699, nll_loss=2.165, ppl=4.48, wps=547299, ups=1.1, wpb=495865, bsz=16898.1, num_updates=39900, lr=0.000316624, gnorm=0.172, clip=0, loss_scale=1, train_wall=90, gb_free=21.6, wall=36922 epoch 024: 1114 / 1689 loss=3.699, nll_loss=2.165, ppl=4.48, wps=547299, ups=1.1, wpb=495865, bsz=16898.1, num_updates=39900, lr=0.000316624, gnorm=0.172, clip=0, loss_scale=1, train_wall=90, gb_free=21.6, wall=36922 epoch 024: 1114 / 1689 loss=3.699, nll_loss=2.165, ppl=4.48, wps=547299, ups=1.1, wpb=495865, bsz=16898.1, num_updates=39900, lr=0.000316624, gnorm=0.172, clip=0, loss_scale=1, train_wall=90, gb_free=21.6, wall=36922 epoch 024: 1214 / 1689 loss=3.696, nll_loss=2.162, ppl=4.47, wps=548192, ups=1.11, wpb=494599, bsz=16687.8, num_updates=40000, lr=0.000316228, gnorm=0.18, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=37013 epoch 024: 1214 / 1689 loss=3.696, nll_loss=2.162, ppl=4.47, wps=548192, ups=1.11, wpb=494599, bsz=16687.8, num_updates=40000, lr=0.000316228, gnorm=0.18, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=37013 epoch 024: 1214 / 1689 loss=3.696, nll_loss=2.162, ppl=4.47, wps=548192, ups=1.11, wpb=494599, bsz=16687.8, num_updates=40000, lr=0.000316228, gnorm=0.18, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=37013 epoch 024: 1214 / 1689 loss=3.696, nll_loss=2.162, ppl=4.47, wps=548192, ups=1.11, wpb=494599, bsz=16687.8, num_updates=40000, lr=0.000316228, gnorm=0.18, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=37013 epoch 024: 1214 / 1689 loss=3.696, nll_loss=2.162, ppl=4.47, wps=548192, ups=1.11, wpb=494599, bsz=16687.8, num_updates=40000, lr=0.000316228, gnorm=0.18, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=37013 epoch 024: 1214 / 1689 loss=3.696, nll_loss=2.162, ppl=4.47, wps=548192, ups=1.11, wpb=494599, bsz=16687.8, num_updates=40000, lr=0.000316228, gnorm=0.18, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=37013 epoch 024: 1214 / 1689 loss=3.696, nll_loss=2.162, ppl=4.47, wps=548192, ups=1.11, wpb=494599, bsz=16687.8, num_updates=40000, lr=0.000316228, gnorm=0.18, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=37013 epoch 024: 1214 / 1689 loss=3.696, nll_loss=2.162, ppl=4.47, wps=548192, ups=1.11, wpb=494599, bsz=16687.8, num_updates=40000, lr=0.000316228, gnorm=0.18, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=37013 epoch 024: 1214 / 1689 loss=3.696, nll_loss=2.162, ppl=4.47, wps=548192, ups=1.11, wpb=494599, bsz=16687.8, num_updates=40000, lr=0.000316228, gnorm=0.18, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=37013 epoch 024: 1214 / 1689 loss=3.696, nll_loss=2.162, ppl=4.47, wps=548192, ups=1.11, wpb=494599, bsz=16687.8, num_updates=40000, lr=0.000316228, gnorm=0.18, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=37013 epoch 024: 1214 / 1689 loss=3.696, nll_loss=2.162, ppl=4.47, wps=548192, ups=1.11, wpb=494599, bsz=16687.8, num_updates=40000, lr=0.000316228, gnorm=0.18, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=37013 epoch 024: 1214 / 1689 loss=3.696, nll_loss=2.162, ppl=4.47, wps=548192, ups=1.11, wpb=494599, bsz=16687.8, num_updates=40000, lr=0.000316228, gnorm=0.18, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=37013 epoch 024: 1214 / 1689 loss=3.696, nll_loss=2.162, ppl=4.47, wps=548192, ups=1.11, wpb=494599, bsz=16687.8, num_updates=40000, lr=0.000316228, gnorm=0.18, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=37013 epoch 024: 1214 / 1689 loss=3.696, nll_loss=2.162, ppl=4.47, wps=548192, ups=1.11, wpb=494599, bsz=16687.8, num_updates=40000, lr=0.000316228, gnorm=0.18, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=37013 epoch 024: 1214 / 1689 loss=3.696, nll_loss=2.162, ppl=4.47, wps=548192, ups=1.11, wpb=494599, bsz=16687.8, num_updates=40000, lr=0.000316228, gnorm=0.18, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=37013 epoch 024: 1214 / 1689 loss=3.696, nll_loss=2.162, ppl=4.47, wps=548192, ups=1.11, wpb=494599, bsz=16687.8, num_updates=40000, lr=0.000316228, gnorm=0.18, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=37013 epoch 024: 1214 / 1689 loss=3.696, nll_loss=2.162, ppl=4.47, wps=548192, ups=1.11, wpb=494599, bsz=16687.8, num_updates=40000, lr=0.000316228, gnorm=0.18, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=37013 epoch 024: 1214 / 1689 loss=3.696, nll_loss=2.162, ppl=4.47, wps=548192, ups=1.11, wpb=494599, bsz=16687.8, num_updates=40000, lr=0.000316228, gnorm=0.18, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=37013 epoch 024: 1214 / 1689 loss=3.696, nll_loss=2.162, ppl=4.47, wps=548192, ups=1.11, wpb=494599, bsz=16687.8, num_updates=40000, lr=0.000316228, gnorm=0.18, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=37013 epoch 024: 1214 / 1689 loss=3.696, nll_loss=2.162, ppl=4.47, wps=548192, ups=1.11, wpb=494599, bsz=16687.8, num_updates=40000, lr=0.000316228, gnorm=0.18, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=37013 epoch 024: 1214 / 1689 loss=3.696, nll_loss=2.162, ppl=4.47, wps=548192, ups=1.11, wpb=494599, bsz=16687.8, num_updates=40000, lr=0.000316228, gnorm=0.18, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=37013 epoch 024: 1214 / 1689 loss=3.696, nll_loss=2.162, ppl=4.47, wps=548192, ups=1.11, wpb=494599, bsz=16687.8, num_updates=40000, lr=0.000316228, gnorm=0.18, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=37013 epoch 024: 1214 / 1689 loss=3.696, nll_loss=2.162, ppl=4.47, wps=548192, ups=1.11, wpb=494599, bsz=16687.8, num_updates=40000, lr=0.000316228, gnorm=0.18, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=37013 epoch 024: 1214 / 1689 loss=3.696, nll_loss=2.162, ppl=4.47, wps=548192, ups=1.11, wpb=494599, bsz=16687.8, num_updates=40000, lr=0.000316228, gnorm=0.18, clip=0, loss_scale=1, train_wall=90, gb_free=21.4, wall=37013 begin validation on "valid" subset epoch 024 | valid on 'valid' subset | loss 3.721 | nll_loss 2.159 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.72 epoch 024 | valid on 'valid' subset | loss 3.721 | nll_loss 2.159 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.72 epoch 024 | valid on 'valid' subset | loss 3.721 | nll_loss 2.159 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.72 epoch 024 | valid on 'valid' subset | loss 3.721 | nll_loss 2.159 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.72 epoch 024 | valid on 'valid' subset | loss 3.721 | nll_loss 2.159 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.72 epoch 024 | valid on 'valid' subset | loss 3.721 | nll_loss 2.159 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.72 epoch 024 | valid on 'valid' subset | loss 3.721 | nll_loss 2.159 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.72 epoch 024 | valid on 'valid' subset | loss 3.721 | nll_loss 2.159 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.72 epoch 024 | valid on 'valid' subset | loss 3.721 | nll_loss 2.159 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.72 epoch 024 | valid on 'valid' subset | loss 3.721 | nll_loss 2.159 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.72 epoch 024 | valid on 'valid' subset | loss 3.721 | nll_loss 2.159 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.72 epoch 024 | valid on 'valid' subset | loss 3.721 | nll_loss 2.159 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.72 epoch 024 | valid on 'valid' subset | loss 3.721 | nll_loss 2.159 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.72 epoch 024 | valid on 'valid' subset | loss 3.721 | nll_loss 2.159 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.72 epoch 024 | valid on 'valid' subset | loss 3.721 | nll_loss 2.159 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.72 epoch 024 | valid on 'valid' subset | loss 3.721 | nll_loss 2.159 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.72 epoch 024 | valid on 'valid' subset | loss 3.721 | nll_loss 2.159 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.72 epoch 024 | valid on 'valid' subset | loss 3.721 | nll_loss 2.159 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.72 epoch 024 | valid on 'valid' subset | loss 3.721 | nll_loss 2.159 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.72 epoch 024 | valid on 'valid' subset | loss 3.721 | nll_loss 2.159 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.72 epoch 024 | valid on 'valid' subset | loss 3.721 | nll_loss 2.159 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.72 epoch 024 | valid on 'valid' subset | loss 3.721 | nll_loss 2.159 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.72 epoch 024 | valid on 'valid' subset | loss 3.721 | nll_loss 2.159 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.72 epoch 024 | valid on 'valid' subset | loss 3.721 | nll_loss 2.159 | ppl 4.47 | wps 0 | wpb 44526 | bsz 2008 | num_updates 40000 | best_loss 3.72 epoch 024: 1314 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=419419, ups=0.85, wpb=494659, bsz=16573.2, num_updates=40100, lr=0.000315833, gnorm=0.171, clip=0, loss_scale=1, train_wall=99, gb_free=20.7, wall=37130 epoch 024: 1314 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=419419, ups=0.85, wpb=494659, bsz=16573.2, num_updates=40100, lr=0.000315833, gnorm=0.171, clip=0, loss_scale=1, train_wall=99, gb_free=20.7, wall=37130 epoch 024: 1314 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=419419, ups=0.85, wpb=494659, bsz=16573.2, num_updates=40100, lr=0.000315833, gnorm=0.171, clip=0, loss_scale=1, train_wall=99, gb_free=20.7, wall=37130 epoch 024: 1314 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=419419, ups=0.85, wpb=494659, bsz=16573.2, num_updates=40100, lr=0.000315833, gnorm=0.171, clip=0, loss_scale=1, train_wall=99, gb_free=20.7, wall=37130 epoch 024: 1314 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=419419, ups=0.85, wpb=494659, bsz=16573.2, num_updates=40100, lr=0.000315833, gnorm=0.171, clip=0, loss_scale=1, train_wall=99, gb_free=20.7, wall=37130 epoch 024: 1314 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=419419, ups=0.85, wpb=494659, bsz=16573.2, num_updates=40100, lr=0.000315833, gnorm=0.171, clip=0, loss_scale=1, train_wall=99, gb_free=20.7, wall=37130 epoch 024: 1314 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=419419, ups=0.85, wpb=494659, bsz=16573.2, num_updates=40100, lr=0.000315833, gnorm=0.171, clip=0, loss_scale=1, train_wall=99, gb_free=20.7, wall=37130 epoch 024: 1314 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=419419, ups=0.85, wpb=494659, bsz=16573.2, num_updates=40100, lr=0.000315833, gnorm=0.171, clip=0, loss_scale=1, train_wall=99, gb_free=20.7, wall=37130 epoch 024: 1314 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=419419, ups=0.85, wpb=494659, bsz=16573.2, num_updates=40100, lr=0.000315833, gnorm=0.171, clip=0, loss_scale=1, train_wall=99, gb_free=20.7, wall=37130 epoch 024: 1314 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=419419, ups=0.85, wpb=494659, bsz=16573.2, num_updates=40100, lr=0.000315833, gnorm=0.171, clip=0, loss_scale=1, train_wall=99, gb_free=20.7, wall=37130 epoch 024: 1314 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=419419, ups=0.85, wpb=494659, bsz=16573.2, num_updates=40100, lr=0.000315833, gnorm=0.171, clip=0, loss_scale=1, train_wall=99, gb_free=20.7, wall=37130 epoch 024: 1314 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=419419, ups=0.85, wpb=494659, bsz=16573.2, num_updates=40100, lr=0.000315833, gnorm=0.171, clip=0, loss_scale=1, train_wall=99, gb_free=20.7, wall=37130 epoch 024: 1314 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=419419, ups=0.85, wpb=494659, bsz=16573.2, num_updates=40100, lr=0.000315833, gnorm=0.171, clip=0, loss_scale=1, train_wall=99, gb_free=20.7, wall=37130 epoch 024: 1314 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=419419, ups=0.85, wpb=494659, bsz=16573.2, num_updates=40100, lr=0.000315833, gnorm=0.171, clip=0, loss_scale=1, train_wall=99, gb_free=20.7, wall=37130 epoch 024: 1314 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=419419, ups=0.85, wpb=494659, bsz=16573.2, num_updates=40100, lr=0.000315833, gnorm=0.171, clip=0, loss_scale=1, train_wall=99, gb_free=20.7, wall=37130 epoch 024: 1314 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=419419, ups=0.85, wpb=494659, bsz=16573.2, num_updates=40100, lr=0.000315833, gnorm=0.171, clip=0, loss_scale=1, train_wall=99, gb_free=20.7, wall=37130 epoch 024: 1314 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=419419, ups=0.85, wpb=494659, bsz=16573.2, num_updates=40100, lr=0.000315833, gnorm=0.171, clip=0, loss_scale=1, train_wall=99, gb_free=20.7, wall=37130 epoch 024: 1314 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=419419, ups=0.85, wpb=494659, bsz=16573.2, num_updates=40100, lr=0.000315833, gnorm=0.171, clip=0, loss_scale=1, train_wall=99, gb_free=20.7, wall=37130 epoch 024: 1314 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=419419, ups=0.85, wpb=494659, bsz=16573.2, num_updates=40100, lr=0.000315833, gnorm=0.171, clip=0, loss_scale=1, train_wall=99, gb_free=20.7, wall=37130 epoch 024: 1314 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=419419, ups=0.85, wpb=494659, bsz=16573.2, num_updates=40100, lr=0.000315833, gnorm=0.171, clip=0, loss_scale=1, train_wall=99, gb_free=20.7, wall=37130 epoch 024: 1314 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=419419, ups=0.85, wpb=494659, bsz=16573.2, num_updates=40100, lr=0.000315833, gnorm=0.171, clip=0, loss_scale=1, train_wall=99, gb_free=20.7, wall=37130 epoch 024: 1314 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=419419, ups=0.85, wpb=494659, bsz=16573.2, num_updates=40100, lr=0.000315833, gnorm=0.171, clip=0, loss_scale=1, train_wall=99, gb_free=20.7, wall=37130 epoch 024: 1314 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=419419, ups=0.85, wpb=494659, bsz=16573.2, num_updates=40100, lr=0.000315833, gnorm=0.171, clip=0, loss_scale=1, train_wall=99, gb_free=20.7, wall=37130 epoch 024: 1314 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=419419, ups=0.85, wpb=494659, bsz=16573.2, num_updates=40100, lr=0.000315833, gnorm=0.171, clip=0, loss_scale=1, train_wall=99, gb_free=20.7, wall=37130 epoch 024: 1414 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=549789, ups=1.11, wpb=493950, bsz=16139.3, num_updates=40200, lr=0.00031544, gnorm=0.173, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=37220 epoch 024: 1414 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=549789, ups=1.11, wpb=493950, bsz=16139.3, num_updates=40200, lr=0.00031544, gnorm=0.173, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=37220 epoch 024: 1414 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=549789, ups=1.11, wpb=493950, bsz=16139.3, num_updates=40200, lr=0.00031544, gnorm=0.173, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=37220 epoch 024: 1414 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=549789, ups=1.11, wpb=493950, bsz=16139.3, num_updates=40200, lr=0.00031544, gnorm=0.173, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=37220 epoch 024: 1414 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=549789, ups=1.11, wpb=493950, bsz=16139.3, num_updates=40200, lr=0.00031544, gnorm=0.173, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=37220 epoch 024: 1414 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=549789, ups=1.11, wpb=493950, bsz=16139.3, num_updates=40200, lr=0.00031544, gnorm=0.173, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=37220 epoch 024: 1414 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=549789, ups=1.11, wpb=493950, bsz=16139.3, num_updates=40200, lr=0.00031544, gnorm=0.173, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=37220 epoch 024: 1414 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=549789, ups=1.11, wpb=493950, bsz=16139.3, num_updates=40200, lr=0.00031544, gnorm=0.173, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=37220 epoch 024: 1414 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=549789, ups=1.11, wpb=493950, bsz=16139.3, num_updates=40200, lr=0.00031544, gnorm=0.173, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=37220 epoch 024: 1414 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=549789, ups=1.11, wpb=493950, bsz=16139.3, num_updates=40200, lr=0.00031544, gnorm=0.173, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=37220 epoch 024: 1414 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=549789, ups=1.11, wpb=493950, bsz=16139.3, num_updates=40200, lr=0.00031544, gnorm=0.173, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=37220 epoch 024: 1414 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=549789, ups=1.11, wpb=493950, bsz=16139.3, num_updates=40200, lr=0.00031544, gnorm=0.173, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=37220 epoch 024: 1414 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=549789, ups=1.11, wpb=493950, bsz=16139.3, num_updates=40200, lr=0.00031544, gnorm=0.173, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=37220 epoch 024: 1414 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=549789, ups=1.11, wpb=493950, bsz=16139.3, num_updates=40200, lr=0.00031544, gnorm=0.173, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=37220 epoch 024: 1414 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=549789, ups=1.11, wpb=493950, bsz=16139.3, num_updates=40200, lr=0.00031544, gnorm=0.173, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=37220 epoch 024: 1414 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=549789, ups=1.11, wpb=493950, bsz=16139.3, num_updates=40200, lr=0.00031544, gnorm=0.173, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=37220 epoch 024: 1414 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=549789, ups=1.11, wpb=493950, bsz=16139.3, num_updates=40200, lr=0.00031544, gnorm=0.173, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=37220 epoch 024: 1414 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=549789, ups=1.11, wpb=493950, bsz=16139.3, num_updates=40200, lr=0.00031544, gnorm=0.173, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=37220 epoch 024: 1414 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=549789, ups=1.11, wpb=493950, bsz=16139.3, num_updates=40200, lr=0.00031544, gnorm=0.173, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=37220 epoch 024: 1414 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=549789, ups=1.11, wpb=493950, bsz=16139.3, num_updates=40200, lr=0.00031544, gnorm=0.173, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=37220 epoch 024: 1414 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=549789, ups=1.11, wpb=493950, bsz=16139.3, num_updates=40200, lr=0.00031544, gnorm=0.173, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=37220 epoch 024: 1414 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=549789, ups=1.11, wpb=493950, bsz=16139.3, num_updates=40200, lr=0.00031544, gnorm=0.173, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=37220 epoch 024: 1414 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=549789, ups=1.11, wpb=493950, bsz=16139.3, num_updates=40200, lr=0.00031544, gnorm=0.173, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=37220 epoch 024: 1414 / 1689 loss=3.701, nll_loss=2.167, ppl=4.49, wps=549789, ups=1.11, wpb=493950, bsz=16139.3, num_updates=40200, lr=0.00031544, gnorm=0.173, clip=0, loss_scale=1, train_wall=89, gb_free=21.2, wall=37220 epoch 024: 1514 / 1689 loss=3.708, nll_loss=2.175, ppl=4.51, wps=551213, ups=1.11, wpb=495209, bsz=16564.4, num_updates=40300, lr=0.000315049, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=37310 epoch 024: 1514 / 1689 loss=3.708, nll_loss=2.175, ppl=4.51, wps=551213, ups=1.11, wpb=495209, bsz=16564.4, num_updates=40300, lr=0.000315049, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=37310 epoch 024: 1514 / 1689 loss=3.708, nll_loss=2.175, ppl=4.51, wps=551213, ups=1.11, wpb=495209, bsz=16564.4, num_updates=40300, lr=0.000315049, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=37310 epoch 024: 1514 / 1689 loss=3.708, nll_loss=2.175, ppl=4.51, wps=551213, ups=1.11, wpb=495209, bsz=16564.4, num_updates=40300, lr=0.000315049, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=37310 epoch 024: 1514 / 1689 loss=3.708, nll_loss=2.175, ppl=4.51, wps=551213, ups=1.11, wpb=495209, bsz=16564.4, num_updates=40300, lr=0.000315049, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=37310 epoch 024: 1514 / 1689 loss=3.708, nll_loss=2.175, ppl=4.51, wps=551213, ups=1.11, wpb=495209, bsz=16564.4, num_updates=40300, lr=0.000315049, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=37310 epoch 024: 1514 / 1689 loss=3.708, nll_loss=2.175, ppl=4.51, wps=551213, ups=1.11, wpb=495209, bsz=16564.4, num_updates=40300, lr=0.000315049, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=37310 epoch 024: 1514 / 1689 loss=3.708, nll_loss=2.175, ppl=4.51, wps=551213, ups=1.11, wpb=495209, bsz=16564.4, num_updates=40300, lr=0.000315049, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=37310 epoch 024: 1514 / 1689 loss=3.708, nll_loss=2.175, ppl=4.51, wps=551213, ups=1.11, wpb=495209, bsz=16564.4, num_updates=40300, lr=0.000315049, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=37310 epoch 024: 1514 / 1689 loss=3.708, nll_loss=2.175, ppl=4.51, wps=551213, ups=1.11, wpb=495209, bsz=16564.4, num_updates=40300, lr=0.000315049, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=37310 epoch 024: 1514 / 1689 loss=3.708, nll_loss=2.175, ppl=4.51, wps=551213, ups=1.11, wpb=495209, bsz=16564.4, num_updates=40300, lr=0.000315049, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=37310 epoch 024: 1514 / 1689 loss=3.708, nll_loss=2.175, ppl=4.51, wps=551213, ups=1.11, wpb=495209, bsz=16564.4, num_updates=40300, lr=0.000315049, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=37310 epoch 024: 1514 / 1689 loss=3.708, nll_loss=2.175, ppl=4.51, wps=551213, ups=1.11, wpb=495209, bsz=16564.4, num_updates=40300, lr=0.000315049, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=37310 epoch 024: 1514 / 1689 loss=3.708, nll_loss=2.175, ppl=4.51, wps=551213, ups=1.11, wpb=495209, bsz=16564.4, num_updates=40300, lr=0.000315049, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=37310 epoch 024: 1514 / 1689 loss=3.708, nll_loss=2.175, ppl=4.51, wps=551213, ups=1.11, wpb=495209, bsz=16564.4, num_updates=40300, lr=0.000315049, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=37310 epoch 024: 1514 / 1689 loss=3.708, nll_loss=2.175, ppl=4.51, wps=551213, ups=1.11, wpb=495209, bsz=16564.4, num_updates=40300, lr=0.000315049, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=37310 epoch 024: 1514 / 1689 loss=3.708, nll_loss=2.175, ppl=4.51, wps=551213, ups=1.11, wpb=495209, bsz=16564.4, num_updates=40300, lr=0.000315049, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=37310 epoch 024: 1514 / 1689 loss=3.708, nll_loss=2.175, ppl=4.51, wps=551213, ups=1.11, wpb=495209, bsz=16564.4, num_updates=40300, lr=0.000315049, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=37310 epoch 024: 1514 / 1689 loss=3.708, nll_loss=2.175, ppl=4.51, wps=551213, ups=1.11, wpb=495209, bsz=16564.4, num_updates=40300, lr=0.000315049, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=37310 epoch 024: 1514 / 1689 loss=3.708, nll_loss=2.175, ppl=4.51, wps=551213, ups=1.11, wpb=495209, bsz=16564.4, num_updates=40300, lr=0.000315049, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=37310 epoch 024: 1514 / 1689 loss=3.708, nll_loss=2.175, ppl=4.51, wps=551213, ups=1.11, wpb=495209, bsz=16564.4, num_updates=40300, lr=0.000315049, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=37310 epoch 024: 1514 / 1689 loss=3.708, nll_loss=2.175, ppl=4.51, wps=551213, ups=1.11, wpb=495209, bsz=16564.4, num_updates=40300, lr=0.000315049, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=37310 epoch 024: 1514 / 1689 loss=3.708, nll_loss=2.175, ppl=4.51, wps=551213, ups=1.11, wpb=495209, bsz=16564.4, num_updates=40300, lr=0.000315049, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=37310 epoch 024: 1514 / 1689 loss=3.708, nll_loss=2.175, ppl=4.51, wps=551213, ups=1.11, wpb=495209, bsz=16564.4, num_updates=40300, lr=0.000315049, gnorm=0.174, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=37310 epoch 024: 1614 / 1689 loss=3.704, nll_loss=2.171, ppl=4.5, wps=552741, ups=1.11, wpb=496241, bsz=16528.7, num_updates=40400, lr=0.000314658, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=37400 epoch 024: 1614 / 1689 loss=3.704, nll_loss=2.171, ppl=4.5, wps=552741, ups=1.11, wpb=496241, bsz=16528.7, num_updates=40400, lr=0.000314658, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=37400 epoch 024: 1614 / 1689 loss=3.704, nll_loss=2.171, ppl=4.5, wps=552741, ups=1.11, wpb=496241, bsz=16528.7, num_updates=40400, lr=0.000314658, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=37400 epoch 024: 1614 / 1689 loss=3.704, nll_loss=2.171, ppl=4.5, wps=552741, ups=1.11, wpb=496241, bsz=16528.7, num_updates=40400, lr=0.000314658, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=37400 epoch 024: 1614 / 1689 loss=3.704, nll_loss=2.171, ppl=4.5, wps=552741, ups=1.11, wpb=496241, bsz=16528.7, num_updates=40400, lr=0.000314658, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=37400 epoch 024: 1614 / 1689 loss=3.704, nll_loss=2.171, ppl=4.5, wps=552741, ups=1.11, wpb=496241, bsz=16528.7, num_updates=40400, lr=0.000314658, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=37400 epoch 024: 1614 / 1689 loss=3.704, nll_loss=2.171, ppl=4.5, wps=552741, ups=1.11, wpb=496241, bsz=16528.7, num_updates=40400, lr=0.000314658, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=37400 epoch 024: 1614 / 1689 loss=3.704, nll_loss=2.171, ppl=4.5, wps=552741, ups=1.11, wpb=496241, bsz=16528.7, num_updates=40400, lr=0.000314658, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=37400 epoch 024: 1614 / 1689 loss=3.704, nll_loss=2.171, ppl=4.5, wps=552741, ups=1.11, wpb=496241, bsz=16528.7, num_updates=40400, lr=0.000314658, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=37400 epoch 024: 1614 / 1689 loss=3.704, nll_loss=2.171, ppl=4.5, wps=552741, ups=1.11, wpb=496241, bsz=16528.7, num_updates=40400, lr=0.000314658, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=37400 epoch 024: 1614 / 1689 loss=3.704, nll_loss=2.171, ppl=4.5, wps=552741, ups=1.11, wpb=496241, bsz=16528.7, num_updates=40400, lr=0.000314658, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=37400 epoch 024: 1614 / 1689 loss=3.704, nll_loss=2.171, ppl=4.5, wps=552741, ups=1.11, wpb=496241, bsz=16528.7, num_updates=40400, lr=0.000314658, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=37400 epoch 024: 1614 / 1689 loss=3.704, nll_loss=2.171, ppl=4.5, wps=552741, ups=1.11, wpb=496241, bsz=16528.7, num_updates=40400, lr=0.000314658, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=37400 epoch 024: 1614 / 1689 loss=3.704, nll_loss=2.171, ppl=4.5, wps=552741, ups=1.11, wpb=496241, bsz=16528.7, num_updates=40400, lr=0.000314658, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=37400 epoch 024: 1614 / 1689 loss=3.704, nll_loss=2.171, ppl=4.5, wps=552741, ups=1.11, wpb=496241, bsz=16528.7, num_updates=40400, lr=0.000314658, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=37400 epoch 024: 1614 / 1689 loss=3.704, nll_loss=2.171, ppl=4.5, wps=552741, ups=1.11, wpb=496241, bsz=16528.7, num_updates=40400, lr=0.000314658, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=37400 epoch 024: 1614 / 1689 loss=3.704, nll_loss=2.171, ppl=4.5, wps=552741, ups=1.11, wpb=496241, bsz=16528.7, num_updates=40400, lr=0.000314658, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=37400 epoch 024: 1614 / 1689 loss=3.704, nll_loss=2.171, ppl=4.5, wps=552741, ups=1.11, wpb=496241, bsz=16528.7, num_updates=40400, lr=0.000314658, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=37400 epoch 024: 1614 / 1689 loss=3.704, nll_loss=2.171, ppl=4.5, wps=552741, ups=1.11, wpb=496241, bsz=16528.7, num_updates=40400, lr=0.000314658, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=37400 epoch 024: 1614 / 1689 loss=3.704, nll_loss=2.171, ppl=4.5, wps=552741, ups=1.11, wpb=496241, bsz=16528.7, num_updates=40400, lr=0.000314658, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=37400 epoch 024: 1614 / 1689 loss=3.704, nll_loss=2.171, ppl=4.5, wps=552741, ups=1.11, wpb=496241, bsz=16528.7, num_updates=40400, lr=0.000314658, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=37400 epoch 024: 1614 / 1689 loss=3.704, nll_loss=2.171, ppl=4.5, wps=552741, ups=1.11, wpb=496241, bsz=16528.7, num_updates=40400, lr=0.000314658, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=37400 epoch 024: 1614 / 1689 loss=3.704, nll_loss=2.171, ppl=4.5, wps=552741, ups=1.11, wpb=496241, bsz=16528.7, num_updates=40400, lr=0.000314658, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=37400 epoch 024: 1614 / 1689 loss=3.704, nll_loss=2.171, ppl=4.5, wps=552741, ups=1.11, wpb=496241, bsz=16528.7, num_updates=40400, lr=0.000314658, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=37400 end of epoch 24 (average epoch stats below) epoch 024 | loss 3.698 | nll_loss 2.163 | ppl 4.48 | wps 531903 | ups 1.07 | wpb 495093 | bsz 16503.4 | num_updates 40475 | lr 0.000314367 | gnorm 0.173 | clip 0 | loss_scale 2 | train_wall 1512 | gb_free 22.9 | wall 37467 epoch 024 | loss 3.698 | nll_loss 2.163 | ppl 4.48 | wps 531903 | ups 1.07 | wpb 495093 | bsz 16503.4 | num_updates 40475 | lr 0.000314367 | gnorm 0.173 | clip 0 | loss_scale 2 | train_wall 1512 | gb_free 22.9 | wall 37467 epoch 024 | loss 3.698 | nll_loss 2.163 | ppl 4.48 | wps 531903 | ups 1.07 | wpb 495093 | bsz 16503.4 | num_updates 40475 | lr 0.000314367 | gnorm 0.173 | clip 0 | loss_scale 2 | train_wall 1512 | gb_free 22.9 | wall 37467 epoch 024 | loss 3.698 | nll_loss 2.163 | ppl 4.48 | wps 531903 | ups 1.07 | wpb 495093 | bsz 16503.4 | num_updates 40475 | lr 0.000314367 | gnorm 0.173 | clip 0 | loss_scale 2 | train_wall 1512 | gb_free 22.9 | wall 37467 epoch 024 | loss 3.698 | nll_loss 2.163 | ppl 4.48 | wps 531903 | ups 1.07 | wpb 495093 | bsz 16503.4 | num_updates 40475 | lr 0.000314367 | gnorm 0.173 | clip 0 | loss_scale 2 | train_wall 1512 | gb_free 22.9 | wall 37467 epoch 024 | loss 3.698 | nll_loss 2.163 | ppl 4.48 | wps 531903 | ups 1.07 | wpb 495093 | bsz 16503.4 | num_updates 40475 | lr 0.000314367 | gnorm 0.173 | clip 0 | loss_scale 2 | train_wall 1512 | gb_free 22.9 | wall 37467 epoch 024 | loss 3.698 | nll_loss 2.163 | ppl 4.48 | wps 531903 | ups 1.07 | wpb 495093 | bsz 16503.4 | num_updates 40475 | lr 0.000314367 | gnorm 0.173 | clip 0 | loss_scale 2 | train_wall 1512 | gb_free 22.9 | wall 37467 epoch 024 | loss 3.698 | nll_loss 2.163 | ppl 4.48 | wps 531903 | ups 1.07 | wpb 495093 | bsz 16503.4 | num_updates 40475 | lr 0.000314367 | gnorm 0.173 | clip 0 | loss_scale 2 | train_wall 1512 | gb_free 22.9 | wall 37467 epoch 024 | loss 3.698 | nll_loss 2.163 | ppl 4.48 | wps 531903 | ups 1.07 | wpb 495093 | bsz 16503.4 | num_updates 40475 | lr 0.000314367 | gnorm 0.173 | clip 0 | loss_scale 2 | train_wall 1512 | gb_free 22.9 | wall 37467 epoch 024 | loss 3.698 | nll_loss 2.163 | ppl 4.48 | wps 531903 | ups 1.07 | wpb 495093 | bsz 16503.4 | num_updates 40475 | lr 0.000314367 | gnorm 0.173 | clip 0 | loss_scale 2 | train_wall 1512 | gb_free 22.9 | wall 37467 epoch 024 | loss 3.698 | nll_loss 2.163 | ppl 4.48 | wps 531903 | ups 1.07 | wpb 495093 | bsz 16503.4 | num_updates 40475 | lr 0.000314367 | gnorm 0.173 | clip 0 | loss_scale 2 | train_wall 1512 | gb_free 22.9 | wall 37467 epoch 024 | loss 3.698 | nll_loss 2.163 | ppl 4.48 | wps 531903 | ups 1.07 | wpb 495093 | bsz 16503.4 | num_updates 40475 | lr 0.000314367 | gnorm 0.173 | clip 0 | loss_scale 2 | train_wall 1512 | gb_free 22.9 | wall 37467 epoch 024 | loss 3.698 | nll_loss 2.163 | ppl 4.48 | wps 531903 | ups 1.07 | wpb 495093 | bsz 16503.4 | num_updates 40475 | lr 0.000314367 | gnorm 0.173 | clip 0 | loss_scale 2 | train_wall 1512 | gb_free 22.9 | wall 37467 epoch 024 | loss 3.698 | nll_loss 2.163 | ppl 4.48 | wps 531903 | ups 1.07 | wpb 495093 | bsz 16503.4 | num_updates 40475 | lr 0.000314367 | gnorm 0.173 | clip 0 | loss_scale 2 | train_wall 1512 | gb_free 22.9 | wall 37467 epoch 024 | loss 3.698 | nll_loss 2.163 | ppl 4.48 | wps 531903 | ups 1.07 | wpb 495093 | bsz 16503.4 | num_updates 40475 | lr 0.000314367 | gnorm 0.173 | clip 0 | loss_scale 2 | train_wall 1512 | gb_free 22.9 | wall 37467 epoch 024 | loss 3.698 | nll_loss 2.163 | ppl 4.48 | wps 531903 | ups 1.07 | wpb 495093 | bsz 16503.4 | num_updates 40475 | lr 0.000314367 | gnorm 0.173 | clip 0 | loss_scale 2 | train_wall 1512 | gb_free 22.9 | wall 37467 epoch 024 | loss 3.698 | nll_loss 2.163 | ppl 4.48 | wps 531903 | ups 1.07 | wpb 495093 | bsz 16503.4 | num_updates 40475 | lr 0.000314367 | gnorm 0.173 | clip 0 | loss_scale 2 | train_wall 1512 | gb_free 22.9 | wall 37467 epoch 024 | loss 3.698 | nll_loss 2.163 | ppl 4.48 | wps 531903 | ups 1.07 | wpb 495093 | bsz 16503.4 | num_updates 40475 | lr 0.000314367 | gnorm 0.173 | clip 0 | loss_scale 2 | train_wall 1512 | gb_free 22.9 | wall 37467 epoch 024 | loss 3.698 | nll_loss 2.163 | ppl 4.48 | wps 531903 | ups 1.07 | wpb 495093 | bsz 16503.4 | num_updates 40475 | lr 0.000314367 | gnorm 0.173 | clip 0 | loss_scale 2 | train_wall 1512 | gb_free 22.9 | wall 37467 epoch 024 | loss 3.698 | nll_loss 2.163 | ppl 4.48 | wps 531903 | ups 1.07 | wpb 495093 | bsz 16503.4 | num_updates 40475 | lr 0.000314367 | gnorm 0.173 | clip 0 | loss_scale 2 | train_wall 1512 | gb_free 22.9 | wall 37467 epoch 024 | loss 3.698 | nll_loss 2.163 | ppl 4.48 | wps 531903 | ups 1.07 | wpb 495093 | bsz 16503.4 | num_updates 40475 | lr 0.000314367 | gnorm 0.173 | clip 0 | loss_scale 2 | train_wall 1512 | gb_free 22.9 | wall 37467 epoch 024 | loss 3.698 | nll_loss 2.163 | ppl 4.48 | wps 531903 | ups 1.07 | wpb 495093 | bsz 16503.4 | num_updates 40475 | lr 0.000314367 | gnorm 0.173 | clip 0 | loss_scale 2 | train_wall 1512 | gb_free 22.9 | wall 37467 epoch 024 | loss 3.698 | nll_loss 2.163 | ppl 4.48 | wps 531903 | ups 1.07 | wpb 495093 | bsz 16503.4 | num_updates 40475 | lr 0.000314367 | gnorm 0.173 | clip 0 | loss_scale 2 | train_wall 1512 | gb_free 22.9 | wall 37467 epoch 024 | loss 3.698 | nll_loss 2.163 | ppl 4.48 | wps 531903 | ups 1.07 | wpb 495093 | bsz 16503.4 | num_updates 40475 | lr 0.000314367 | gnorm 0.173 | clip 0 | loss_scale 2 | train_wall 1512 | gb_free 22.9 | wall 37467 Start iterating over samples epoch 025: 25 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=542667, ups=1.11, wpb=490853, bsz=16154.9, num_updates=40500, lr=0.00031427, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37490 epoch 025: 25 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=542667, ups=1.11, wpb=490853, bsz=16154.9, num_updates=40500, lr=0.00031427, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37490 epoch 025: 25 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=542667, ups=1.11, wpb=490853, bsz=16154.9, num_updates=40500, lr=0.00031427, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37490 epoch 025: 25 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=542667, ups=1.11, wpb=490853, bsz=16154.9, num_updates=40500, lr=0.00031427, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37490 epoch 025: 25 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=542667, ups=1.11, wpb=490853, bsz=16154.9, num_updates=40500, lr=0.00031427, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37490 epoch 025: 25 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=542667, ups=1.11, wpb=490853, bsz=16154.9, num_updates=40500, lr=0.00031427, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37490 epoch 025: 25 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=542667, ups=1.11, wpb=490853, bsz=16154.9, num_updates=40500, lr=0.00031427, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37490 epoch 025: 25 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=542667, ups=1.11, wpb=490853, bsz=16154.9, num_updates=40500, lr=0.00031427, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37490 epoch 025: 25 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=542667, ups=1.11, wpb=490853, bsz=16154.9, num_updates=40500, lr=0.00031427, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37490 epoch 025: 25 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=542667, ups=1.11, wpb=490853, bsz=16154.9, num_updates=40500, lr=0.00031427, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37490 epoch 025: 25 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=542667, ups=1.11, wpb=490853, bsz=16154.9, num_updates=40500, lr=0.00031427, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37490 epoch 025: 25 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=542667, ups=1.11, wpb=490853, bsz=16154.9, num_updates=40500, lr=0.00031427, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37490 epoch 025: 25 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=542667, ups=1.11, wpb=490853, bsz=16154.9, num_updates=40500, lr=0.00031427, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37490 epoch 025: 25 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=542667, ups=1.11, wpb=490853, bsz=16154.9, num_updates=40500, lr=0.00031427, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37490 epoch 025: 25 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=542667, ups=1.11, wpb=490853, bsz=16154.9, num_updates=40500, lr=0.00031427, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37490 epoch 025: 25 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=542667, ups=1.11, wpb=490853, bsz=16154.9, num_updates=40500, lr=0.00031427, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37490 epoch 025: 25 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=542667, ups=1.11, wpb=490853, bsz=16154.9, num_updates=40500, lr=0.00031427, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37490 epoch 025: 25 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=542667, ups=1.11, wpb=490853, bsz=16154.9, num_updates=40500, lr=0.00031427, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37490 epoch 025: 25 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=542667, ups=1.11, wpb=490853, bsz=16154.9, num_updates=40500, lr=0.00031427, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37490 epoch 025: 25 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=542667, ups=1.11, wpb=490853, bsz=16154.9, num_updates=40500, lr=0.00031427, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37490 epoch 025: 25 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=542667, ups=1.11, wpb=490853, bsz=16154.9, num_updates=40500, lr=0.00031427, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37490 epoch 025: 25 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=542667, ups=1.11, wpb=490853, bsz=16154.9, num_updates=40500, lr=0.00031427, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37490 epoch 025: 25 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=542667, ups=1.11, wpb=490853, bsz=16154.9, num_updates=40500, lr=0.00031427, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37490 epoch 025: 25 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=542667, ups=1.11, wpb=490853, bsz=16154.9, num_updates=40500, lr=0.00031427, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37490 epoch 025: 25 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=542667, ups=1.11, wpb=490853, bsz=16154.9, num_updates=40500, lr=0.00031427, gnorm=0.183, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37490 epoch 025: 125 / 1689 loss=3.686, nll_loss=2.149, ppl=4.44, wps=549470, ups=1.11, wpb=494929, bsz=16283, num_updates=40600, lr=0.000313882, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=37580 epoch 025: 125 / 1689 loss=3.686, nll_loss=2.149, ppl=4.44, wps=549470, ups=1.11, wpb=494929, bsz=16283, num_updates=40600, lr=0.000313882, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=37580 epoch 025: 125 / 1689 loss=3.686, nll_loss=2.149, ppl=4.44, wps=549470, ups=1.11, wpb=494929, bsz=16283, num_updates=40600, lr=0.000313882, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=37580 epoch 025: 125 / 1689 loss=3.686, nll_loss=2.149, ppl=4.44, wps=549470, ups=1.11, wpb=494929, bsz=16283, num_updates=40600, lr=0.000313882, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=37580 epoch 025: 125 / 1689 loss=3.686, nll_loss=2.149, ppl=4.44, wps=549470, ups=1.11, wpb=494929, bsz=16283, num_updates=40600, lr=0.000313882, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=37580 epoch 025: 125 / 1689 loss=3.686, nll_loss=2.149, ppl=4.44, wps=549470, ups=1.11, wpb=494929, bsz=16283, num_updates=40600, lr=0.000313882, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=37580 epoch 025: 125 / 1689 loss=3.686, nll_loss=2.149, ppl=4.44, wps=549470, ups=1.11, wpb=494929, bsz=16283, num_updates=40600, lr=0.000313882, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=37580 epoch 025: 125 / 1689 loss=3.686, nll_loss=2.149, ppl=4.44, wps=549470, ups=1.11, wpb=494929, bsz=16283, num_updates=40600, lr=0.000313882, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=37580 epoch 025: 125 / 1689 loss=3.686, nll_loss=2.149, ppl=4.44, wps=549470, ups=1.11, wpb=494929, bsz=16283, num_updates=40600, lr=0.000313882, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=37580 epoch 025: 125 / 1689 loss=3.686, nll_loss=2.149, ppl=4.44, wps=549470, ups=1.11, wpb=494929, bsz=16283, num_updates=40600, lr=0.000313882, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=37580 epoch 025: 125 / 1689 loss=3.686, nll_loss=2.149, ppl=4.44, wps=549470, ups=1.11, wpb=494929, bsz=16283, num_updates=40600, lr=0.000313882, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=37580 epoch 025: 125 / 1689 loss=3.686, nll_loss=2.149, ppl=4.44, wps=549470, ups=1.11, wpb=494929, bsz=16283, num_updates=40600, lr=0.000313882, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=37580 epoch 025: 125 / 1689 loss=3.686, nll_loss=2.149, ppl=4.44, wps=549470, ups=1.11, wpb=494929, bsz=16283, num_updates=40600, lr=0.000313882, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=37580 epoch 025: 125 / 1689 loss=3.686, nll_loss=2.149, ppl=4.44, wps=549470, ups=1.11, wpb=494929, bsz=16283, num_updates=40600, lr=0.000313882, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=37580 epoch 025: 125 / 1689 loss=3.686, nll_loss=2.149, ppl=4.44, wps=549470, ups=1.11, wpb=494929, bsz=16283, num_updates=40600, lr=0.000313882, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=37580 epoch 025: 125 / 1689 loss=3.686, nll_loss=2.149, ppl=4.44, wps=549470, ups=1.11, wpb=494929, bsz=16283, num_updates=40600, lr=0.000313882, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=37580 epoch 025: 125 / 1689 loss=3.686, nll_loss=2.149, ppl=4.44, wps=549470, ups=1.11, wpb=494929, bsz=16283, num_updates=40600, lr=0.000313882, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=37580 epoch 025: 125 / 1689 loss=3.686, nll_loss=2.149, ppl=4.44, wps=549470, ups=1.11, wpb=494929, bsz=16283, num_updates=40600, lr=0.000313882, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=37580 epoch 025: 125 / 1689 loss=3.686, nll_loss=2.149, ppl=4.44, wps=549470, ups=1.11, wpb=494929, bsz=16283, num_updates=40600, lr=0.000313882, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=37580 epoch 025: 125 / 1689 loss=3.686, nll_loss=2.149, ppl=4.44, wps=549470, ups=1.11, wpb=494929, bsz=16283, num_updates=40600, lr=0.000313882, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=37580 epoch 025: 125 / 1689 loss=3.686, nll_loss=2.149, ppl=4.44, wps=549470, ups=1.11, wpb=494929, bsz=16283, num_updates=40600, lr=0.000313882, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=37580 epoch 025: 125 / 1689 loss=3.686, nll_loss=2.149, ppl=4.44, wps=549470, ups=1.11, wpb=494929, bsz=16283, num_updates=40600, lr=0.000313882, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=37580 epoch 025: 125 / 1689 loss=3.686, nll_loss=2.149, ppl=4.44, wps=549470, ups=1.11, wpb=494929, bsz=16283, num_updates=40600, lr=0.000313882, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=37580 epoch 025: 125 / 1689 loss=3.686, nll_loss=2.149, ppl=4.44, wps=549470, ups=1.11, wpb=494929, bsz=16283, num_updates=40600, lr=0.000313882, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=37580 epoch 025: 125 / 1689 loss=3.686, nll_loss=2.149, ppl=4.44, wps=549470, ups=1.11, wpb=494929, bsz=16283, num_updates=40600, lr=0.000313882, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.2, wall=37580 epoch 025: 225 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=552553, ups=1.11, wpb=496674, bsz=16527.4, num_updates=40700, lr=0.000313497, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37670 epoch 025: 225 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=552553, ups=1.11, wpb=496674, bsz=16527.4, num_updates=40700, lr=0.000313497, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37670 epoch 025: 225 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=552553, ups=1.11, wpb=496674, bsz=16527.4, num_updates=40700, lr=0.000313497, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37670 epoch 025: 225 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=552553, ups=1.11, wpb=496674, bsz=16527.4, num_updates=40700, lr=0.000313497, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37670 epoch 025: 225 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=552553, ups=1.11, wpb=496674, bsz=16527.4, num_updates=40700, lr=0.000313497, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37670 epoch 025: 225 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=552553, ups=1.11, wpb=496674, bsz=16527.4, num_updates=40700, lr=0.000313497, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37670 epoch 025: 225 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=552553, ups=1.11, wpb=496674, bsz=16527.4, num_updates=40700, lr=0.000313497, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37670 epoch 025: 225 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=552553, ups=1.11, wpb=496674, bsz=16527.4, num_updates=40700, lr=0.000313497, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37670 epoch 025: 225 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=552553, ups=1.11, wpb=496674, bsz=16527.4, num_updates=40700, lr=0.000313497, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37670 epoch 025: 225 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=552553, ups=1.11, wpb=496674, bsz=16527.4, num_updates=40700, lr=0.000313497, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37670 epoch 025: 225 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=552553, ups=1.11, wpb=496674, bsz=16527.4, num_updates=40700, lr=0.000313497, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37670 epoch 025: 225 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=552553, ups=1.11, wpb=496674, bsz=16527.4, num_updates=40700, lr=0.000313497, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37670 epoch 025: 225 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=552553, ups=1.11, wpb=496674, bsz=16527.4, num_updates=40700, lr=0.000313497, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37670 epoch 025: 225 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=552553, ups=1.11, wpb=496674, bsz=16527.4, num_updates=40700, lr=0.000313497, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37670 epoch 025: 225 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=552553, ups=1.11, wpb=496674, bsz=16527.4, num_updates=40700, lr=0.000313497, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37670 epoch 025: 225 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=552553, ups=1.11, wpb=496674, bsz=16527.4, num_updates=40700, lr=0.000313497, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37670 epoch 025: 225 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=552553, ups=1.11, wpb=496674, bsz=16527.4, num_updates=40700, lr=0.000313497, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37670 epoch 025: 225 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=552553, ups=1.11, wpb=496674, bsz=16527.4, num_updates=40700, lr=0.000313497, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37670 epoch 025: 225 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=552553, ups=1.11, wpb=496674, bsz=16527.4, num_updates=40700, lr=0.000313497, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37670 epoch 025: 225 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=552553, ups=1.11, wpb=496674, bsz=16527.4, num_updates=40700, lr=0.000313497, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37670 epoch 025: 225 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=552553, ups=1.11, wpb=496674, bsz=16527.4, num_updates=40700, lr=0.000313497, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37670 epoch 025: 225 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=552553, ups=1.11, wpb=496674, bsz=16527.4, num_updates=40700, lr=0.000313497, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37670 epoch 025: 225 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=552553, ups=1.11, wpb=496674, bsz=16527.4, num_updates=40700, lr=0.000313497, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37670 epoch 025: 225 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=552553, ups=1.11, wpb=496674, bsz=16527.4, num_updates=40700, lr=0.000313497, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37670 epoch 025: 225 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=552553, ups=1.11, wpb=496674, bsz=16527.4, num_updates=40700, lr=0.000313497, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.6, wall=37670 epoch 025: 325 / 1689 loss=3.691, nll_loss=2.156, ppl=4.46, wps=549774, ups=1.11, wpb=496439, bsz=16687.4, num_updates=40800, lr=0.000313112, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=37761 epoch 025: 325 / 1689 loss=3.691, nll_loss=2.156, ppl=4.46, wps=549774, ups=1.11, wpb=496439, bsz=16687.4, num_updates=40800, lr=0.000313112, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=37761 epoch 025: 325 / 1689 loss=3.691, nll_loss=2.156, ppl=4.46, wps=549774, ups=1.11, wpb=496439, bsz=16687.4, num_updates=40800, lr=0.000313112, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=37761 epoch 025: 325 / 1689 loss=3.691, nll_loss=2.156, ppl=4.46, wps=549774, ups=1.11, wpb=496439, bsz=16687.4, num_updates=40800, lr=0.000313112, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=37761 epoch 025: 325 / 1689 loss=3.691, nll_loss=2.156, ppl=4.46, wps=549774, ups=1.11, wpb=496439, bsz=16687.4, num_updates=40800, lr=0.000313112, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=37761 epoch 025: 325 / 1689 loss=3.691, nll_loss=2.156, ppl=4.46, wps=549774, ups=1.11, wpb=496439, bsz=16687.4, num_updates=40800, lr=0.000313112, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=37761 epoch 025: 325 / 1689 loss=3.691, nll_loss=2.156, ppl=4.46, wps=549774, ups=1.11, wpb=496439, bsz=16687.4, num_updates=40800, lr=0.000313112, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=37761 epoch 025: 325 / 1689 loss=3.691, nll_loss=2.156, ppl=4.46, wps=549774, ups=1.11, wpb=496439, bsz=16687.4, num_updates=40800, lr=0.000313112, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=37761 epoch 025: 325 / 1689 loss=3.691, nll_loss=2.156, ppl=4.46, wps=549774, ups=1.11, wpb=496439, bsz=16687.4, num_updates=40800, lr=0.000313112, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=37761 epoch 025: 325 / 1689 loss=3.691, nll_loss=2.156, ppl=4.46, wps=549774, ups=1.11, wpb=496439, bsz=16687.4, num_updates=40800, lr=0.000313112, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=37761 epoch 025: 325 / 1689 loss=3.691, nll_loss=2.156, ppl=4.46, wps=549774, ups=1.11, wpb=496439, bsz=16687.4, num_updates=40800, lr=0.000313112, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=37761 epoch 025: 325 / 1689 loss=3.691, nll_loss=2.156, ppl=4.46, wps=549774, ups=1.11, wpb=496439, bsz=16687.4, num_updates=40800, lr=0.000313112, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=37761 epoch 025: 325 / 1689 loss=3.691, nll_loss=2.156, ppl=4.46, wps=549774, ups=1.11, wpb=496439, bsz=16687.4, num_updates=40800, lr=0.000313112, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=37761 epoch 025: 325 / 1689 loss=3.691, nll_loss=2.156, ppl=4.46, wps=549774, ups=1.11, wpb=496439, bsz=16687.4, num_updates=40800, lr=0.000313112, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=37761 epoch 025: 325 / 1689 loss=3.691, nll_loss=2.156, ppl=4.46, wps=549774, ups=1.11, wpb=496439, bsz=16687.4, num_updates=40800, lr=0.000313112, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=37761 epoch 025: 325 / 1689 loss=3.691, nll_loss=2.156, ppl=4.46, wps=549774, ups=1.11, wpb=496439, bsz=16687.4, num_updates=40800, lr=0.000313112, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=37761 epoch 025: 325 / 1689 loss=3.691, nll_loss=2.156, ppl=4.46, wps=549774, ups=1.11, wpb=496439, bsz=16687.4, num_updates=40800, lr=0.000313112, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=37761 epoch 025: 325 / 1689 loss=3.691, nll_loss=2.156, ppl=4.46, wps=549774, ups=1.11, wpb=496439, bsz=16687.4, num_updates=40800, lr=0.000313112, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=37761 epoch 025: 325 / 1689 loss=3.691, nll_loss=2.156, ppl=4.46, wps=549774, ups=1.11, wpb=496439, bsz=16687.4, num_updates=40800, lr=0.000313112, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=37761 epoch 025: 325 / 1689 loss=3.691, nll_loss=2.156, ppl=4.46, wps=549774, ups=1.11, wpb=496439, bsz=16687.4, num_updates=40800, lr=0.000313112, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=37761 epoch 025: 325 / 1689 loss=3.691, nll_loss=2.156, ppl=4.46, wps=549774, ups=1.11, wpb=496439, bsz=16687.4, num_updates=40800, lr=0.000313112, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=37761 epoch 025: 325 / 1689 loss=3.691, nll_loss=2.156, ppl=4.46, wps=549774, ups=1.11, wpb=496439, bsz=16687.4, num_updates=40800, lr=0.000313112, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=37761 epoch 025: 325 / 1689 loss=3.691, nll_loss=2.156, ppl=4.46, wps=549774, ups=1.11, wpb=496439, bsz=16687.4, num_updates=40800, lr=0.000313112, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=37761 epoch 025: 325 / 1689 loss=3.691, nll_loss=2.156, ppl=4.46, wps=549774, ups=1.11, wpb=496439, bsz=16687.4, num_updates=40800, lr=0.000313112, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=37761 epoch 025: 325 / 1689 loss=3.691, nll_loss=2.156, ppl=4.46, wps=549774, ups=1.11, wpb=496439, bsz=16687.4, num_updates=40800, lr=0.000313112, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=37761 epoch 025: 425 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=550719, ups=1.11, wpb=495442, bsz=16509.1, num_updates=40900, lr=0.000312729, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=37851 epoch 025: 425 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=550719, ups=1.11, wpb=495442, bsz=16509.1, num_updates=40900, lr=0.000312729, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=37851 epoch 025: 425 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=550719, ups=1.11, wpb=495442, bsz=16509.1, num_updates=40900, lr=0.000312729, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=37851 epoch 025: 425 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=550719, ups=1.11, wpb=495442, bsz=16509.1, num_updates=40900, lr=0.000312729, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=37851 epoch 025: 425 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=550719, ups=1.11, wpb=495442, bsz=16509.1, num_updates=40900, lr=0.000312729, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=37851 epoch 025: 425 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=550719, ups=1.11, wpb=495442, bsz=16509.1, num_updates=40900, lr=0.000312729, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=37851 epoch 025: 425 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=550719, ups=1.11, wpb=495442, bsz=16509.1, num_updates=40900, lr=0.000312729, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=37851 epoch 025: 425 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=550719, ups=1.11, wpb=495442, bsz=16509.1, num_updates=40900, lr=0.000312729, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=37851 epoch 025: 425 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=550719, ups=1.11, wpb=495442, bsz=16509.1, num_updates=40900, lr=0.000312729, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=37851 epoch 025: 425 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=550719, ups=1.11, wpb=495442, bsz=16509.1, num_updates=40900, lr=0.000312729, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=37851 epoch 025: 425 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=550719, ups=1.11, wpb=495442, bsz=16509.1, num_updates=40900, lr=0.000312729, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=37851 epoch 025: 425 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=550719, ups=1.11, wpb=495442, bsz=16509.1, num_updates=40900, lr=0.000312729, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=37851 epoch 025: 425 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=550719, ups=1.11, wpb=495442, bsz=16509.1, num_updates=40900, lr=0.000312729, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=37851 epoch 025: 425 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=550719, ups=1.11, wpb=495442, bsz=16509.1, num_updates=40900, lr=0.000312729, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=37851 epoch 025: 425 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=550719, ups=1.11, wpb=495442, bsz=16509.1, num_updates=40900, lr=0.000312729, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=37851 epoch 025: 425 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=550719, ups=1.11, wpb=495442, bsz=16509.1, num_updates=40900, lr=0.000312729, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=37851 epoch 025: 425 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=550719, ups=1.11, wpb=495442, bsz=16509.1, num_updates=40900, lr=0.000312729, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=37851 epoch 025: 425 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=550719, ups=1.11, wpb=495442, bsz=16509.1, num_updates=40900, lr=0.000312729, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=37851 epoch 025: 425 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=550719, ups=1.11, wpb=495442, bsz=16509.1, num_updates=40900, lr=0.000312729, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=37851 epoch 025: 425 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=550719, ups=1.11, wpb=495442, bsz=16509.1, num_updates=40900, lr=0.000312729, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=37851 epoch 025: 425 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=550719, ups=1.11, wpb=495442, bsz=16509.1, num_updates=40900, lr=0.000312729, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=37851 epoch 025: 425 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=550719, ups=1.11, wpb=495442, bsz=16509.1, num_updates=40900, lr=0.000312729, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=37851 epoch 025: 425 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=550719, ups=1.11, wpb=495442, bsz=16509.1, num_updates=40900, lr=0.000312729, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=37851 epoch 025: 425 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=550719, ups=1.11, wpb=495442, bsz=16509.1, num_updates=40900, lr=0.000312729, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=37851 epoch 025: 425 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=550719, ups=1.11, wpb=495442, bsz=16509.1, num_updates=40900, lr=0.000312729, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=37851 epoch 025: 525 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=548585, ups=1.11, wpb=495019, bsz=16606.4, num_updates=41000, lr=0.000312348, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=20.8, wall=37941 epoch 025: 525 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=548585, ups=1.11, wpb=495019, bsz=16606.4, num_updates=41000, lr=0.000312348, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=20.8, wall=37941 epoch 025: 525 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=548585, ups=1.11, wpb=495019, bsz=16606.4, num_updates=41000, lr=0.000312348, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=20.8, wall=37941 epoch 025: 525 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=548585, ups=1.11, wpb=495019, bsz=16606.4, num_updates=41000, lr=0.000312348, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=20.8, wall=37941 epoch 025: 525 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=548585, ups=1.11, wpb=495019, bsz=16606.4, num_updates=41000, lr=0.000312348, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=20.8, wall=37941 epoch 025: 525 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=548585, ups=1.11, wpb=495019, bsz=16606.4, num_updates=41000, lr=0.000312348, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=20.8, wall=37941 epoch 025: 525 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=548585, ups=1.11, wpb=495019, bsz=16606.4, num_updates=41000, lr=0.000312348, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=20.8, wall=37941 epoch 025: 525 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=548585, ups=1.11, wpb=495019, bsz=16606.4, num_updates=41000, lr=0.000312348, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=20.8, wall=37941 epoch 025: 525 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=548585, ups=1.11, wpb=495019, bsz=16606.4, num_updates=41000, lr=0.000312348, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=20.8, wall=37941 epoch 025: 525 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=548585, ups=1.11, wpb=495019, bsz=16606.4, num_updates=41000, lr=0.000312348, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=20.8, wall=37941 epoch 025: 525 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=548585, ups=1.11, wpb=495019, bsz=16606.4, num_updates=41000, lr=0.000312348, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=20.8, wall=37941 epoch 025: 525 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=548585, ups=1.11, wpb=495019, bsz=16606.4, num_updates=41000, lr=0.000312348, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=20.8, wall=37941 epoch 025: 525 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=548585, ups=1.11, wpb=495019, bsz=16606.4, num_updates=41000, lr=0.000312348, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=20.8, wall=37941 epoch 025: 525 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=548585, ups=1.11, wpb=495019, bsz=16606.4, num_updates=41000, lr=0.000312348, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=20.8, wall=37941 epoch 025: 525 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=548585, ups=1.11, wpb=495019, bsz=16606.4, num_updates=41000, lr=0.000312348, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=20.8, wall=37941 epoch 025: 525 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=548585, ups=1.11, wpb=495019, bsz=16606.4, num_updates=41000, lr=0.000312348, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=20.8, wall=37941 epoch 025: 525 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=548585, ups=1.11, wpb=495019, bsz=16606.4, num_updates=41000, lr=0.000312348, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=20.8, wall=37941 epoch 025: 525 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=548585, ups=1.11, wpb=495019, bsz=16606.4, num_updates=41000, lr=0.000312348, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=20.8, wall=37941 epoch 025: 525 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=548585, ups=1.11, wpb=495019, bsz=16606.4, num_updates=41000, lr=0.000312348, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=20.8, wall=37941 epoch 025: 525 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=548585, ups=1.11, wpb=495019, bsz=16606.4, num_updates=41000, lr=0.000312348, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=20.8, wall=37941 epoch 025: 525 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=548585, ups=1.11, wpb=495019, bsz=16606.4, num_updates=41000, lr=0.000312348, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=20.8, wall=37941 epoch 025: 525 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=548585, ups=1.11, wpb=495019, bsz=16606.4, num_updates=41000, lr=0.000312348, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=20.8, wall=37941 epoch 025: 525 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=548585, ups=1.11, wpb=495019, bsz=16606.4, num_updates=41000, lr=0.000312348, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=20.8, wall=37941 epoch 025: 525 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=548585, ups=1.11, wpb=495019, bsz=16606.4, num_updates=41000, lr=0.000312348, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=20.8, wall=37941 epoch 025: 525 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=548585, ups=1.11, wpb=495019, bsz=16606.4, num_updates=41000, lr=0.000312348, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=20.8, wall=37941 begin validation on "valid" subset epoch 025 | valid on 'valid' subset | loss 3.717 | nll_loss 2.155 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.717 epoch 025 | valid on 'valid' subset | loss 3.717 | nll_loss 2.155 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.717 epoch 025 | valid on 'valid' subset | loss 3.717 | nll_loss 2.155 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.717 epoch 025 | valid on 'valid' subset | loss 3.717 | nll_loss 2.155 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.717 epoch 025 | valid on 'valid' subset | loss 3.717 | nll_loss 2.155 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.717 epoch 025 | valid on 'valid' subset | loss 3.717 | nll_loss 2.155 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.717 epoch 025 | valid on 'valid' subset | loss 3.717 | nll_loss 2.155 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.717 epoch 025 | valid on 'valid' subset | loss 3.717 | nll_loss 2.155 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.717 epoch 025 | valid on 'valid' subset | loss 3.717 | nll_loss 2.155 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.717 epoch 025 | valid on 'valid' subset | loss 3.717 | nll_loss 2.155 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.717 epoch 025 | valid on 'valid' subset | loss 3.717 | nll_loss 2.155 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.717 epoch 025 | valid on 'valid' subset | loss 3.717 | nll_loss 2.155 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.717 epoch 025 | valid on 'valid' subset | loss 3.717 | nll_loss 2.155 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.717 epoch 025 | valid on 'valid' subset | loss 3.717 | nll_loss 2.155 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.717 epoch 025 | valid on 'valid' subset | loss 3.717 | nll_loss 2.155 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.717 epoch 025 | valid on 'valid' subset | loss 3.717 | nll_loss 2.155 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.717 epoch 025 | valid on 'valid' subset | loss 3.717 | nll_loss 2.155 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.717 epoch 025 | valid on 'valid' subset | loss 3.717 | nll_loss 2.155 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.717 epoch 025 | valid on 'valid' subset | loss 3.717 | nll_loss 2.155 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.717 epoch 025 | valid on 'valid' subset | loss 3.717 | nll_loss 2.155 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.717 epoch 025 | valid on 'valid' subset | loss 3.717 | nll_loss 2.155 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.717 epoch 025 | valid on 'valid' subset | loss 3.717 | nll_loss 2.155 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.717 epoch 025 | valid on 'valid' subset | loss 3.717 | nll_loss 2.155 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.717 epoch 025 | valid on 'valid' subset | loss 3.717 | nll_loss 2.155 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.717 epoch 025 | valid on 'valid' subset | loss 3.717 | nll_loss 2.155 | ppl 4.45 | wps 0 | wpb 44526 | bsz 2008 | num_updates 41000 | best_loss 3.717 epoch 025: 625 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=205837, ups=0.42, wpb=494814, bsz=16428.4, num_updates=41100, lr=0.000311967, gnorm=0.167, clip=0, loss_scale=4, train_wall=207, gb_free=20.9, wall=38181 epoch 025: 625 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=205837, ups=0.42, wpb=494814, bsz=16428.4, num_updates=41100, lr=0.000311967, gnorm=0.167, clip=0, loss_scale=4, train_wall=207, gb_free=20.9, wall=38181 epoch 025: 625 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=205837, ups=0.42, wpb=494814, bsz=16428.4, num_updates=41100, lr=0.000311967, gnorm=0.167, clip=0, loss_scale=4, train_wall=207, gb_free=20.9, wall=38181 epoch 025: 625 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=205837, ups=0.42, wpb=494814, bsz=16428.4, num_updates=41100, lr=0.000311967, gnorm=0.167, clip=0, loss_scale=4, train_wall=207, gb_free=20.9, wall=38181 epoch 025: 625 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=205837, ups=0.42, wpb=494814, bsz=16428.4, num_updates=41100, lr=0.000311967, gnorm=0.167, clip=0, loss_scale=4, train_wall=207, gb_free=20.9, wall=38181 epoch 025: 625 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=205837, ups=0.42, wpb=494814, bsz=16428.4, num_updates=41100, lr=0.000311967, gnorm=0.167, clip=0, loss_scale=4, train_wall=207, gb_free=20.9, wall=38181 epoch 025: 625 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=205837, ups=0.42, wpb=494814, bsz=16428.4, num_updates=41100, lr=0.000311967, gnorm=0.167, clip=0, loss_scale=4, train_wall=207, gb_free=20.9, wall=38181 epoch 025: 625 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=205837, ups=0.42, wpb=494814, bsz=16428.4, num_updates=41100, lr=0.000311967, gnorm=0.167, clip=0, loss_scale=4, train_wall=207, gb_free=20.9, wall=38181 epoch 025: 625 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=205837, ups=0.42, wpb=494814, bsz=16428.4, num_updates=41100, lr=0.000311967, gnorm=0.167, clip=0, loss_scale=4, train_wall=207, gb_free=20.9, wall=38181 epoch 025: 625 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=205837, ups=0.42, wpb=494814, bsz=16428.4, num_updates=41100, lr=0.000311967, gnorm=0.167, clip=0, loss_scale=4, train_wall=207, gb_free=20.9, wall=38181 epoch 025: 625 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=205837, ups=0.42, wpb=494814, bsz=16428.4, num_updates=41100, lr=0.000311967, gnorm=0.167, clip=0, loss_scale=4, train_wall=207, gb_free=20.9, wall=38181 epoch 025: 625 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=205837, ups=0.42, wpb=494814, bsz=16428.4, num_updates=41100, lr=0.000311967, gnorm=0.167, clip=0, loss_scale=4, train_wall=207, gb_free=20.9, wall=38181 epoch 025: 625 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=205837, ups=0.42, wpb=494814, bsz=16428.4, num_updates=41100, lr=0.000311967, gnorm=0.167, clip=0, loss_scale=4, train_wall=207, gb_free=20.9, wall=38181 epoch 025: 625 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=205837, ups=0.42, wpb=494814, bsz=16428.4, num_updates=41100, lr=0.000311967, gnorm=0.167, clip=0, loss_scale=4, train_wall=207, gb_free=20.9, wall=38181 epoch 025: 625 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=205837, ups=0.42, wpb=494814, bsz=16428.4, num_updates=41100, lr=0.000311967, gnorm=0.167, clip=0, loss_scale=4, train_wall=207, gb_free=20.9, wall=38181 epoch 025: 625 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=205837, ups=0.42, wpb=494814, bsz=16428.4, num_updates=41100, lr=0.000311967, gnorm=0.167, clip=0, loss_scale=4, train_wall=207, gb_free=20.9, wall=38181 epoch 025: 625 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=205837, ups=0.42, wpb=494814, bsz=16428.4, num_updates=41100, lr=0.000311967, gnorm=0.167, clip=0, loss_scale=4, train_wall=207, gb_free=20.9, wall=38181 epoch 025: 625 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=205837, ups=0.42, wpb=494814, bsz=16428.4, num_updates=41100, lr=0.000311967, gnorm=0.167, clip=0, loss_scale=4, train_wall=207, gb_free=20.9, wall=38181 epoch 025: 625 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=205837, ups=0.42, wpb=494814, bsz=16428.4, num_updates=41100, lr=0.000311967, gnorm=0.167, clip=0, loss_scale=4, train_wall=207, gb_free=20.9, wall=38181 epoch 025: 625 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=205837, ups=0.42, wpb=494814, bsz=16428.4, num_updates=41100, lr=0.000311967, gnorm=0.167, clip=0, loss_scale=4, train_wall=207, gb_free=20.9, wall=38181 epoch 025: 625 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=205837, ups=0.42, wpb=494814, bsz=16428.4, num_updates=41100, lr=0.000311967, gnorm=0.167, clip=0, loss_scale=4, train_wall=207, gb_free=20.9, wall=38181 epoch 025: 625 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=205837, ups=0.42, wpb=494814, bsz=16428.4, num_updates=41100, lr=0.000311967, gnorm=0.167, clip=0, loss_scale=4, train_wall=207, gb_free=20.9, wall=38181 epoch 025: 625 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=205837, ups=0.42, wpb=494814, bsz=16428.4, num_updates=41100, lr=0.000311967, gnorm=0.167, clip=0, loss_scale=4, train_wall=207, gb_free=20.9, wall=38181 epoch 025: 625 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=205837, ups=0.42, wpb=494814, bsz=16428.4, num_updates=41100, lr=0.000311967, gnorm=0.167, clip=0, loss_scale=4, train_wall=207, gb_free=20.9, wall=38181 epoch 025: 625 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=205837, ups=0.42, wpb=494814, bsz=16428.4, num_updates=41100, lr=0.000311967, gnorm=0.167, clip=0, loss_scale=4, train_wall=207, gb_free=20.9, wall=38181 epoch 025: 725 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=557220, ups=1.12, wpb=495824, bsz=16796.2, num_updates=41200, lr=0.000311588, gnorm=0.168, clip=0, loss_scale=4, train_wall=88, gb_free=20.6, wall=38270 epoch 025: 725 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=557220, ups=1.12, wpb=495824, bsz=16796.2, num_updates=41200, lr=0.000311588, gnorm=0.168, clip=0, loss_scale=4, train_wall=88, gb_free=20.6, wall=38270 epoch 025: 725 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=557220, ups=1.12, wpb=495824, bsz=16796.2, num_updates=41200, lr=0.000311588, gnorm=0.168, clip=0, loss_scale=4, train_wall=88, gb_free=20.6, wall=38270 epoch 025: 725 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=557220, ups=1.12, wpb=495824, bsz=16796.2, num_updates=41200, lr=0.000311588, gnorm=0.168, clip=0, loss_scale=4, train_wall=88, gb_free=20.6, wall=38270 epoch 025: 725 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=557220, ups=1.12, wpb=495824, bsz=16796.2, num_updates=41200, lr=0.000311588, gnorm=0.168, clip=0, loss_scale=4, train_wall=88, gb_free=20.6, wall=38270 epoch 025: 725 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=557220, ups=1.12, wpb=495824, bsz=16796.2, num_updates=41200, lr=0.000311588, gnorm=0.168, clip=0, loss_scale=4, train_wall=88, gb_free=20.6, wall=38270 epoch 025: 725 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=557220, ups=1.12, wpb=495824, bsz=16796.2, num_updates=41200, lr=0.000311588, gnorm=0.168, clip=0, loss_scale=4, train_wall=88, gb_free=20.6, wall=38270 epoch 025: 725 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=557220, ups=1.12, wpb=495824, bsz=16796.2, num_updates=41200, lr=0.000311588, gnorm=0.168, clip=0, loss_scale=4, train_wall=88, gb_free=20.6, wall=38270 epoch 025: 725 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=557220, ups=1.12, wpb=495824, bsz=16796.2, num_updates=41200, lr=0.000311588, gnorm=0.168, clip=0, loss_scale=4, train_wall=88, gb_free=20.6, wall=38270 epoch 025: 725 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=557220, ups=1.12, wpb=495824, bsz=16796.2, num_updates=41200, lr=0.000311588, gnorm=0.168, clip=0, loss_scale=4, train_wall=88, gb_free=20.6, wall=38270 epoch 025: 725 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=557220, ups=1.12, wpb=495824, bsz=16796.2, num_updates=41200, lr=0.000311588, gnorm=0.168, clip=0, loss_scale=4, train_wall=88, gb_free=20.6, wall=38270 epoch 025: 725 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=557220, ups=1.12, wpb=495824, bsz=16796.2, num_updates=41200, lr=0.000311588, gnorm=0.168, clip=0, loss_scale=4, train_wall=88, gb_free=20.6, wall=38270 epoch 025: 725 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=557220, ups=1.12, wpb=495824, bsz=16796.2, num_updates=41200, lr=0.000311588, gnorm=0.168, clip=0, loss_scale=4, train_wall=88, gb_free=20.6, wall=38270 epoch 025: 725 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=557220, ups=1.12, wpb=495824, bsz=16796.2, num_updates=41200, lr=0.000311588, gnorm=0.168, clip=0, loss_scale=4, train_wall=88, gb_free=20.6, wall=38270 epoch 025: 725 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=557220, ups=1.12, wpb=495824, bsz=16796.2, num_updates=41200, lr=0.000311588, gnorm=0.168, clip=0, loss_scale=4, train_wall=88, gb_free=20.6, wall=38270 epoch 025: 725 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=557220, ups=1.12, wpb=495824, bsz=16796.2, num_updates=41200, lr=0.000311588, gnorm=0.168, clip=0, loss_scale=4, train_wall=88, gb_free=20.6, wall=38270 epoch 025: 725 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=557220, ups=1.12, wpb=495824, bsz=16796.2, num_updates=41200, lr=0.000311588, gnorm=0.168, clip=0, loss_scale=4, train_wall=88, gb_free=20.6, wall=38270 epoch 025: 725 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=557220, ups=1.12, wpb=495824, bsz=16796.2, num_updates=41200, lr=0.000311588, gnorm=0.168, clip=0, loss_scale=4, train_wall=88, gb_free=20.6, wall=38270 epoch 025: 725 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=557220, ups=1.12, wpb=495824, bsz=16796.2, num_updates=41200, lr=0.000311588, gnorm=0.168, clip=0, loss_scale=4, train_wall=88, gb_free=20.6, wall=38270 epoch 025: 725 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=557220, ups=1.12, wpb=495824, bsz=16796.2, num_updates=41200, lr=0.000311588, gnorm=0.168, clip=0, loss_scale=4, train_wall=88, gb_free=20.6, wall=38270 epoch 025: 725 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=557220, ups=1.12, wpb=495824, bsz=16796.2, num_updates=41200, lr=0.000311588, gnorm=0.168, clip=0, loss_scale=4, train_wall=88, gb_free=20.6, wall=38270 epoch 025: 725 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=557220, ups=1.12, wpb=495824, bsz=16796.2, num_updates=41200, lr=0.000311588, gnorm=0.168, clip=0, loss_scale=4, train_wall=88, gb_free=20.6, wall=38270 epoch 025: 725 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=557220, ups=1.12, wpb=495824, bsz=16796.2, num_updates=41200, lr=0.000311588, gnorm=0.168, clip=0, loss_scale=4, train_wall=88, gb_free=20.6, wall=38270 epoch 025: 725 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=557220, ups=1.12, wpb=495824, bsz=16796.2, num_updates=41200, lr=0.000311588, gnorm=0.168, clip=0, loss_scale=4, train_wall=88, gb_free=20.6, wall=38270 epoch 025: 725 / 1689 loss=3.696, nll_loss=2.161, ppl=4.47, wps=557220, ups=1.12, wpb=495824, bsz=16796.2, num_updates=41200, lr=0.000311588, gnorm=0.168, clip=0, loss_scale=4, train_wall=88, gb_free=20.6, wall=38270 epoch 025: 826 / 1689 loss=3.699, nll_loss=2.164, ppl=4.48, wps=545231, ups=1.1, wpb=495203, bsz=16273.5, num_updates=41300, lr=0.000311211, gnorm=0.174, clip=0, loss_scale=4, train_wall=90, gb_free=21.5, wall=38361 epoch 025: 826 / 1689 loss=3.699, nll_loss=2.164, ppl=4.48, wps=545231, ups=1.1, wpb=495203, bsz=16273.5, num_updates=41300, lr=0.000311211, gnorm=0.174, clip=0, loss_scale=4, train_wall=90, gb_free=21.5, wall=38361 epoch 025: 826 / 1689 loss=3.699, nll_loss=2.164, ppl=4.48, wps=545231, ups=1.1, wpb=495203, bsz=16273.5, num_updates=41300, lr=0.000311211, gnorm=0.174, clip=0, loss_scale=4, train_wall=90, gb_free=21.5, wall=38361 epoch 025: 826 / 1689 loss=3.699, nll_loss=2.164, ppl=4.48, wps=545231, ups=1.1, wpb=495203, bsz=16273.5, num_updates=41300, lr=0.000311211, gnorm=0.174, clip=0, loss_scale=4, train_wall=90, gb_free=21.5, wall=38361 epoch 025: 826 / 1689 loss=3.699, nll_loss=2.164, ppl=4.48, wps=545231, ups=1.1, wpb=495203, bsz=16273.5, num_updates=41300, lr=0.000311211, gnorm=0.174, clip=0, loss_scale=4, train_wall=90, gb_free=21.5, wall=38361 epoch 025: 826 / 1689 loss=3.699, nll_loss=2.164, ppl=4.48, wps=545231, ups=1.1, wpb=495203, bsz=16273.5, num_updates=41300, lr=0.000311211, gnorm=0.174, clip=0, loss_scale=4, train_wall=90, gb_free=21.5, wall=38361 epoch 025: 826 / 1689 loss=3.699, nll_loss=2.164, ppl=4.48, wps=545231, ups=1.1, wpb=495203, bsz=16273.5, num_updates=41300, lr=0.000311211, gnorm=0.174, clip=0, loss_scale=4, train_wall=90, gb_free=21.5, wall=38361 epoch 025: 826 / 1689 loss=3.699, nll_loss=2.164, ppl=4.48, wps=545231, ups=1.1, wpb=495203, bsz=16273.5, num_updates=41300, lr=0.000311211, gnorm=0.174, clip=0, loss_scale=4, train_wall=90, gb_free=21.5, wall=38361 epoch 025: 826 / 1689 loss=3.699, nll_loss=2.164, ppl=4.48, wps=545231, ups=1.1, wpb=495203, bsz=16273.5, num_updates=41300, lr=0.000311211, gnorm=0.174, clip=0, loss_scale=4, train_wall=90, gb_free=21.5, wall=38361 epoch 025: 826 / 1689 loss=3.699, nll_loss=2.164, ppl=4.48, wps=545231, ups=1.1, wpb=495203, bsz=16273.5, num_updates=41300, lr=0.000311211, gnorm=0.174, clip=0, loss_scale=4, train_wall=90, gb_free=21.5, wall=38361 epoch 025: 826 / 1689 loss=3.699, nll_loss=2.164, ppl=4.48, wps=545231, ups=1.1, wpb=495203, bsz=16273.5, num_updates=41300, lr=0.000311211, gnorm=0.174, clip=0, loss_scale=4, train_wall=90, gb_free=21.5, wall=38361 epoch 025: 826 / 1689 loss=3.699, nll_loss=2.164, ppl=4.48, wps=545231, ups=1.1, wpb=495203, bsz=16273.5, num_updates=41300, lr=0.000311211, gnorm=0.174, clip=0, loss_scale=4, train_wall=90, gb_free=21.5, wall=38361 epoch 025: 826 / 1689 loss=3.699, nll_loss=2.164, ppl=4.48, wps=545231, ups=1.1, wpb=495203, bsz=16273.5, num_updates=41300, lr=0.000311211, gnorm=0.174, clip=0, loss_scale=4, train_wall=90, gb_free=21.5, wall=38361 epoch 025: 826 / 1689 loss=3.699, nll_loss=2.164, ppl=4.48, wps=545231, ups=1.1, wpb=495203, bsz=16273.5, num_updates=41300, lr=0.000311211, gnorm=0.174, clip=0, loss_scale=4, train_wall=90, gb_free=21.5, wall=38361 epoch 025: 826 / 1689 loss=3.699, nll_loss=2.164, ppl=4.48, wps=545231, ups=1.1, wpb=495203, bsz=16273.5, num_updates=41300, lr=0.000311211, gnorm=0.174, clip=0, loss_scale=4, train_wall=90, gb_free=21.5, wall=38361 epoch 025: 826 / 1689 loss=3.699, nll_loss=2.164, ppl=4.48, wps=545231, ups=1.1, wpb=495203, bsz=16273.5, num_updates=41300, lr=0.000311211, gnorm=0.174, clip=0, loss_scale=4, train_wall=90, gb_free=21.5, wall=38361 epoch 025: 826 / 1689 loss=3.699, nll_loss=2.164, ppl=4.48, wps=545231, ups=1.1, wpb=495203, bsz=16273.5, num_updates=41300, lr=0.000311211, gnorm=0.174, clip=0, loss_scale=4, train_wall=90, gb_free=21.5, wall=38361 epoch 025: 826 / 1689 loss=3.699, nll_loss=2.164, ppl=4.48, wps=545231, ups=1.1, wpb=495203, bsz=16273.5, num_updates=41300, lr=0.000311211, gnorm=0.174, clip=0, loss_scale=4, train_wall=90, gb_free=21.5, wall=38361 epoch 025: 826 / 1689 loss=3.699, nll_loss=2.164, ppl=4.48, wps=545231, ups=1.1, wpb=495203, bsz=16273.5, num_updates=41300, lr=0.000311211, gnorm=0.174, clip=0, loss_scale=4, train_wall=90, gb_free=21.5, wall=38361 epoch 025: 826 / 1689 loss=3.699, nll_loss=2.164, ppl=4.48, wps=545231, ups=1.1, wpb=495203, bsz=16273.5, num_updates=41300, lr=0.000311211, gnorm=0.174, clip=0, loss_scale=4, train_wall=90, gb_free=21.5, wall=38361 epoch 025: 826 / 1689 loss=3.699, nll_loss=2.164, ppl=4.48, wps=545231, ups=1.1, wpb=495203, bsz=16273.5, num_updates=41300, lr=0.000311211, gnorm=0.174, clip=0, loss_scale=4, train_wall=90, gb_free=21.5, wall=38361 epoch 025: 826 / 1689 loss=3.699, nll_loss=2.164, ppl=4.48, wps=545231, ups=1.1, wpb=495203, bsz=16273.5, num_updates=41300, lr=0.000311211, gnorm=0.174, clip=0, loss_scale=4, train_wall=90, gb_free=21.5, wall=38361 epoch 025: 826 / 1689 loss=3.699, nll_loss=2.164, ppl=4.48, wps=545231, ups=1.1, wpb=495203, bsz=16273.5, num_updates=41300, lr=0.000311211, gnorm=0.174, clip=0, loss_scale=4, train_wall=90, gb_free=21.5, wall=38361 epoch 025: 826 / 1689 loss=3.699, nll_loss=2.164, ppl=4.48, wps=545231, ups=1.1, wpb=495203, bsz=16273.5, num_updates=41300, lr=0.000311211, gnorm=0.174, clip=0, loss_scale=4, train_wall=90, gb_free=21.5, wall=38361 epoch 025: 826 / 1689 loss=3.699, nll_loss=2.164, ppl=4.48, wps=545231, ups=1.1, wpb=495203, bsz=16273.5, num_updates=41300, lr=0.000311211, gnorm=0.174, clip=0, loss_scale=4, train_wall=90, gb_free=21.5, wall=38361 epoch 025: 926 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=548941, ups=1.11, wpb=494622, bsz=16914.9, num_updates=41400, lr=0.000310835, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21.3, wall=38451 epoch 025: 926 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=548941, ups=1.11, wpb=494622, bsz=16914.9, num_updates=41400, lr=0.000310835, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21.3, wall=38451 epoch 025: 926 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=548941, ups=1.11, wpb=494622, bsz=16914.9, num_updates=41400, lr=0.000310835, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21.3, wall=38451 epoch 025: 926 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=548941, ups=1.11, wpb=494622, bsz=16914.9, num_updates=41400, lr=0.000310835, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21.3, wall=38451 epoch 025: 926 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=548941, ups=1.11, wpb=494622, bsz=16914.9, num_updates=41400, lr=0.000310835, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21.3, wall=38451 epoch 025: 926 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=548941, ups=1.11, wpb=494622, bsz=16914.9, num_updates=41400, lr=0.000310835, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21.3, wall=38451 epoch 025: 926 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=548941, ups=1.11, wpb=494622, bsz=16914.9, num_updates=41400, lr=0.000310835, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21.3, wall=38451 epoch 025: 926 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=548941, ups=1.11, wpb=494622, bsz=16914.9, num_updates=41400, lr=0.000310835, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21.3, wall=38451 epoch 025: 926 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=548941, ups=1.11, wpb=494622, bsz=16914.9, num_updates=41400, lr=0.000310835, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21.3, wall=38451 epoch 025: 926 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=548941, ups=1.11, wpb=494622, bsz=16914.9, num_updates=41400, lr=0.000310835, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21.3, wall=38451 epoch 025: 926 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=548941, ups=1.11, wpb=494622, bsz=16914.9, num_updates=41400, lr=0.000310835, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21.3, wall=38451 epoch 025: 926 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=548941, ups=1.11, wpb=494622, bsz=16914.9, num_updates=41400, lr=0.000310835, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21.3, wall=38451 epoch 025: 926 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=548941, ups=1.11, wpb=494622, bsz=16914.9, num_updates=41400, lr=0.000310835, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21.3, wall=38451 epoch 025: 926 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=548941, ups=1.11, wpb=494622, bsz=16914.9, num_updates=41400, lr=0.000310835, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21.3, wall=38451 epoch 025: 926 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=548941, ups=1.11, wpb=494622, bsz=16914.9, num_updates=41400, lr=0.000310835, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21.3, wall=38451 epoch 025: 926 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=548941, ups=1.11, wpb=494622, bsz=16914.9, num_updates=41400, lr=0.000310835, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21.3, wall=38451 epoch 025: 926 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=548941, ups=1.11, wpb=494622, bsz=16914.9, num_updates=41400, lr=0.000310835, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21.3, wall=38451 epoch 025: 926 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=548941, ups=1.11, wpb=494622, bsz=16914.9, num_updates=41400, lr=0.000310835, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21.3, wall=38451 epoch 025: 926 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=548941, ups=1.11, wpb=494622, bsz=16914.9, num_updates=41400, lr=0.000310835, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21.3, wall=38451 epoch 025: 926 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=548941, ups=1.11, wpb=494622, bsz=16914.9, num_updates=41400, lr=0.000310835, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21.3, wall=38451 epoch 025: 926 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=548941, ups=1.11, wpb=494622, bsz=16914.9, num_updates=41400, lr=0.000310835, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21.3, wall=38451 epoch 025: 926 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=548941, ups=1.11, wpb=494622, bsz=16914.9, num_updates=41400, lr=0.000310835, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21.3, wall=38451 epoch 025: 926 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=548941, ups=1.11, wpb=494622, bsz=16914.9, num_updates=41400, lr=0.000310835, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21.3, wall=38451 epoch 025: 926 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=548941, ups=1.11, wpb=494622, bsz=16914.9, num_updates=41400, lr=0.000310835, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21.3, wall=38451 epoch 025: 926 / 1689 loss=3.702, nll_loss=2.168, ppl=4.49, wps=548941, ups=1.11, wpb=494622, bsz=16914.9, num_updates=41400, lr=0.000310835, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21.3, wall=38451 epoch 025: 1026 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=548754, ups=1.11, wpb=494257, bsz=16506.6, num_updates=41500, lr=0.00031046, gnorm=0.164, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=38541 epoch 025: 1026 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=548754, ups=1.11, wpb=494257, bsz=16506.6, num_updates=41500, lr=0.00031046, gnorm=0.164, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=38541 epoch 025: 1026 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=548754, ups=1.11, wpb=494257, bsz=16506.6, num_updates=41500, lr=0.00031046, gnorm=0.164, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=38541 epoch 025: 1026 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=548754, ups=1.11, wpb=494257, bsz=16506.6, num_updates=41500, lr=0.00031046, gnorm=0.164, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=38541 epoch 025: 1026 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=548754, ups=1.11, wpb=494257, bsz=16506.6, num_updates=41500, lr=0.00031046, gnorm=0.164, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=38541 epoch 025: 1026 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=548754, ups=1.11, wpb=494257, bsz=16506.6, num_updates=41500, lr=0.00031046, gnorm=0.164, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=38541 epoch 025: 1026 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=548754, ups=1.11, wpb=494257, bsz=16506.6, num_updates=41500, lr=0.00031046, gnorm=0.164, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=38541 epoch 025: 1026 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=548754, ups=1.11, wpb=494257, bsz=16506.6, num_updates=41500, lr=0.00031046, gnorm=0.164, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=38541 epoch 025: 1026 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=548754, ups=1.11, wpb=494257, bsz=16506.6, num_updates=41500, lr=0.00031046, gnorm=0.164, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=38541 epoch 025: 1026 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=548754, ups=1.11, wpb=494257, bsz=16506.6, num_updates=41500, lr=0.00031046, gnorm=0.164, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=38541 epoch 025: 1026 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=548754, ups=1.11, wpb=494257, bsz=16506.6, num_updates=41500, lr=0.00031046, gnorm=0.164, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=38541 epoch 025: 1026 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=548754, ups=1.11, wpb=494257, bsz=16506.6, num_updates=41500, lr=0.00031046, gnorm=0.164, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=38541 epoch 025: 1026 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=548754, ups=1.11, wpb=494257, bsz=16506.6, num_updates=41500, lr=0.00031046, gnorm=0.164, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=38541 epoch 025: 1026 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=548754, ups=1.11, wpb=494257, bsz=16506.6, num_updates=41500, lr=0.00031046, gnorm=0.164, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=38541 epoch 025: 1026 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=548754, ups=1.11, wpb=494257, bsz=16506.6, num_updates=41500, lr=0.00031046, gnorm=0.164, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=38541 epoch 025: 1026 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=548754, ups=1.11, wpb=494257, bsz=16506.6, num_updates=41500, lr=0.00031046, gnorm=0.164, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=38541 epoch 025: 1026 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=548754, ups=1.11, wpb=494257, bsz=16506.6, num_updates=41500, lr=0.00031046, gnorm=0.164, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=38541 epoch 025: 1026 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=548754, ups=1.11, wpb=494257, bsz=16506.6, num_updates=41500, lr=0.00031046, gnorm=0.164, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=38541 epoch 025: 1026 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=548754, ups=1.11, wpb=494257, bsz=16506.6, num_updates=41500, lr=0.00031046, gnorm=0.164, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=38541 epoch 025: 1026 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=548754, ups=1.11, wpb=494257, bsz=16506.6, num_updates=41500, lr=0.00031046, gnorm=0.164, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=38541 epoch 025: 1026 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=548754, ups=1.11, wpb=494257, bsz=16506.6, num_updates=41500, lr=0.00031046, gnorm=0.164, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=38541 epoch 025: 1026 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=548754, ups=1.11, wpb=494257, bsz=16506.6, num_updates=41500, lr=0.00031046, gnorm=0.164, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=38541 epoch 025: 1026 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=548754, ups=1.11, wpb=494257, bsz=16506.6, num_updates=41500, lr=0.00031046, gnorm=0.164, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=38541 epoch 025: 1026 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=548754, ups=1.11, wpb=494257, bsz=16506.6, num_updates=41500, lr=0.00031046, gnorm=0.164, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=38541 epoch 025: 1026 / 1689 loss=3.695, nll_loss=2.16, ppl=4.47, wps=548754, ups=1.11, wpb=494257, bsz=16506.6, num_updates=41500, lr=0.00031046, gnorm=0.164, clip=0, loss_scale=4, train_wall=89, gb_free=21.1, wall=38541 epoch 025: 1126 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=552807, ups=1.12, wpb=494556, bsz=16186.7, num_updates=41600, lr=0.000310087, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=38631 epoch 025: 1126 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=552807, ups=1.12, wpb=494556, bsz=16186.7, num_updates=41600, lr=0.000310087, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=38631 epoch 025: 1126 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=552807, ups=1.12, wpb=494556, bsz=16186.7, num_updates=41600, lr=0.000310087, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=38631 epoch 025: 1126 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=552807, ups=1.12, wpb=494556, bsz=16186.7, num_updates=41600, lr=0.000310087, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=38631 epoch 025: 1126 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=552807, ups=1.12, wpb=494556, bsz=16186.7, num_updates=41600, lr=0.000310087, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=38631 epoch 025: 1126 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=552807, ups=1.12, wpb=494556, bsz=16186.7, num_updates=41600, lr=0.000310087, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=38631 epoch 025: 1126 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=552807, ups=1.12, wpb=494556, bsz=16186.7, num_updates=41600, lr=0.000310087, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=38631 epoch 025: 1126 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=552807, ups=1.12, wpb=494556, bsz=16186.7, num_updates=41600, lr=0.000310087, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=38631 epoch 025: 1126 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=552807, ups=1.12, wpb=494556, bsz=16186.7, num_updates=41600, lr=0.000310087, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=38631 epoch 025: 1126 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=552807, ups=1.12, wpb=494556, bsz=16186.7, num_updates=41600, lr=0.000310087, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=38631 epoch 025: 1126 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=552807, ups=1.12, wpb=494556, bsz=16186.7, num_updates=41600, lr=0.000310087, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=38631 epoch 025: 1126 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=552807, ups=1.12, wpb=494556, bsz=16186.7, num_updates=41600, lr=0.000310087, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=38631 epoch 025: 1126 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=552807, ups=1.12, wpb=494556, bsz=16186.7, num_updates=41600, lr=0.000310087, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=38631 epoch 025: 1126 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=552807, ups=1.12, wpb=494556, bsz=16186.7, num_updates=41600, lr=0.000310087, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=38631 epoch 025: 1126 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=552807, ups=1.12, wpb=494556, bsz=16186.7, num_updates=41600, lr=0.000310087, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=38631 epoch 025: 1126 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=552807, ups=1.12, wpb=494556, bsz=16186.7, num_updates=41600, lr=0.000310087, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=38631 epoch 025: 1126 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=552807, ups=1.12, wpb=494556, bsz=16186.7, num_updates=41600, lr=0.000310087, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=38631 epoch 025: 1126 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=552807, ups=1.12, wpb=494556, bsz=16186.7, num_updates=41600, lr=0.000310087, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=38631 epoch 025: 1126 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=552807, ups=1.12, wpb=494556, bsz=16186.7, num_updates=41600, lr=0.000310087, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=38631 epoch 025: 1126 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=552807, ups=1.12, wpb=494556, bsz=16186.7, num_updates=41600, lr=0.000310087, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=38631 epoch 025: 1126 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=552807, ups=1.12, wpb=494556, bsz=16186.7, num_updates=41600, lr=0.000310087, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=38631 epoch 025: 1126 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=552807, ups=1.12, wpb=494556, bsz=16186.7, num_updates=41600, lr=0.000310087, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=38631 epoch 025: 1126 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=552807, ups=1.12, wpb=494556, bsz=16186.7, num_updates=41600, lr=0.000310087, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=38631 epoch 025: 1126 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=552807, ups=1.12, wpb=494556, bsz=16186.7, num_updates=41600, lr=0.000310087, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=38631 epoch 025: 1126 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=552807, ups=1.12, wpb=494556, bsz=16186.7, num_updates=41600, lr=0.000310087, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=38631 epoch 025: 1226 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=552900, ups=1.12, wpb=495120, bsz=16343, num_updates=41700, lr=0.000309715, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=38720 epoch 025: 1226 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=552900, ups=1.12, wpb=495120, bsz=16343, num_updates=41700, lr=0.000309715, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=38720 epoch 025: 1226 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=552900, ups=1.12, wpb=495120, bsz=16343, num_updates=41700, lr=0.000309715, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=38720 epoch 025: 1226 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=552900, ups=1.12, wpb=495120, bsz=16343, num_updates=41700, lr=0.000309715, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=38720 epoch 025: 1226 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=552900, ups=1.12, wpb=495120, bsz=16343, num_updates=41700, lr=0.000309715, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=38720 epoch 025: 1226 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=552900, ups=1.12, wpb=495120, bsz=16343, num_updates=41700, lr=0.000309715, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=38720 epoch 025: 1226 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=552900, ups=1.12, wpb=495120, bsz=16343, num_updates=41700, lr=0.000309715, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=38720 epoch 025: 1226 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=552900, ups=1.12, wpb=495120, bsz=16343, num_updates=41700, lr=0.000309715, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=38720 epoch 025: 1226 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=552900, ups=1.12, wpb=495120, bsz=16343, num_updates=41700, lr=0.000309715, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=38720 epoch 025: 1226 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=552900, ups=1.12, wpb=495120, bsz=16343, num_updates=41700, lr=0.000309715, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=38720 epoch 025: 1226 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=552900, ups=1.12, wpb=495120, bsz=16343, num_updates=41700, lr=0.000309715, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=38720 epoch 025: 1226 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=552900, ups=1.12, wpb=495120, bsz=16343, num_updates=41700, lr=0.000309715, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=38720 epoch 025: 1226 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=552900, ups=1.12, wpb=495120, bsz=16343, num_updates=41700, lr=0.000309715, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=38720 epoch 025: 1226 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=552900, ups=1.12, wpb=495120, bsz=16343, num_updates=41700, lr=0.000309715, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=38720 epoch 025: 1226 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=552900, ups=1.12, wpb=495120, bsz=16343, num_updates=41700, lr=0.000309715, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=38720 epoch 025: 1226 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=552900, ups=1.12, wpb=495120, bsz=16343, num_updates=41700, lr=0.000309715, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=38720 epoch 025: 1226 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=552900, ups=1.12, wpb=495120, bsz=16343, num_updates=41700, lr=0.000309715, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=38720 epoch 025: 1226 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=552900, ups=1.12, wpb=495120, bsz=16343, num_updates=41700, lr=0.000309715, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=38720 epoch 025: 1226 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=552900, ups=1.12, wpb=495120, bsz=16343, num_updates=41700, lr=0.000309715, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=38720 epoch 025: 1226 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=552900, ups=1.12, wpb=495120, bsz=16343, num_updates=41700, lr=0.000309715, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=38720 epoch 025: 1226 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=552900, ups=1.12, wpb=495120, bsz=16343, num_updates=41700, lr=0.000309715, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=38720 epoch 025: 1226 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=552900, ups=1.12, wpb=495120, bsz=16343, num_updates=41700, lr=0.000309715, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=38720 epoch 025: 1226 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=552900, ups=1.12, wpb=495120, bsz=16343, num_updates=41700, lr=0.000309715, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=38720 epoch 025: 1226 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=552900, ups=1.12, wpb=495120, bsz=16343, num_updates=41700, lr=0.000309715, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=38720 epoch 025: 1226 / 1689 loss=3.698, nll_loss=2.164, ppl=4.48, wps=552900, ups=1.12, wpb=495120, bsz=16343, num_updates=41700, lr=0.000309715, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=38720 epoch 025: 1326 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=549680, ups=1.11, wpb=497343, bsz=16624.6, num_updates=41800, lr=0.000309344, gnorm=0.165, clip=0, loss_scale=8, train_wall=89, gb_free=22.1, wall=38811 epoch 025: 1326 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=549680, ups=1.11, wpb=497343, bsz=16624.6, num_updates=41800, lr=0.000309344, gnorm=0.165, clip=0, loss_scale=8, train_wall=89, gb_free=22.1, wall=38811 epoch 025: 1326 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=549680, ups=1.11, wpb=497343, bsz=16624.6, num_updates=41800, lr=0.000309344, gnorm=0.165, clip=0, loss_scale=8, train_wall=89, gb_free=22.1, wall=38811 epoch 025: 1326 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=549680, ups=1.11, wpb=497343, bsz=16624.6, num_updates=41800, lr=0.000309344, gnorm=0.165, clip=0, loss_scale=8, train_wall=89, gb_free=22.1, wall=38811 epoch 025: 1326 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=549680, ups=1.11, wpb=497343, bsz=16624.6, num_updates=41800, lr=0.000309344, gnorm=0.165, clip=0, loss_scale=8, train_wall=89, gb_free=22.1, wall=38811 epoch 025: 1326 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=549680, ups=1.11, wpb=497343, bsz=16624.6, num_updates=41800, lr=0.000309344, gnorm=0.165, clip=0, loss_scale=8, train_wall=89, gb_free=22.1, wall=38811 epoch 025: 1326 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=549680, ups=1.11, wpb=497343, bsz=16624.6, num_updates=41800, lr=0.000309344, gnorm=0.165, clip=0, loss_scale=8, train_wall=89, gb_free=22.1, wall=38811 epoch 025: 1326 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=549680, ups=1.11, wpb=497343, bsz=16624.6, num_updates=41800, lr=0.000309344, gnorm=0.165, clip=0, loss_scale=8, train_wall=89, gb_free=22.1, wall=38811 epoch 025: 1326 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=549680, ups=1.11, wpb=497343, bsz=16624.6, num_updates=41800, lr=0.000309344, gnorm=0.165, clip=0, loss_scale=8, train_wall=89, gb_free=22.1, wall=38811 epoch 025: 1326 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=549680, ups=1.11, wpb=497343, bsz=16624.6, num_updates=41800, lr=0.000309344, gnorm=0.165, clip=0, loss_scale=8, train_wall=89, gb_free=22.1, wall=38811 epoch 025: 1326 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=549680, ups=1.11, wpb=497343, bsz=16624.6, num_updates=41800, lr=0.000309344, gnorm=0.165, clip=0, loss_scale=8, train_wall=89, gb_free=22.1, wall=38811 epoch 025: 1326 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=549680, ups=1.11, wpb=497343, bsz=16624.6, num_updates=41800, lr=0.000309344, gnorm=0.165, clip=0, loss_scale=8, train_wall=89, gb_free=22.1, wall=38811 epoch 025: 1326 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=549680, ups=1.11, wpb=497343, bsz=16624.6, num_updates=41800, lr=0.000309344, gnorm=0.165, clip=0, loss_scale=8, train_wall=89, gb_free=22.1, wall=38811 epoch 025: 1326 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=549680, ups=1.11, wpb=497343, bsz=16624.6, num_updates=41800, lr=0.000309344, gnorm=0.165, clip=0, loss_scale=8, train_wall=89, gb_free=22.1, wall=38811 epoch 025: 1326 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=549680, ups=1.11, wpb=497343, bsz=16624.6, num_updates=41800, lr=0.000309344, gnorm=0.165, clip=0, loss_scale=8, train_wall=89, gb_free=22.1, wall=38811 epoch 025: 1326 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=549680, ups=1.11, wpb=497343, bsz=16624.6, num_updates=41800, lr=0.000309344, gnorm=0.165, clip=0, loss_scale=8, train_wall=89, gb_free=22.1, wall=38811 epoch 025: 1326 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=549680, ups=1.11, wpb=497343, bsz=16624.6, num_updates=41800, lr=0.000309344, gnorm=0.165, clip=0, loss_scale=8, train_wall=89, gb_free=22.1, wall=38811 epoch 025: 1326 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=549680, ups=1.11, wpb=497343, bsz=16624.6, num_updates=41800, lr=0.000309344, gnorm=0.165, clip=0, loss_scale=8, train_wall=89, gb_free=22.1, wall=38811 epoch 025: 1326 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=549680, ups=1.11, wpb=497343, bsz=16624.6, num_updates=41800, lr=0.000309344, gnorm=0.165, clip=0, loss_scale=8, train_wall=89, gb_free=22.1, wall=38811 epoch 025: 1326 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=549680, ups=1.11, wpb=497343, bsz=16624.6, num_updates=41800, lr=0.000309344, gnorm=0.165, clip=0, loss_scale=8, train_wall=89, gb_free=22.1, wall=38811 epoch 025: 1326 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=549680, ups=1.11, wpb=497343, bsz=16624.6, num_updates=41800, lr=0.000309344, gnorm=0.165, clip=0, loss_scale=8, train_wall=89, gb_free=22.1, wall=38811 epoch 025: 1326 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=549680, ups=1.11, wpb=497343, bsz=16624.6, num_updates=41800, lr=0.000309344, gnorm=0.165, clip=0, loss_scale=8, train_wall=89, gb_free=22.1, wall=38811 epoch 025: 1326 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=549680, ups=1.11, wpb=497343, bsz=16624.6, num_updates=41800, lr=0.000309344, gnorm=0.165, clip=0, loss_scale=8, train_wall=89, gb_free=22.1, wall=38811 epoch 025: 1326 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=549680, ups=1.11, wpb=497343, bsz=16624.6, num_updates=41800, lr=0.000309344, gnorm=0.165, clip=0, loss_scale=8, train_wall=89, gb_free=22.1, wall=38811 epoch 025: 1326 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=549680, ups=1.11, wpb=497343, bsz=16624.6, num_updates=41800, lr=0.000309344, gnorm=0.165, clip=0, loss_scale=8, train_wall=89, gb_free=22.1, wall=38811 epoch 025: 1428 / 1689 loss=3.693, nll_loss=2.158, ppl=4.46, wps=537870, ups=1.09, wpb=494775, bsz=16463.3, num_updates=41900, lr=0.000308975, gnorm=0.167, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=38903 epoch 025: 1428 / 1689 loss=3.693, nll_loss=2.158, ppl=4.46, wps=537870, ups=1.09, wpb=494775, bsz=16463.3, num_updates=41900, lr=0.000308975, gnorm=0.167, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=38903 epoch 025: 1428 / 1689 loss=3.693, nll_loss=2.158, ppl=4.46, wps=537870, ups=1.09, wpb=494775, bsz=16463.3, num_updates=41900, lr=0.000308975, gnorm=0.167, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=38903 epoch 025: 1428 / 1689 loss=3.693, nll_loss=2.158, ppl=4.46, wps=537870, ups=1.09, wpb=494775, bsz=16463.3, num_updates=41900, lr=0.000308975, gnorm=0.167, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=38903 epoch 025: 1428 / 1689 loss=3.693, nll_loss=2.158, ppl=4.46, wps=537870, ups=1.09, wpb=494775, bsz=16463.3, num_updates=41900, lr=0.000308975, gnorm=0.167, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=38903 epoch 025: 1428 / 1689 loss=3.693, nll_loss=2.158, ppl=4.46, wps=537870, ups=1.09, wpb=494775, bsz=16463.3, num_updates=41900, lr=0.000308975, gnorm=0.167, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=38903 epoch 025: 1428 / 1689 loss=3.693, nll_loss=2.158, ppl=4.46, wps=537870, ups=1.09, wpb=494775, bsz=16463.3, num_updates=41900, lr=0.000308975, gnorm=0.167, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=38903 epoch 025: 1428 / 1689 loss=3.693, nll_loss=2.158, ppl=4.46, wps=537870, ups=1.09, wpb=494775, bsz=16463.3, num_updates=41900, lr=0.000308975, gnorm=0.167, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=38903 epoch 025: 1428 / 1689 loss=3.693, nll_loss=2.158, ppl=4.46, wps=537870, ups=1.09, wpb=494775, bsz=16463.3, num_updates=41900, lr=0.000308975, gnorm=0.167, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=38903 epoch 025: 1428 / 1689 loss=3.693, nll_loss=2.158, ppl=4.46, wps=537870, ups=1.09, wpb=494775, bsz=16463.3, num_updates=41900, lr=0.000308975, gnorm=0.167, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=38903 epoch 025: 1428 / 1689 loss=3.693, nll_loss=2.158, ppl=4.46, wps=537870, ups=1.09, wpb=494775, bsz=16463.3, num_updates=41900, lr=0.000308975, gnorm=0.167, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=38903 epoch 025: 1428 / 1689 loss=3.693, nll_loss=2.158, ppl=4.46, wps=537870, ups=1.09, wpb=494775, bsz=16463.3, num_updates=41900, lr=0.000308975, gnorm=0.167, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=38903 epoch 025: 1428 / 1689 loss=3.693, nll_loss=2.158, ppl=4.46, wps=537870, ups=1.09, wpb=494775, bsz=16463.3, num_updates=41900, lr=0.000308975, gnorm=0.167, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=38903 epoch 025: 1428 / 1689 loss=3.693, nll_loss=2.158, ppl=4.46, wps=537870, ups=1.09, wpb=494775, bsz=16463.3, num_updates=41900, lr=0.000308975, gnorm=0.167, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=38903 epoch 025: 1428 / 1689 loss=3.693, nll_loss=2.158, ppl=4.46, wps=537870, ups=1.09, wpb=494775, bsz=16463.3, num_updates=41900, lr=0.000308975, gnorm=0.167, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=38903 epoch 025: 1428 / 1689 loss=3.693, nll_loss=2.158, ppl=4.46, wps=537870, ups=1.09, wpb=494775, bsz=16463.3, num_updates=41900, lr=0.000308975, gnorm=0.167, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=38903 epoch 025: 1428 / 1689 loss=3.693, nll_loss=2.158, ppl=4.46, wps=537870, ups=1.09, wpb=494775, bsz=16463.3, num_updates=41900, lr=0.000308975, gnorm=0.167, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=38903 epoch 025: 1428 / 1689 loss=3.693, nll_loss=2.158, ppl=4.46, wps=537870, ups=1.09, wpb=494775, bsz=16463.3, num_updates=41900, lr=0.000308975, gnorm=0.167, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=38903 epoch 025: 1428 / 1689 loss=3.693, nll_loss=2.158, ppl=4.46, wps=537870, ups=1.09, wpb=494775, bsz=16463.3, num_updates=41900, lr=0.000308975, gnorm=0.167, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=38903 epoch 025: 1428 / 1689 loss=3.693, nll_loss=2.158, ppl=4.46, wps=537870, ups=1.09, wpb=494775, bsz=16463.3, num_updates=41900, lr=0.000308975, gnorm=0.167, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=38903 epoch 025: 1428 / 1689 loss=3.693, nll_loss=2.158, ppl=4.46, wps=537870, ups=1.09, wpb=494775, bsz=16463.3, num_updates=41900, lr=0.000308975, gnorm=0.167, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=38903 epoch 025: 1428 / 1689 loss=3.693, nll_loss=2.158, ppl=4.46, wps=537870, ups=1.09, wpb=494775, bsz=16463.3, num_updates=41900, lr=0.000308975, gnorm=0.167, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=38903 epoch 025: 1428 / 1689 loss=3.693, nll_loss=2.158, ppl=4.46, wps=537870, ups=1.09, wpb=494775, bsz=16463.3, num_updates=41900, lr=0.000308975, gnorm=0.167, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=38903 epoch 025: 1428 / 1689 loss=3.693, nll_loss=2.158, ppl=4.46, wps=537870, ups=1.09, wpb=494775, bsz=16463.3, num_updates=41900, lr=0.000308975, gnorm=0.167, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=38903 epoch 025: 1428 / 1689 loss=3.693, nll_loss=2.158, ppl=4.46, wps=537870, ups=1.09, wpb=494775, bsz=16463.3, num_updates=41900, lr=0.000308975, gnorm=0.167, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=38903 epoch 025: 1528 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=545590, ups=1.1, wpb=494243, bsz=16800.6, num_updates=42000, lr=0.000308607, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=38993 epoch 025: 1528 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=545590, ups=1.1, wpb=494243, bsz=16800.6, num_updates=42000, lr=0.000308607, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=38993 epoch 025: 1528 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=545590, ups=1.1, wpb=494243, bsz=16800.6, num_updates=42000, lr=0.000308607, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=38993 epoch 025: 1528 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=545590, ups=1.1, wpb=494243, bsz=16800.6, num_updates=42000, lr=0.000308607, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=38993 epoch 025: 1528 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=545590, ups=1.1, wpb=494243, bsz=16800.6, num_updates=42000, lr=0.000308607, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=38993 epoch 025: 1528 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=545590, ups=1.1, wpb=494243, bsz=16800.6, num_updates=42000, lr=0.000308607, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=38993 epoch 025: 1528 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=545590, ups=1.1, wpb=494243, bsz=16800.6, num_updates=42000, lr=0.000308607, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=38993 epoch 025: 1528 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=545590, ups=1.1, wpb=494243, bsz=16800.6, num_updates=42000, lr=0.000308607, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=38993 epoch 025: 1528 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=545590, ups=1.1, wpb=494243, bsz=16800.6, num_updates=42000, lr=0.000308607, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=38993 epoch 025: 1528 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=545590, ups=1.1, wpb=494243, bsz=16800.6, num_updates=42000, lr=0.000308607, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=38993 epoch 025: 1528 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=545590, ups=1.1, wpb=494243, bsz=16800.6, num_updates=42000, lr=0.000308607, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=38993 epoch 025: 1528 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=545590, ups=1.1, wpb=494243, bsz=16800.6, num_updates=42000, lr=0.000308607, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=38993 epoch 025: 1528 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=545590, ups=1.1, wpb=494243, bsz=16800.6, num_updates=42000, lr=0.000308607, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=38993 epoch 025: 1528 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=545590, ups=1.1, wpb=494243, bsz=16800.6, num_updates=42000, lr=0.000308607, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=38993 epoch 025: 1528 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=545590, ups=1.1, wpb=494243, bsz=16800.6, num_updates=42000, lr=0.000308607, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=38993 epoch 025: 1528 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=545590, ups=1.1, wpb=494243, bsz=16800.6, num_updates=42000, lr=0.000308607, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=38993 epoch 025: 1528 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=545590, ups=1.1, wpb=494243, bsz=16800.6, num_updates=42000, lr=0.000308607, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=38993 epoch 025: 1528 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=545590, ups=1.1, wpb=494243, bsz=16800.6, num_updates=42000, lr=0.000308607, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=38993 epoch 025: 1528 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=545590, ups=1.1, wpb=494243, bsz=16800.6, num_updates=42000, lr=0.000308607, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=38993 epoch 025: 1528 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=545590, ups=1.1, wpb=494243, bsz=16800.6, num_updates=42000, lr=0.000308607, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=38993 epoch 025: 1528 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=545590, ups=1.1, wpb=494243, bsz=16800.6, num_updates=42000, lr=0.000308607, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=38993 epoch 025: 1528 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=545590, ups=1.1, wpb=494243, bsz=16800.6, num_updates=42000, lr=0.000308607, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=38993 epoch 025: 1528 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=545590, ups=1.1, wpb=494243, bsz=16800.6, num_updates=42000, lr=0.000308607, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=38993 epoch 025: 1528 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=545590, ups=1.1, wpb=494243, bsz=16800.6, num_updates=42000, lr=0.000308607, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=38993 epoch 025: 1528 / 1689 loss=3.697, nll_loss=2.163, ppl=4.48, wps=545590, ups=1.1, wpb=494243, bsz=16800.6, num_updates=42000, lr=0.000308607, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=38993 begin validation on "valid" subset epoch 025 | valid on 'valid' subset | loss 3.724 | nll_loss 2.167 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.717 epoch 025 | valid on 'valid' subset | loss 3.724 | nll_loss 2.167 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.717 epoch 025 | valid on 'valid' subset | loss 3.724 | nll_loss 2.167 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.717 epoch 025 | valid on 'valid' subset | loss 3.724 | nll_loss 2.167 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.717 epoch 025 | valid on 'valid' subset | loss 3.724 | nll_loss 2.167 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.717 epoch 025 | valid on 'valid' subset | loss 3.724 | nll_loss 2.167 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.717 epoch 025 | valid on 'valid' subset | loss 3.724 | nll_loss 2.167 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.717 epoch 025 | valid on 'valid' subset | loss 3.724 | nll_loss 2.167 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.717 epoch 025 | valid on 'valid' subset | loss 3.724 | nll_loss 2.167 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.717 epoch 025 | valid on 'valid' subset | loss 3.724 | nll_loss 2.167 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.717 epoch 025 | valid on 'valid' subset | loss 3.724 | nll_loss 2.167 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.717 epoch 025 | valid on 'valid' subset | loss 3.724 | nll_loss 2.167 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.717 epoch 025 | valid on 'valid' subset | loss 3.724 | nll_loss 2.167 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.717 epoch 025 | valid on 'valid' subset | loss 3.724 | nll_loss 2.167 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.717 epoch 025 | valid on 'valid' subset | loss 3.724 | nll_loss 2.167 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.717 epoch 025 | valid on 'valid' subset | loss 3.724 | nll_loss 2.167 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.717 epoch 025 | valid on 'valid' subset | loss 3.724 | nll_loss 2.167 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.717 epoch 025 | valid on 'valid' subset | loss 3.724 | nll_loss 2.167 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.717 epoch 025 | valid on 'valid' subset | loss 3.724 | nll_loss 2.167 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.717 epoch 025 | valid on 'valid' subset | loss 3.724 | nll_loss 2.167 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.717 epoch 025 | valid on 'valid' subset | loss 3.724 | nll_loss 2.167 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.717 epoch 025 | valid on 'valid' subset | loss 3.724 | nll_loss 2.167 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.717 epoch 025 | valid on 'valid' subset | loss 3.724 | nll_loss 2.167 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.717 epoch 025 | valid on 'valid' subset | loss 3.724 | nll_loss 2.167 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.717 epoch 025 | valid on 'valid' subset | loss 3.724 | nll_loss 2.167 | ppl 4.49 | wps 0 | wpb 44526 | bsz 2008 | num_updates 42000 | best_loss 3.717 epoch 025: 1628 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=476071, ups=0.96, wpb=495489, bsz=16276.9, num_updates=42100, lr=0.00030824, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=39097 epoch 025: 1628 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=476071, ups=0.96, wpb=495489, bsz=16276.9, num_updates=42100, lr=0.00030824, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=39097 epoch 025: 1628 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=476071, ups=0.96, wpb=495489, bsz=16276.9, num_updates=42100, lr=0.00030824, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=39097 epoch 025: 1628 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=476071, ups=0.96, wpb=495489, bsz=16276.9, num_updates=42100, lr=0.00030824, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=39097 epoch 025: 1628 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=476071, ups=0.96, wpb=495489, bsz=16276.9, num_updates=42100, lr=0.00030824, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=39097 epoch 025: 1628 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=476071, ups=0.96, wpb=495489, bsz=16276.9, num_updates=42100, lr=0.00030824, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=39097 epoch 025: 1628 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=476071, ups=0.96, wpb=495489, bsz=16276.9, num_updates=42100, lr=0.00030824, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=39097 epoch 025: 1628 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=476071, ups=0.96, wpb=495489, bsz=16276.9, num_updates=42100, lr=0.00030824, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=39097 epoch 025: 1628 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=476071, ups=0.96, wpb=495489, bsz=16276.9, num_updates=42100, lr=0.00030824, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=39097 epoch 025: 1628 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=476071, ups=0.96, wpb=495489, bsz=16276.9, num_updates=42100, lr=0.00030824, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=39097 epoch 025: 1628 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=476071, ups=0.96, wpb=495489, bsz=16276.9, num_updates=42100, lr=0.00030824, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=39097 epoch 025: 1628 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=476071, ups=0.96, wpb=495489, bsz=16276.9, num_updates=42100, lr=0.00030824, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=39097 epoch 025: 1628 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=476071, ups=0.96, wpb=495489, bsz=16276.9, num_updates=42100, lr=0.00030824, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=39097 epoch 025: 1628 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=476071, ups=0.96, wpb=495489, bsz=16276.9, num_updates=42100, lr=0.00030824, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=39097 epoch 025: 1628 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=476071, ups=0.96, wpb=495489, bsz=16276.9, num_updates=42100, lr=0.00030824, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=39097 epoch 025: 1628 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=476071, ups=0.96, wpb=495489, bsz=16276.9, num_updates=42100, lr=0.00030824, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=39097 epoch 025: 1628 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=476071, ups=0.96, wpb=495489, bsz=16276.9, num_updates=42100, lr=0.00030824, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=39097 epoch 025: 1628 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=476071, ups=0.96, wpb=495489, bsz=16276.9, num_updates=42100, lr=0.00030824, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=39097 epoch 025: 1628 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=476071, ups=0.96, wpb=495489, bsz=16276.9, num_updates=42100, lr=0.00030824, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=39097 epoch 025: 1628 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=476071, ups=0.96, wpb=495489, bsz=16276.9, num_updates=42100, lr=0.00030824, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=39097 epoch 025: 1628 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=476071, ups=0.96, wpb=495489, bsz=16276.9, num_updates=42100, lr=0.00030824, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=39097 epoch 025: 1628 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=476071, ups=0.96, wpb=495489, bsz=16276.9, num_updates=42100, lr=0.00030824, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=39097 epoch 025: 1628 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=476071, ups=0.96, wpb=495489, bsz=16276.9, num_updates=42100, lr=0.00030824, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=39097 epoch 025: 1628 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=476071, ups=0.96, wpb=495489, bsz=16276.9, num_updates=42100, lr=0.00030824, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=39097 epoch 025: 1628 / 1689 loss=3.694, nll_loss=2.159, ppl=4.47, wps=476071, ups=0.96, wpb=495489, bsz=16276.9, num_updates=42100, lr=0.00030824, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=20.4, wall=39097 end of epoch 25 (average epoch stats below) epoch 025 | loss 3.693 | nll_loss 2.158 | ppl 4.46 | wps 495464 | ups 1 | wpb 495107 | bsz 16508 | num_updates 42161 | lr 0.000308017 | gnorm 0.17 | clip 0 | loss_scale 2 | train_wall 1614 | gb_free 23.6 | wall 39152 epoch 025 | loss 3.693 | nll_loss 2.158 | ppl 4.46 | wps 495464 | ups 1 | wpb 495107 | bsz 16508 | num_updates 42161 | lr 0.000308017 | gnorm 0.17 | clip 0 | loss_scale 2 | train_wall 1614 | gb_free 23.6 | wall 39152 epoch 025 | loss 3.693 | nll_loss 2.158 | ppl 4.46 | wps 495464 | ups 1 | wpb 495107 | bsz 16508 | num_updates 42161 | lr 0.000308017 | gnorm 0.17 | clip 0 | loss_scale 2 | train_wall 1614 | gb_free 23.6 | wall 39152 epoch 025 | loss 3.693 | nll_loss 2.158 | ppl 4.46 | wps 495464 | ups 1 | wpb 495107 | bsz 16508 | num_updates 42161 | lr 0.000308017 | gnorm 0.17 | clip 0 | loss_scale 2 | train_wall 1614 | gb_free 23.6 | wall 39152 epoch 025 | loss 3.693 | nll_loss 2.158 | ppl 4.46 | wps 495464 | ups 1 | wpb 495107 | bsz 16508 | num_updates 42161 | lr 0.000308017 | gnorm 0.17 | clip 0 | loss_scale 2 | train_wall 1614 | gb_free 23.6 | wall 39152 epoch 025 | loss 3.693 | nll_loss 2.158 | ppl 4.46 | wps 495464 | ups 1 | wpb 495107 | bsz 16508 | num_updates 42161 | lr 0.000308017 | gnorm 0.17 | clip 0 | loss_scale 2 | train_wall 1614 | gb_free 23.6 | wall 39152 epoch 025 | loss 3.693 | nll_loss 2.158 | ppl 4.46 | wps 495464 | ups 1 | wpb 495107 | bsz 16508 | num_updates 42161 | lr 0.000308017 | gnorm 0.17 | clip 0 | loss_scale 2 | train_wall 1614 | gb_free 23.6 | wall 39152 epoch 025 | loss 3.693 | nll_loss 2.158 | ppl 4.46 | wps 495464 | ups 1 | wpb 495107 | bsz 16508 | num_updates 42161 | lr 0.000308017 | gnorm 0.17 | clip 0 | loss_scale 2 | train_wall 1614 | gb_free 23.6 | wall 39152 epoch 025 | loss 3.693 | nll_loss 2.158 | ppl 4.46 | wps 495464 | ups 1 | wpb 495107 | bsz 16508 | num_updates 42161 | lr 0.000308017 | gnorm 0.17 | clip 0 | loss_scale 2 | train_wall 1614 | gb_free 23.6 | wall 39152 epoch 025 | loss 3.693 | nll_loss 2.158 | ppl 4.46 | wps 495464 | ups 1 | wpb 495107 | bsz 16508 | num_updates 42161 | lr 0.000308017 | gnorm 0.17 | clip 0 | loss_scale 2 | train_wall 1614 | gb_free 23.6 | wall 39152 epoch 025 | loss 3.693 | nll_loss 2.158 | ppl 4.46 | wps 495464 | ups 1 | wpb 495107 | bsz 16508 | num_updates 42161 | lr 0.000308017 | gnorm 0.17 | clip 0 | loss_scale 2 | train_wall 1614 | gb_free 23.6 | wall 39152 epoch 025 | loss 3.693 | nll_loss 2.158 | ppl 4.46 | wps 495464 | ups 1 | wpb 495107 | bsz 16508 | num_updates 42161 | lr 0.000308017 | gnorm 0.17 | clip 0 | loss_scale 2 | train_wall 1614 | gb_free 23.6 | wall 39152 epoch 025 | loss 3.693 | nll_loss 2.158 | ppl 4.46 | wps 495464 | ups 1 | wpb 495107 | bsz 16508 | num_updates 42161 | lr 0.000308017 | gnorm 0.17 | clip 0 | loss_scale 2 | train_wall 1614 | gb_free 23.6 | wall 39152 epoch 025 | loss 3.693 | nll_loss 2.158 | ppl 4.46 | wps 495464 | ups 1 | wpb 495107 | bsz 16508 | num_updates 42161 | lr 0.000308017 | gnorm 0.17 | clip 0 | loss_scale 2 | train_wall 1614 | gb_free 23.6 | wall 39152 epoch 025 | loss 3.693 | nll_loss 2.158 | ppl 4.46 | wps 495464 | ups 1 | wpb 495107 | bsz 16508 | num_updates 42161 | lr 0.000308017 | gnorm 0.17 | clip 0 | loss_scale 2 | train_wall 1614 | gb_free 23.6 | wall 39152 epoch 025 | loss 3.693 | nll_loss 2.158 | ppl 4.46 | wps 495464 | ups 1 | wpb 495107 | bsz 16508 | num_updates 42161 | lr 0.000308017 | gnorm 0.17 | clip 0 | loss_scale 2 | train_wall 1614 | gb_free 23.6 | wall 39152 epoch 025 | loss 3.693 | nll_loss 2.158 | ppl 4.46 | wps 495464 | ups 1 | wpb 495107 | bsz 16508 | num_updates 42161 | lr 0.000308017 | gnorm 0.17 | clip 0 | loss_scale 2 | train_wall 1614 | gb_free 23.6 | wall 39152 epoch 025 | loss 3.693 | nll_loss 2.158 | ppl 4.46 | wps 495464 | ups 1 | wpb 495107 | bsz 16508 | num_updates 42161 | lr 0.000308017 | gnorm 0.17 | clip 0 | loss_scale 2 | train_wall 1614 | gb_free 23.6 | wall 39152 epoch 025 | loss 3.693 | nll_loss 2.158 | ppl 4.46 | wps 495464 | ups 1 | wpb 495107 | bsz 16508 | num_updates 42161 | lr 0.000308017 | gnorm 0.17 | clip 0 | loss_scale 2 | train_wall 1614 | gb_free 23.6 | wall 39152 epoch 025 | loss 3.693 | nll_loss 2.158 | ppl 4.46 | wps 495464 | ups 1 | wpb 495107 | bsz 16508 | num_updates 42161 | lr 0.000308017 | gnorm 0.17 | clip 0 | loss_scale 2 | train_wall 1614 | gb_free 23.6 | wall 39152 epoch 025 | loss 3.693 | nll_loss 2.158 | ppl 4.46 | wps 495464 | ups 1 | wpb 495107 | bsz 16508 | num_updates 42161 | lr 0.000308017 | gnorm 0.17 | clip 0 | loss_scale 2 | train_wall 1614 | gb_free 23.6 | wall 39152 epoch 025 | loss 3.693 | nll_loss 2.158 | ppl 4.46 | wps 495464 | ups 1 | wpb 495107 | bsz 16508 | num_updates 42161 | lr 0.000308017 | gnorm 0.17 | clip 0 | loss_scale 2 | train_wall 1614 | gb_free 23.6 | wall 39152 epoch 025 | loss 3.693 | nll_loss 2.158 | ppl 4.46 | wps 495464 | ups 1 | wpb 495107 | bsz 16508 | num_updates 42161 | lr 0.000308017 | gnorm 0.17 | clip 0 | loss_scale 2 | train_wall 1614 | gb_free 23.6 | wall 39152 epoch 025 | loss 3.693 | nll_loss 2.158 | ppl 4.46 | wps 495464 | ups 1 | wpb 495107 | bsz 16508 | num_updates 42161 | lr 0.000308017 | gnorm 0.17 | clip 0 | loss_scale 2 | train_wall 1614 | gb_free 23.6 | wall 39152 epoch 025 | loss 3.693 | nll_loss 2.158 | ppl 4.46 | wps 495464 | ups 1 | wpb 495107 | bsz 16508 | num_updates 42161 | lr 0.000308017 | gnorm 0.17 | clip 0 | loss_scale 2 | train_wall 1614 | gb_free 23.6 | wall 39152 Start iterating over samples epoch 026: 39 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=540612, ups=1.1, wpb=491886, bsz=16503.8, num_updates=42200, lr=0.000307875, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=39188 epoch 026: 39 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=540612, ups=1.1, wpb=491886, bsz=16503.8, num_updates=42200, lr=0.000307875, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=39188 epoch 026: 39 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=540612, ups=1.1, wpb=491886, bsz=16503.8, num_updates=42200, lr=0.000307875, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=39188 epoch 026: 39 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=540612, ups=1.1, wpb=491886, bsz=16503.8, num_updates=42200, lr=0.000307875, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=39188 epoch 026: 39 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=540612, ups=1.1, wpb=491886, bsz=16503.8, num_updates=42200, lr=0.000307875, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=39188 epoch 026: 39 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=540612, ups=1.1, wpb=491886, bsz=16503.8, num_updates=42200, lr=0.000307875, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=39188 epoch 026: 39 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=540612, ups=1.1, wpb=491886, bsz=16503.8, num_updates=42200, lr=0.000307875, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=39188 epoch 026: 39 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=540612, ups=1.1, wpb=491886, bsz=16503.8, num_updates=42200, lr=0.000307875, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=39188 epoch 026: 39 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=540612, ups=1.1, wpb=491886, bsz=16503.8, num_updates=42200, lr=0.000307875, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=39188 epoch 026: 39 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=540612, ups=1.1, wpb=491886, bsz=16503.8, num_updates=42200, lr=0.000307875, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=39188 epoch 026: 39 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=540612, ups=1.1, wpb=491886, bsz=16503.8, num_updates=42200, lr=0.000307875, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=39188 epoch 026: 39 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=540612, ups=1.1, wpb=491886, bsz=16503.8, num_updates=42200, lr=0.000307875, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=39188 epoch 026: 39 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=540612, ups=1.1, wpb=491886, bsz=16503.8, num_updates=42200, lr=0.000307875, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=39188 epoch 026: 39 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=540612, ups=1.1, wpb=491886, bsz=16503.8, num_updates=42200, lr=0.000307875, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=39188 epoch 026: 39 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=540612, ups=1.1, wpb=491886, bsz=16503.8, num_updates=42200, lr=0.000307875, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=39188 epoch 026: 39 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=540612, ups=1.1, wpb=491886, bsz=16503.8, num_updates=42200, lr=0.000307875, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=39188 epoch 026: 39 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=540612, ups=1.1, wpb=491886, bsz=16503.8, num_updates=42200, lr=0.000307875, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=39188 epoch 026: 39 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=540612, ups=1.1, wpb=491886, bsz=16503.8, num_updates=42200, lr=0.000307875, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=39188 epoch 026: 39 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=540612, ups=1.1, wpb=491886, bsz=16503.8, num_updates=42200, lr=0.000307875, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=39188 epoch 026: 39 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=540612, ups=1.1, wpb=491886, bsz=16503.8, num_updates=42200, lr=0.000307875, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=39188 epoch 026: 39 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=540612, ups=1.1, wpb=491886, bsz=16503.8, num_updates=42200, lr=0.000307875, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=39188 epoch 026: 39 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=540612, ups=1.1, wpb=491886, bsz=16503.8, num_updates=42200, lr=0.000307875, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=39188 epoch 026: 39 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=540612, ups=1.1, wpb=491886, bsz=16503.8, num_updates=42200, lr=0.000307875, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=39188 epoch 026: 39 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=540612, ups=1.1, wpb=491886, bsz=16503.8, num_updates=42200, lr=0.000307875, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=39188 epoch 026: 39 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=540612, ups=1.1, wpb=491886, bsz=16503.8, num_updates=42200, lr=0.000307875, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=39188 epoch 026: 39 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=540612, ups=1.1, wpb=491886, bsz=16503.8, num_updates=42200, lr=0.000307875, gnorm=0.174, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=39188 epoch 026: 139 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=551781, ups=1.11, wpb=495266, bsz=16539.5, num_updates=42300, lr=0.00030751, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=39278 epoch 026: 139 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=551781, ups=1.11, wpb=495266, bsz=16539.5, num_updates=42300, lr=0.00030751, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=39278 epoch 026: 139 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=551781, ups=1.11, wpb=495266, bsz=16539.5, num_updates=42300, lr=0.00030751, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=39278 epoch 026: 139 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=551781, ups=1.11, wpb=495266, bsz=16539.5, num_updates=42300, lr=0.00030751, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=39278 epoch 026: 139 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=551781, ups=1.11, wpb=495266, bsz=16539.5, num_updates=42300, lr=0.00030751, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=39278 epoch 026: 139 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=551781, ups=1.11, wpb=495266, bsz=16539.5, num_updates=42300, lr=0.00030751, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=39278 epoch 026: 139 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=551781, ups=1.11, wpb=495266, bsz=16539.5, num_updates=42300, lr=0.00030751, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=39278 epoch 026: 139 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=551781, ups=1.11, wpb=495266, bsz=16539.5, num_updates=42300, lr=0.00030751, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=39278 epoch 026: 139 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=551781, ups=1.11, wpb=495266, bsz=16539.5, num_updates=42300, lr=0.00030751, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=39278 epoch 026: 139 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=551781, ups=1.11, wpb=495266, bsz=16539.5, num_updates=42300, lr=0.00030751, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=39278 epoch 026: 139 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=551781, ups=1.11, wpb=495266, bsz=16539.5, num_updates=42300, lr=0.00030751, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=39278 epoch 026: 139 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=551781, ups=1.11, wpb=495266, bsz=16539.5, num_updates=42300, lr=0.00030751, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=39278 epoch 026: 139 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=551781, ups=1.11, wpb=495266, bsz=16539.5, num_updates=42300, lr=0.00030751, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=39278 epoch 026: 139 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=551781, ups=1.11, wpb=495266, bsz=16539.5, num_updates=42300, lr=0.00030751, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=39278 epoch 026: 139 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=551781, ups=1.11, wpb=495266, bsz=16539.5, num_updates=42300, lr=0.00030751, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=39278 epoch 026: 139 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=551781, ups=1.11, wpb=495266, bsz=16539.5, num_updates=42300, lr=0.00030751, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=39278 epoch 026: 139 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=551781, ups=1.11, wpb=495266, bsz=16539.5, num_updates=42300, lr=0.00030751, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=39278 epoch 026: 139 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=551781, ups=1.11, wpb=495266, bsz=16539.5, num_updates=42300, lr=0.00030751, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=39278 epoch 026: 139 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=551781, ups=1.11, wpb=495266, bsz=16539.5, num_updates=42300, lr=0.00030751, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=39278 epoch 026: 139 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=551781, ups=1.11, wpb=495266, bsz=16539.5, num_updates=42300, lr=0.00030751, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=39278 epoch 026: 139 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=551781, ups=1.11, wpb=495266, bsz=16539.5, num_updates=42300, lr=0.00030751, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=39278 epoch 026: 139 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=551781, ups=1.11, wpb=495266, bsz=16539.5, num_updates=42300, lr=0.00030751, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=39278 epoch 026: 139 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=551781, ups=1.11, wpb=495266, bsz=16539.5, num_updates=42300, lr=0.00030751, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=39278 epoch 026: 139 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=551781, ups=1.11, wpb=495266, bsz=16539.5, num_updates=42300, lr=0.00030751, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=39278 epoch 026: 139 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=551781, ups=1.11, wpb=495266, bsz=16539.5, num_updates=42300, lr=0.00030751, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=39278 epoch 026: 139 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=551781, ups=1.11, wpb=495266, bsz=16539.5, num_updates=42300, lr=0.00030751, gnorm=0.16, clip=0, loss_scale=2, train_wall=88, gb_free=20.3, wall=39278 epoch 026: 239 / 1689 loss=3.685, nll_loss=2.149, ppl=4.44, wps=551601, ups=1.11, wpb=495855, bsz=16865.8, num_updates=42400, lr=0.000307148, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=39368 epoch 026: 239 / 1689 loss=3.685, nll_loss=2.149, ppl=4.44, wps=551601, ups=1.11, wpb=495855, bsz=16865.8, num_updates=42400, lr=0.000307148, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=39368 epoch 026: 239 / 1689 loss=3.685, nll_loss=2.149, ppl=4.44, wps=551601, ups=1.11, wpb=495855, bsz=16865.8, num_updates=42400, lr=0.000307148, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=39368 epoch 026: 239 / 1689 loss=3.685, nll_loss=2.149, ppl=4.44, wps=551601, ups=1.11, wpb=495855, bsz=16865.8, num_updates=42400, lr=0.000307148, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=39368 epoch 026: 239 / 1689 loss=3.685, nll_loss=2.149, ppl=4.44, wps=551601, ups=1.11, wpb=495855, bsz=16865.8, num_updates=42400, lr=0.000307148, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=39368 epoch 026: 239 / 1689 loss=3.685, nll_loss=2.149, ppl=4.44, wps=551601, ups=1.11, wpb=495855, bsz=16865.8, num_updates=42400, lr=0.000307148, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=39368 epoch 026: 239 / 1689 loss=3.685, nll_loss=2.149, ppl=4.44, wps=551601, ups=1.11, wpb=495855, bsz=16865.8, num_updates=42400, lr=0.000307148, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=39368 epoch 026: 239 / 1689 loss=3.685, nll_loss=2.149, ppl=4.44, wps=551601, ups=1.11, wpb=495855, bsz=16865.8, num_updates=42400, lr=0.000307148, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=39368 epoch 026: 239 / 1689 loss=3.685, nll_loss=2.149, ppl=4.44, wps=551601, ups=1.11, wpb=495855, bsz=16865.8, num_updates=42400, lr=0.000307148, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=39368 epoch 026: 239 / 1689 loss=3.685, nll_loss=2.149, ppl=4.44, wps=551601, ups=1.11, wpb=495855, bsz=16865.8, num_updates=42400, lr=0.000307148, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=39368 epoch 026: 239 / 1689 loss=3.685, nll_loss=2.149, ppl=4.44, wps=551601, ups=1.11, wpb=495855, bsz=16865.8, num_updates=42400, lr=0.000307148, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=39368 epoch 026: 239 / 1689 loss=3.685, nll_loss=2.149, ppl=4.44, wps=551601, ups=1.11, wpb=495855, bsz=16865.8, num_updates=42400, lr=0.000307148, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=39368 epoch 026: 239 / 1689 loss=3.685, nll_loss=2.149, ppl=4.44, wps=551601, ups=1.11, wpb=495855, bsz=16865.8, num_updates=42400, lr=0.000307148, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=39368 epoch 026: 239 / 1689 loss=3.685, nll_loss=2.149, ppl=4.44, wps=551601, ups=1.11, wpb=495855, bsz=16865.8, num_updates=42400, lr=0.000307148, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=39368 epoch 026: 239 / 1689 loss=3.685, nll_loss=2.149, ppl=4.44, wps=551601, ups=1.11, wpb=495855, bsz=16865.8, num_updates=42400, lr=0.000307148, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=39368 epoch 026: 239 / 1689 loss=3.685, nll_loss=2.149, ppl=4.44, wps=551601, ups=1.11, wpb=495855, bsz=16865.8, num_updates=42400, lr=0.000307148, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=39368 epoch 026: 239 / 1689 loss=3.685, nll_loss=2.149, ppl=4.44, wps=551601, ups=1.11, wpb=495855, bsz=16865.8, num_updates=42400, lr=0.000307148, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=39368 epoch 026: 239 / 1689 loss=3.685, nll_loss=2.149, ppl=4.44, wps=551601, ups=1.11, wpb=495855, bsz=16865.8, num_updates=42400, lr=0.000307148, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=39368 epoch 026: 239 / 1689 loss=3.685, nll_loss=2.149, ppl=4.44, wps=551601, ups=1.11, wpb=495855, bsz=16865.8, num_updates=42400, lr=0.000307148, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=39368 epoch 026: 239 / 1689 loss=3.685, nll_loss=2.149, ppl=4.44, wps=551601, ups=1.11, wpb=495855, bsz=16865.8, num_updates=42400, lr=0.000307148, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=39368 epoch 026: 239 / 1689 loss=3.685, nll_loss=2.149, ppl=4.44, wps=551601, ups=1.11, wpb=495855, bsz=16865.8, num_updates=42400, lr=0.000307148, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=39368 epoch 026: 239 / 1689 loss=3.685, nll_loss=2.149, ppl=4.44, wps=551601, ups=1.11, wpb=495855, bsz=16865.8, num_updates=42400, lr=0.000307148, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=39368 epoch 026: 239 / 1689 loss=3.685, nll_loss=2.149, ppl=4.44, wps=551601, ups=1.11, wpb=495855, bsz=16865.8, num_updates=42400, lr=0.000307148, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=39368 epoch 026: 239 / 1689 loss=3.685, nll_loss=2.149, ppl=4.44, wps=551601, ups=1.11, wpb=495855, bsz=16865.8, num_updates=42400, lr=0.000307148, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=39368 epoch 026: 239 / 1689 loss=3.685, nll_loss=2.149, ppl=4.44, wps=551601, ups=1.11, wpb=495855, bsz=16865.8, num_updates=42400, lr=0.000307148, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=39368 epoch 026: 239 / 1689 loss=3.685, nll_loss=2.149, ppl=4.44, wps=551601, ups=1.11, wpb=495855, bsz=16865.8, num_updates=42400, lr=0.000307148, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=22.2, wall=39368 epoch 026: 339 / 1689 loss=3.689, nll_loss=2.153, ppl=4.45, wps=545773, ups=1.1, wpb=494648, bsz=16738.4, num_updates=42500, lr=0.000306786, gnorm=0.16, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=39459 epoch 026: 339 / 1689 loss=3.689, nll_loss=2.153, ppl=4.45, wps=545773, ups=1.1, wpb=494648, bsz=16738.4, num_updates=42500, lr=0.000306786, gnorm=0.16, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=39459 epoch 026: 339 / 1689 loss=3.689, nll_loss=2.153, ppl=4.45, wps=545773, ups=1.1, wpb=494648, bsz=16738.4, num_updates=42500, lr=0.000306786, gnorm=0.16, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=39459 epoch 026: 339 / 1689 loss=3.689, nll_loss=2.153, ppl=4.45, wps=545773, ups=1.1, wpb=494648, bsz=16738.4, num_updates=42500, lr=0.000306786, gnorm=0.16, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=39459 epoch 026: 339 / 1689 loss=3.689, nll_loss=2.153, ppl=4.45, wps=545773, ups=1.1, wpb=494648, bsz=16738.4, num_updates=42500, lr=0.000306786, gnorm=0.16, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=39459 epoch 026: 339 / 1689 loss=3.689, nll_loss=2.153, ppl=4.45, wps=545773, ups=1.1, wpb=494648, bsz=16738.4, num_updates=42500, lr=0.000306786, gnorm=0.16, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=39459 epoch 026: 339 / 1689 loss=3.689, nll_loss=2.153, ppl=4.45, wps=545773, ups=1.1, wpb=494648, bsz=16738.4, num_updates=42500, lr=0.000306786, gnorm=0.16, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=39459 epoch 026: 339 / 1689 loss=3.689, nll_loss=2.153, ppl=4.45, wps=545773, ups=1.1, wpb=494648, bsz=16738.4, num_updates=42500, lr=0.000306786, gnorm=0.16, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=39459 epoch 026: 339 / 1689 loss=3.689, nll_loss=2.153, ppl=4.45, wps=545773, ups=1.1, wpb=494648, bsz=16738.4, num_updates=42500, lr=0.000306786, gnorm=0.16, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=39459 epoch 026: 339 / 1689 loss=3.689, nll_loss=2.153, ppl=4.45, wps=545773, ups=1.1, wpb=494648, bsz=16738.4, num_updates=42500, lr=0.000306786, gnorm=0.16, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=39459 epoch 026: 339 / 1689 loss=3.689, nll_loss=2.153, ppl=4.45, wps=545773, ups=1.1, wpb=494648, bsz=16738.4, num_updates=42500, lr=0.000306786, gnorm=0.16, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=39459 epoch 026: 339 / 1689 loss=3.689, nll_loss=2.153, ppl=4.45, wps=545773, ups=1.1, wpb=494648, bsz=16738.4, num_updates=42500, lr=0.000306786, gnorm=0.16, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=39459 epoch 026: 339 / 1689 loss=3.689, nll_loss=2.153, ppl=4.45, wps=545773, ups=1.1, wpb=494648, bsz=16738.4, num_updates=42500, lr=0.000306786, gnorm=0.16, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=39459 epoch 026: 339 / 1689 loss=3.689, nll_loss=2.153, ppl=4.45, wps=545773, ups=1.1, wpb=494648, bsz=16738.4, num_updates=42500, lr=0.000306786, gnorm=0.16, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=39459 epoch 026: 339 / 1689 loss=3.689, nll_loss=2.153, ppl=4.45, wps=545773, ups=1.1, wpb=494648, bsz=16738.4, num_updates=42500, lr=0.000306786, gnorm=0.16, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=39459 epoch 026: 339 / 1689 loss=3.689, nll_loss=2.153, ppl=4.45, wps=545773, ups=1.1, wpb=494648, bsz=16738.4, num_updates=42500, lr=0.000306786, gnorm=0.16, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=39459 epoch 026: 339 / 1689 loss=3.689, nll_loss=2.153, ppl=4.45, wps=545773, ups=1.1, wpb=494648, bsz=16738.4, num_updates=42500, lr=0.000306786, gnorm=0.16, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=39459 epoch 026: 339 / 1689 loss=3.689, nll_loss=2.153, ppl=4.45, wps=545773, ups=1.1, wpb=494648, bsz=16738.4, num_updates=42500, lr=0.000306786, gnorm=0.16, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=39459 epoch 026: 339 / 1689 loss=3.689, nll_loss=2.153, ppl=4.45, wps=545773, ups=1.1, wpb=494648, bsz=16738.4, num_updates=42500, lr=0.000306786, gnorm=0.16, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=39459 epoch 026: 339 / 1689 loss=3.689, nll_loss=2.153, ppl=4.45, wps=545773, ups=1.1, wpb=494648, bsz=16738.4, num_updates=42500, lr=0.000306786, gnorm=0.16, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=39459 epoch 026: 339 / 1689 loss=3.689, nll_loss=2.153, ppl=4.45, wps=545773, ups=1.1, wpb=494648, bsz=16738.4, num_updates=42500, lr=0.000306786, gnorm=0.16, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=39459 epoch 026: 339 / 1689 loss=3.689, nll_loss=2.153, ppl=4.45, wps=545773, ups=1.1, wpb=494648, bsz=16738.4, num_updates=42500, lr=0.000306786, gnorm=0.16, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=39459 epoch 026: 339 / 1689 loss=3.689, nll_loss=2.153, ppl=4.45, wps=545773, ups=1.1, wpb=494648, bsz=16738.4, num_updates=42500, lr=0.000306786, gnorm=0.16, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=39459 epoch 026: 339 / 1689 loss=3.689, nll_loss=2.153, ppl=4.45, wps=545773, ups=1.1, wpb=494648, bsz=16738.4, num_updates=42500, lr=0.000306786, gnorm=0.16, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=39459 epoch 026: 339 / 1689 loss=3.689, nll_loss=2.153, ppl=4.45, wps=545773, ups=1.1, wpb=494648, bsz=16738.4, num_updates=42500, lr=0.000306786, gnorm=0.16, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=39459 epoch 026: 339 / 1689 loss=3.689, nll_loss=2.153, ppl=4.45, wps=545773, ups=1.1, wpb=494648, bsz=16738.4, num_updates=42500, lr=0.000306786, gnorm=0.16, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=39459 epoch 026: 439 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=546952, ups=1.11, wpb=494679, bsz=16171.7, num_updates=42600, lr=0.000306426, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=39549 epoch 026: 439 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=546952, ups=1.11, wpb=494679, bsz=16171.7, num_updates=42600, lr=0.000306426, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=39549 epoch 026: 439 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=546952, ups=1.11, wpb=494679, bsz=16171.7, num_updates=42600, lr=0.000306426, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=39549 epoch 026: 439 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=546952, ups=1.11, wpb=494679, bsz=16171.7, num_updates=42600, lr=0.000306426, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=39549 epoch 026: 439 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=546952, ups=1.11, wpb=494679, bsz=16171.7, num_updates=42600, lr=0.000306426, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=39549 epoch 026: 439 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=546952, ups=1.11, wpb=494679, bsz=16171.7, num_updates=42600, lr=0.000306426, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=39549 epoch 026: 439 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=546952, ups=1.11, wpb=494679, bsz=16171.7, num_updates=42600, lr=0.000306426, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=39549 epoch 026: 439 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=546952, ups=1.11, wpb=494679, bsz=16171.7, num_updates=42600, lr=0.000306426, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=39549 epoch 026: 439 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=546952, ups=1.11, wpb=494679, bsz=16171.7, num_updates=42600, lr=0.000306426, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=39549 epoch 026: 439 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=546952, ups=1.11, wpb=494679, bsz=16171.7, num_updates=42600, lr=0.000306426, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=39549 epoch 026: 439 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=546952, ups=1.11, wpb=494679, bsz=16171.7, num_updates=42600, lr=0.000306426, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=39549 epoch 026: 439 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=546952, ups=1.11, wpb=494679, bsz=16171.7, num_updates=42600, lr=0.000306426, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=39549 epoch 026: 439 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=546952, ups=1.11, wpb=494679, bsz=16171.7, num_updates=42600, lr=0.000306426, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=39549 epoch 026: 439 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=546952, ups=1.11, wpb=494679, bsz=16171.7, num_updates=42600, lr=0.000306426, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=39549 epoch 026: 439 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=546952, ups=1.11, wpb=494679, bsz=16171.7, num_updates=42600, lr=0.000306426, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=39549 epoch 026: 439 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=546952, ups=1.11, wpb=494679, bsz=16171.7, num_updates=42600, lr=0.000306426, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=39549 epoch 026: 439 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=546952, ups=1.11, wpb=494679, bsz=16171.7, num_updates=42600, lr=0.000306426, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=39549 epoch 026: 439 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=546952, ups=1.11, wpb=494679, bsz=16171.7, num_updates=42600, lr=0.000306426, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=39549 epoch 026: 439 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=546952, ups=1.11, wpb=494679, bsz=16171.7, num_updates=42600, lr=0.000306426, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=39549 epoch 026: 439 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=546952, ups=1.11, wpb=494679, bsz=16171.7, num_updates=42600, lr=0.000306426, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=39549 epoch 026: 439 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=546952, ups=1.11, wpb=494679, bsz=16171.7, num_updates=42600, lr=0.000306426, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=39549 epoch 026: 439 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=546952, ups=1.11, wpb=494679, bsz=16171.7, num_updates=42600, lr=0.000306426, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=39549 epoch 026: 439 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=546952, ups=1.11, wpb=494679, bsz=16171.7, num_updates=42600, lr=0.000306426, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=39549 epoch 026: 439 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=546952, ups=1.11, wpb=494679, bsz=16171.7, num_updates=42600, lr=0.000306426, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=39549 epoch 026: 439 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=546952, ups=1.11, wpb=494679, bsz=16171.7, num_updates=42600, lr=0.000306426, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=39549 epoch 026: 439 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=546952, ups=1.11, wpb=494679, bsz=16171.7, num_updates=42600, lr=0.000306426, gnorm=0.175, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=39549 epoch 026: 539 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=546981, ups=1.1, wpb=495568, bsz=16572.7, num_updates=42700, lr=0.000306067, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=39640 epoch 026: 539 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=546981, ups=1.1, wpb=495568, bsz=16572.7, num_updates=42700, lr=0.000306067, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=39640 epoch 026: 539 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=546981, ups=1.1, wpb=495568, bsz=16572.7, num_updates=42700, lr=0.000306067, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=39640 epoch 026: 539 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=546981, ups=1.1, wpb=495568, bsz=16572.7, num_updates=42700, lr=0.000306067, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=39640 epoch 026: 539 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=546981, ups=1.1, wpb=495568, bsz=16572.7, num_updates=42700, lr=0.000306067, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=39640 epoch 026: 539 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=546981, ups=1.1, wpb=495568, bsz=16572.7, num_updates=42700, lr=0.000306067, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=39640 epoch 026: 539 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=546981, ups=1.1, wpb=495568, bsz=16572.7, num_updates=42700, lr=0.000306067, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=39640 epoch 026: 539 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=546981, ups=1.1, wpb=495568, bsz=16572.7, num_updates=42700, lr=0.000306067, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=39640 epoch 026: 539 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=546981, ups=1.1, wpb=495568, bsz=16572.7, num_updates=42700, lr=0.000306067, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=39640 epoch 026: 539 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=546981, ups=1.1, wpb=495568, bsz=16572.7, num_updates=42700, lr=0.000306067, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=39640 epoch 026: 539 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=546981, ups=1.1, wpb=495568, bsz=16572.7, num_updates=42700, lr=0.000306067, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=39640 epoch 026: 539 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=546981, ups=1.1, wpb=495568, bsz=16572.7, num_updates=42700, lr=0.000306067, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=39640 epoch 026: 539 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=546981, ups=1.1, wpb=495568, bsz=16572.7, num_updates=42700, lr=0.000306067, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=39640 epoch 026: 539 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=546981, ups=1.1, wpb=495568, bsz=16572.7, num_updates=42700, lr=0.000306067, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=39640 epoch 026: 539 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=546981, ups=1.1, wpb=495568, bsz=16572.7, num_updates=42700, lr=0.000306067, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=39640 epoch 026: 539 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=546981, ups=1.1, wpb=495568, bsz=16572.7, num_updates=42700, lr=0.000306067, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=39640 epoch 026: 539 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=546981, ups=1.1, wpb=495568, bsz=16572.7, num_updates=42700, lr=0.000306067, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=39640 epoch 026: 539 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=546981, ups=1.1, wpb=495568, bsz=16572.7, num_updates=42700, lr=0.000306067, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=39640 epoch 026: 539 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=546981, ups=1.1, wpb=495568, bsz=16572.7, num_updates=42700, lr=0.000306067, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=39640 epoch 026: 539 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=546981, ups=1.1, wpb=495568, bsz=16572.7, num_updates=42700, lr=0.000306067, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=39640 epoch 026: 539 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=546981, ups=1.1, wpb=495568, bsz=16572.7, num_updates=42700, lr=0.000306067, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=39640 epoch 026: 539 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=546981, ups=1.1, wpb=495568, bsz=16572.7, num_updates=42700, lr=0.000306067, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=39640 epoch 026: 539 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=546981, ups=1.1, wpb=495568, bsz=16572.7, num_updates=42700, lr=0.000306067, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=39640 epoch 026: 539 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=546981, ups=1.1, wpb=495568, bsz=16572.7, num_updates=42700, lr=0.000306067, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=39640 epoch 026: 539 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=546981, ups=1.1, wpb=495568, bsz=16572.7, num_updates=42700, lr=0.000306067, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=39640 epoch 026: 539 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=546981, ups=1.1, wpb=495568, bsz=16572.7, num_updates=42700, lr=0.000306067, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=39640 epoch 026: 639 / 1689 loss=3.689, nll_loss=2.153, ppl=4.45, wps=552742, ups=1.12, wpb=495365, bsz=16146.4, num_updates=42800, lr=0.000305709, gnorm=0.166, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=39729 epoch 026: 639 / 1689 loss=3.689, nll_loss=2.153, ppl=4.45, wps=552742, ups=1.12, wpb=495365, bsz=16146.4, num_updates=42800, lr=0.000305709, gnorm=0.166, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=39729 epoch 026: 639 / 1689 loss=3.689, nll_loss=2.153, ppl=4.45, wps=552742, ups=1.12, wpb=495365, bsz=16146.4, num_updates=42800, lr=0.000305709, gnorm=0.166, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=39729 epoch 026: 639 / 1689 loss=3.689, nll_loss=2.153, ppl=4.45, wps=552742, ups=1.12, wpb=495365, bsz=16146.4, num_updates=42800, lr=0.000305709, gnorm=0.166, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=39729 epoch 026: 639 / 1689 loss=3.689, nll_loss=2.153, ppl=4.45, wps=552742, ups=1.12, wpb=495365, bsz=16146.4, num_updates=42800, lr=0.000305709, gnorm=0.166, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=39729 epoch 026: 639 / 1689 loss=3.689, nll_loss=2.153, ppl=4.45, wps=552742, ups=1.12, wpb=495365, bsz=16146.4, num_updates=42800, lr=0.000305709, gnorm=0.166, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=39729 epoch 026: 639 / 1689 loss=3.689, nll_loss=2.153, ppl=4.45, wps=552742, ups=1.12, wpb=495365, bsz=16146.4, num_updates=42800, lr=0.000305709, gnorm=0.166, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=39729 epoch 026: 639 / 1689 loss=3.689, nll_loss=2.153, ppl=4.45, wps=552742, ups=1.12, wpb=495365, bsz=16146.4, num_updates=42800, lr=0.000305709, gnorm=0.166, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=39729 epoch 026: 639 / 1689 loss=3.689, nll_loss=2.153, ppl=4.45, wps=552742, ups=1.12, wpb=495365, bsz=16146.4, num_updates=42800, lr=0.000305709, gnorm=0.166, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=39729 epoch 026: 639 / 1689 loss=3.689, nll_loss=2.153, ppl=4.45, wps=552742, ups=1.12, wpb=495365, bsz=16146.4, num_updates=42800, lr=0.000305709, gnorm=0.166, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=39729 epoch 026: 639 / 1689 loss=3.689, nll_loss=2.153, ppl=4.45, wps=552742, ups=1.12, wpb=495365, bsz=16146.4, num_updates=42800, lr=0.000305709, gnorm=0.166, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=39729 epoch 026: 639 / 1689 loss=3.689, nll_loss=2.153, ppl=4.45, wps=552742, ups=1.12, wpb=495365, bsz=16146.4, num_updates=42800, lr=0.000305709, gnorm=0.166, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=39729 epoch 026: 639 / 1689 loss=3.689, nll_loss=2.153, ppl=4.45, wps=552742, ups=1.12, wpb=495365, bsz=16146.4, num_updates=42800, lr=0.000305709, gnorm=0.166, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=39729 epoch 026: 639 / 1689 loss=3.689, nll_loss=2.153, ppl=4.45, wps=552742, ups=1.12, wpb=495365, bsz=16146.4, num_updates=42800, lr=0.000305709, gnorm=0.166, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=39729 epoch 026: 639 / 1689 loss=3.689, nll_loss=2.153, ppl=4.45, wps=552742, ups=1.12, wpb=495365, bsz=16146.4, num_updates=42800, lr=0.000305709, gnorm=0.166, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=39729 epoch 026: 639 / 1689 loss=3.689, nll_loss=2.153, ppl=4.45, wps=552742, ups=1.12, wpb=495365, bsz=16146.4, num_updates=42800, lr=0.000305709, gnorm=0.166, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=39729 epoch 026: 639 / 1689 loss=3.689, nll_loss=2.153, ppl=4.45, wps=552742, ups=1.12, wpb=495365, bsz=16146.4, num_updates=42800, lr=0.000305709, gnorm=0.166, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=39729 epoch 026: 639 / 1689 loss=3.689, nll_loss=2.153, ppl=4.45, wps=552742, ups=1.12, wpb=495365, bsz=16146.4, num_updates=42800, lr=0.000305709, gnorm=0.166, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=39729 epoch 026: 639 / 1689 loss=3.689, nll_loss=2.153, ppl=4.45, wps=552742, ups=1.12, wpb=495365, bsz=16146.4, num_updates=42800, lr=0.000305709, gnorm=0.166, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=39729 epoch 026: 639 / 1689 loss=3.689, nll_loss=2.153, ppl=4.45, wps=552742, ups=1.12, wpb=495365, bsz=16146.4, num_updates=42800, lr=0.000305709, gnorm=0.166, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=39729 epoch 026: 639 / 1689 loss=3.689, nll_loss=2.153, ppl=4.45, wps=552742, ups=1.12, wpb=495365, bsz=16146.4, num_updates=42800, lr=0.000305709, gnorm=0.166, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=39729 epoch 026: 639 / 1689 loss=3.689, nll_loss=2.153, ppl=4.45, wps=552742, ups=1.12, wpb=495365, bsz=16146.4, num_updates=42800, lr=0.000305709, gnorm=0.166, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=39729 epoch 026: 639 / 1689 loss=3.689, nll_loss=2.153, ppl=4.45, wps=552742, ups=1.12, wpb=495365, bsz=16146.4, num_updates=42800, lr=0.000305709, gnorm=0.166, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=39729 epoch 026: 639 / 1689 loss=3.689, nll_loss=2.153, ppl=4.45, wps=552742, ups=1.12, wpb=495365, bsz=16146.4, num_updates=42800, lr=0.000305709, gnorm=0.166, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=39729 epoch 026: 639 / 1689 loss=3.689, nll_loss=2.153, ppl=4.45, wps=552742, ups=1.12, wpb=495365, bsz=16146.4, num_updates=42800, lr=0.000305709, gnorm=0.166, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=39729 epoch 026: 639 / 1689 loss=3.689, nll_loss=2.153, ppl=4.45, wps=552742, ups=1.12, wpb=495365, bsz=16146.4, num_updates=42800, lr=0.000305709, gnorm=0.166, clip=0, loss_scale=4, train_wall=88, gb_free=21.8, wall=39729 epoch 026: 740 / 1689 loss=3.688, nll_loss=2.152, ppl=4.44, wps=543805, ups=1.1, wpb=496283, bsz=16687.9, num_updates=42900, lr=0.000305352, gnorm=0.178, clip=0, loss_scale=4, train_wall=90, gb_free=21.3, wall=39821 epoch 026: 740 / 1689 loss=3.688, nll_loss=2.152, ppl=4.44, wps=543805, ups=1.1, wpb=496283, bsz=16687.9, num_updates=42900, lr=0.000305352, gnorm=0.178, clip=0, loss_scale=4, train_wall=90, gb_free=21.3, wall=39821 epoch 026: 740 / 1689 loss=3.688, nll_loss=2.152, ppl=4.44, wps=543805, ups=1.1, wpb=496283, bsz=16687.9, num_updates=42900, lr=0.000305352, gnorm=0.178, clip=0, loss_scale=4, train_wall=90, gb_free=21.3, wall=39821 epoch 026: 740 / 1689 loss=3.688, nll_loss=2.152, ppl=4.44, wps=543805, ups=1.1, wpb=496283, bsz=16687.9, num_updates=42900, lr=0.000305352, gnorm=0.178, clip=0, loss_scale=4, train_wall=90, gb_free=21.3, wall=39821 epoch 026: 740 / 1689 loss=3.688, nll_loss=2.152, ppl=4.44, wps=543805, ups=1.1, wpb=496283, bsz=16687.9, num_updates=42900, lr=0.000305352, gnorm=0.178, clip=0, loss_scale=4, train_wall=90, gb_free=21.3, wall=39821 epoch 026: 740 / 1689 loss=3.688, nll_loss=2.152, ppl=4.44, wps=543805, ups=1.1, wpb=496283, bsz=16687.9, num_updates=42900, lr=0.000305352, gnorm=0.178, clip=0, loss_scale=4, train_wall=90, gb_free=21.3, wall=39821 epoch 026: 740 / 1689 loss=3.688, nll_loss=2.152, ppl=4.44, wps=543805, ups=1.1, wpb=496283, bsz=16687.9, num_updates=42900, lr=0.000305352, gnorm=0.178, clip=0, loss_scale=4, train_wall=90, gb_free=21.3, wall=39821 epoch 026: 740 / 1689 loss=3.688, nll_loss=2.152, ppl=4.44, wps=543805, ups=1.1, wpb=496283, bsz=16687.9, num_updates=42900, lr=0.000305352, gnorm=0.178, clip=0, loss_scale=4, train_wall=90, gb_free=21.3, wall=39821 epoch 026: 740 / 1689 loss=3.688, nll_loss=2.152, ppl=4.44, wps=543805, ups=1.1, wpb=496283, bsz=16687.9, num_updates=42900, lr=0.000305352, gnorm=0.178, clip=0, loss_scale=4, train_wall=90, gb_free=21.3, wall=39821 epoch 026: 740 / 1689 loss=3.688, nll_loss=2.152, ppl=4.44, wps=543805, ups=1.1, wpb=496283, bsz=16687.9, num_updates=42900, lr=0.000305352, gnorm=0.178, clip=0, loss_scale=4, train_wall=90, gb_free=21.3, wall=39821 epoch 026: 740 / 1689 loss=3.688, nll_loss=2.152, ppl=4.44, wps=543805, ups=1.1, wpb=496283, bsz=16687.9, num_updates=42900, lr=0.000305352, gnorm=0.178, clip=0, loss_scale=4, train_wall=90, gb_free=21.3, wall=39821 epoch 026: 740 / 1689 loss=3.688, nll_loss=2.152, ppl=4.44, wps=543805, ups=1.1, wpb=496283, bsz=16687.9, num_updates=42900, lr=0.000305352, gnorm=0.178, clip=0, loss_scale=4, train_wall=90, gb_free=21.3, wall=39821 epoch 026: 740 / 1689 loss=3.688, nll_loss=2.152, ppl=4.44, wps=543805, ups=1.1, wpb=496283, bsz=16687.9, num_updates=42900, lr=0.000305352, gnorm=0.178, clip=0, loss_scale=4, train_wall=90, gb_free=21.3, wall=39821 epoch 026: 740 / 1689 loss=3.688, nll_loss=2.152, ppl=4.44, wps=543805, ups=1.1, wpb=496283, bsz=16687.9, num_updates=42900, lr=0.000305352, gnorm=0.178, clip=0, loss_scale=4, train_wall=90, gb_free=21.3, wall=39821 epoch 026: 740 / 1689 loss=3.688, nll_loss=2.152, ppl=4.44, wps=543805, ups=1.1, wpb=496283, bsz=16687.9, num_updates=42900, lr=0.000305352, gnorm=0.178, clip=0, loss_scale=4, train_wall=90, gb_free=21.3, wall=39821 epoch 026: 740 / 1689 loss=3.688, nll_loss=2.152, ppl=4.44, wps=543805, ups=1.1, wpb=496283, bsz=16687.9, num_updates=42900, lr=0.000305352, gnorm=0.178, clip=0, loss_scale=4, train_wall=90, gb_free=21.3, wall=39821 epoch 026: 740 / 1689 loss=3.688, nll_loss=2.152, ppl=4.44, wps=543805, ups=1.1, wpb=496283, bsz=16687.9, num_updates=42900, lr=0.000305352, gnorm=0.178, clip=0, loss_scale=4, train_wall=90, gb_free=21.3, wall=39821 epoch 026: 740 / 1689 loss=3.688, nll_loss=2.152, ppl=4.44, wps=543805, ups=1.1, wpb=496283, bsz=16687.9, num_updates=42900, lr=0.000305352, gnorm=0.178, clip=0, loss_scale=4, train_wall=90, gb_free=21.3, wall=39821 epoch 026: 740 / 1689 loss=3.688, nll_loss=2.152, ppl=4.44, wps=543805, ups=1.1, wpb=496283, bsz=16687.9, num_updates=42900, lr=0.000305352, gnorm=0.178, clip=0, loss_scale=4, train_wall=90, gb_free=21.3, wall=39821 epoch 026: 740 / 1689 loss=3.688, nll_loss=2.152, ppl=4.44, wps=543805, ups=1.1, wpb=496283, bsz=16687.9, num_updates=42900, lr=0.000305352, gnorm=0.178, clip=0, loss_scale=4, train_wall=90, gb_free=21.3, wall=39821 epoch 026: 740 / 1689 loss=3.688, nll_loss=2.152, ppl=4.44, wps=543805, ups=1.1, wpb=496283, bsz=16687.9, num_updates=42900, lr=0.000305352, gnorm=0.178, clip=0, loss_scale=4, train_wall=90, gb_free=21.3, wall=39821 epoch 026: 740 / 1689 loss=3.688, nll_loss=2.152, ppl=4.44, wps=543805, ups=1.1, wpb=496283, bsz=16687.9, num_updates=42900, lr=0.000305352, gnorm=0.178, clip=0, loss_scale=4, train_wall=90, gb_free=21.3, wall=39821 epoch 026: 740 / 1689 loss=3.688, nll_loss=2.152, ppl=4.44, wps=543805, ups=1.1, wpb=496283, bsz=16687.9, num_updates=42900, lr=0.000305352, gnorm=0.178, clip=0, loss_scale=4, train_wall=90, gb_free=21.3, wall=39821 epoch 026: 740 / 1689 loss=3.688, nll_loss=2.152, ppl=4.44, wps=543805, ups=1.1, wpb=496283, bsz=16687.9, num_updates=42900, lr=0.000305352, gnorm=0.178, clip=0, loss_scale=4, train_wall=90, gb_free=21.3, wall=39821 epoch 026: 740 / 1689 loss=3.688, nll_loss=2.152, ppl=4.44, wps=543805, ups=1.1, wpb=496283, bsz=16687.9, num_updates=42900, lr=0.000305352, gnorm=0.178, clip=0, loss_scale=4, train_wall=90, gb_free=21.3, wall=39821 epoch 026: 740 / 1689 loss=3.688, nll_loss=2.152, ppl=4.44, wps=543805, ups=1.1, wpb=496283, bsz=16687.9, num_updates=42900, lr=0.000305352, gnorm=0.178, clip=0, loss_scale=4, train_wall=90, gb_free=21.3, wall=39821 epoch 026: 840 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=549775, ups=1.11, wpb=494928, bsz=16471.8, num_updates=43000, lr=0.000304997, gnorm=0.178, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=39911 epoch 026: 840 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=549775, ups=1.11, wpb=494928, bsz=16471.8, num_updates=43000, lr=0.000304997, gnorm=0.178, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=39911 epoch 026: 840 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=549775, ups=1.11, wpb=494928, bsz=16471.8, num_updates=43000, lr=0.000304997, gnorm=0.178, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=39911 epoch 026: 840 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=549775, ups=1.11, wpb=494928, bsz=16471.8, num_updates=43000, lr=0.000304997, gnorm=0.178, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=39911 epoch 026: 840 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=549775, ups=1.11, wpb=494928, bsz=16471.8, num_updates=43000, lr=0.000304997, gnorm=0.178, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=39911 epoch 026: 840 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=549775, ups=1.11, wpb=494928, bsz=16471.8, num_updates=43000, lr=0.000304997, gnorm=0.178, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=39911 epoch 026: 840 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=549775, ups=1.11, wpb=494928, bsz=16471.8, num_updates=43000, lr=0.000304997, gnorm=0.178, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=39911 epoch 026: 840 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=549775, ups=1.11, wpb=494928, bsz=16471.8, num_updates=43000, lr=0.000304997, gnorm=0.178, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=39911 epoch 026: 840 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=549775, ups=1.11, wpb=494928, bsz=16471.8, num_updates=43000, lr=0.000304997, gnorm=0.178, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=39911 epoch 026: 840 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=549775, ups=1.11, wpb=494928, bsz=16471.8, num_updates=43000, lr=0.000304997, gnorm=0.178, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=39911 epoch 026: 840 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=549775, ups=1.11, wpb=494928, bsz=16471.8, num_updates=43000, lr=0.000304997, gnorm=0.178, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=39911 epoch 026: 840 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=549775, ups=1.11, wpb=494928, bsz=16471.8, num_updates=43000, lr=0.000304997, gnorm=0.178, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=39911 epoch 026: 840 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=549775, ups=1.11, wpb=494928, bsz=16471.8, num_updates=43000, lr=0.000304997, gnorm=0.178, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=39911 epoch 026: 840 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=549775, ups=1.11, wpb=494928, bsz=16471.8, num_updates=43000, lr=0.000304997, gnorm=0.178, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=39911 epoch 026: 840 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=549775, ups=1.11, wpb=494928, bsz=16471.8, num_updates=43000, lr=0.000304997, gnorm=0.178, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=39911 epoch 026: 840 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=549775, ups=1.11, wpb=494928, bsz=16471.8, num_updates=43000, lr=0.000304997, gnorm=0.178, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=39911 epoch 026: 840 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=549775, ups=1.11, wpb=494928, bsz=16471.8, num_updates=43000, lr=0.000304997, gnorm=0.178, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=39911 epoch 026: 840 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=549775, ups=1.11, wpb=494928, bsz=16471.8, num_updates=43000, lr=0.000304997, gnorm=0.178, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=39911 epoch 026: 840 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=549775, ups=1.11, wpb=494928, bsz=16471.8, num_updates=43000, lr=0.000304997, gnorm=0.178, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=39911 epoch 026: 840 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=549775, ups=1.11, wpb=494928, bsz=16471.8, num_updates=43000, lr=0.000304997, gnorm=0.178, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=39911 epoch 026: 840 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=549775, ups=1.11, wpb=494928, bsz=16471.8, num_updates=43000, lr=0.000304997, gnorm=0.178, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=39911 epoch 026: 840 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=549775, ups=1.11, wpb=494928, bsz=16471.8, num_updates=43000, lr=0.000304997, gnorm=0.178, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=39911 epoch 026: 840 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=549775, ups=1.11, wpb=494928, bsz=16471.8, num_updates=43000, lr=0.000304997, gnorm=0.178, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=39911 epoch 026: 840 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=549775, ups=1.11, wpb=494928, bsz=16471.8, num_updates=43000, lr=0.000304997, gnorm=0.178, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=39911 epoch 026: 840 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=549775, ups=1.11, wpb=494928, bsz=16471.8, num_updates=43000, lr=0.000304997, gnorm=0.178, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=39911 epoch 026: 840 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=549775, ups=1.11, wpb=494928, bsz=16471.8, num_updates=43000, lr=0.000304997, gnorm=0.178, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=39911 begin validation on "valid" subset epoch 026 | valid on 'valid' subset | loss 3.717 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.717 epoch 026 | valid on 'valid' subset | loss 3.717 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.717 epoch 026 | valid on 'valid' subset | loss 3.717 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.717 epoch 026 | valid on 'valid' subset | loss 3.717 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.717 epoch 026 | valid on 'valid' subset | loss 3.717 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.717 epoch 026 | valid on 'valid' subset | loss 3.717 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.717 epoch 026 | valid on 'valid' subset | loss 3.717 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.717 epoch 026 | valid on 'valid' subset | loss 3.717 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.717 epoch 026 | valid on 'valid' subset | loss 3.717 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.717 epoch 026 | valid on 'valid' subset | loss 3.717 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.717 epoch 026 | valid on 'valid' subset | loss 3.717 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.717 epoch 026 | valid on 'valid' subset | loss 3.717 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.717 epoch 026 | valid on 'valid' subset | loss 3.717 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.717 epoch 026 | valid on 'valid' subset | loss 3.717 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.717 epoch 026 | valid on 'valid' subset | loss 3.717 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.717 epoch 026 | valid on 'valid' subset | loss 3.717 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.717 epoch 026 | valid on 'valid' subset | loss 3.717 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.717 epoch 026 | valid on 'valid' subset | loss 3.717 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.717 epoch 026 | valid on 'valid' subset | loss 3.717 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.717 epoch 026 | valid on 'valid' subset | loss 3.717 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.717 epoch 026 | valid on 'valid' subset | loss 3.717 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.717 epoch 026 | valid on 'valid' subset | loss 3.717 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.717 epoch 026 | valid on 'valid' subset | loss 3.717 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.717 epoch 026 | valid on 'valid' subset | loss 3.717 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.717 epoch 026 | valid on 'valid' subset | loss 3.717 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.717 epoch 026 | valid on 'valid' subset | loss 3.717 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 43000 | best_loss 3.717 epoch 026: 941 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=404845, ups=0.82, wpb=495842, bsz=16548.6, num_updates=43100, lr=0.000304643, gnorm=0.167, clip=0, loss_scale=2, train_wall=96, gb_free=21.3, wall=40033 epoch 026: 941 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=404845, ups=0.82, wpb=495842, bsz=16548.6, num_updates=43100, lr=0.000304643, gnorm=0.167, clip=0, loss_scale=2, train_wall=96, gb_free=21.3, wall=40033 epoch 026: 941 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=404845, ups=0.82, wpb=495842, bsz=16548.6, num_updates=43100, lr=0.000304643, gnorm=0.167, clip=0, loss_scale=2, train_wall=96, gb_free=21.3, wall=40033 epoch 026: 941 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=404845, ups=0.82, wpb=495842, bsz=16548.6, num_updates=43100, lr=0.000304643, gnorm=0.167, clip=0, loss_scale=2, train_wall=96, gb_free=21.3, wall=40033 epoch 026: 941 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=404845, ups=0.82, wpb=495842, bsz=16548.6, num_updates=43100, lr=0.000304643, gnorm=0.167, clip=0, loss_scale=2, train_wall=96, gb_free=21.3, wall=40033 epoch 026: 941 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=404845, ups=0.82, wpb=495842, bsz=16548.6, num_updates=43100, lr=0.000304643, gnorm=0.167, clip=0, loss_scale=2, train_wall=96, gb_free=21.3, wall=40033 epoch 026: 941 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=404845, ups=0.82, wpb=495842, bsz=16548.6, num_updates=43100, lr=0.000304643, gnorm=0.167, clip=0, loss_scale=2, train_wall=96, gb_free=21.3, wall=40033 epoch 026: 941 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=404845, ups=0.82, wpb=495842, bsz=16548.6, num_updates=43100, lr=0.000304643, gnorm=0.167, clip=0, loss_scale=2, train_wall=96, gb_free=21.3, wall=40033 epoch 026: 941 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=404845, ups=0.82, wpb=495842, bsz=16548.6, num_updates=43100, lr=0.000304643, gnorm=0.167, clip=0, loss_scale=2, train_wall=96, gb_free=21.3, wall=40033 epoch 026: 941 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=404845, ups=0.82, wpb=495842, bsz=16548.6, num_updates=43100, lr=0.000304643, gnorm=0.167, clip=0, loss_scale=2, train_wall=96, gb_free=21.3, wall=40033 epoch 026: 941 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=404845, ups=0.82, wpb=495842, bsz=16548.6, num_updates=43100, lr=0.000304643, gnorm=0.167, clip=0, loss_scale=2, train_wall=96, gb_free=21.3, wall=40033 epoch 026: 941 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=404845, ups=0.82, wpb=495842, bsz=16548.6, num_updates=43100, lr=0.000304643, gnorm=0.167, clip=0, loss_scale=2, train_wall=96, gb_free=21.3, wall=40033 epoch 026: 941 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=404845, ups=0.82, wpb=495842, bsz=16548.6, num_updates=43100, lr=0.000304643, gnorm=0.167, clip=0, loss_scale=2, train_wall=96, gb_free=21.3, wall=40033 epoch 026: 941 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=404845, ups=0.82, wpb=495842, bsz=16548.6, num_updates=43100, lr=0.000304643, gnorm=0.167, clip=0, loss_scale=2, train_wall=96, gb_free=21.3, wall=40033 epoch 026: 941 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=404845, ups=0.82, wpb=495842, bsz=16548.6, num_updates=43100, lr=0.000304643, gnorm=0.167, clip=0, loss_scale=2, train_wall=96, gb_free=21.3, wall=40033 epoch 026: 941 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=404845, ups=0.82, wpb=495842, bsz=16548.6, num_updates=43100, lr=0.000304643, gnorm=0.167, clip=0, loss_scale=2, train_wall=96, gb_free=21.3, wall=40033 epoch 026: 941 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=404845, ups=0.82, wpb=495842, bsz=16548.6, num_updates=43100, lr=0.000304643, gnorm=0.167, clip=0, loss_scale=2, train_wall=96, gb_free=21.3, wall=40033 epoch 026: 941 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=404845, ups=0.82, wpb=495842, bsz=16548.6, num_updates=43100, lr=0.000304643, gnorm=0.167, clip=0, loss_scale=2, train_wall=96, gb_free=21.3, wall=40033 epoch 026: 941 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=404845, ups=0.82, wpb=495842, bsz=16548.6, num_updates=43100, lr=0.000304643, gnorm=0.167, clip=0, loss_scale=2, train_wall=96, gb_free=21.3, wall=40033 epoch 026: 941 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=404845, ups=0.82, wpb=495842, bsz=16548.6, num_updates=43100, lr=0.000304643, gnorm=0.167, clip=0, loss_scale=2, train_wall=96, gb_free=21.3, wall=40033 epoch 026: 941 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=404845, ups=0.82, wpb=495842, bsz=16548.6, num_updates=43100, lr=0.000304643, gnorm=0.167, clip=0, loss_scale=2, train_wall=96, gb_free=21.3, wall=40033 epoch 026: 941 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=404845, ups=0.82, wpb=495842, bsz=16548.6, num_updates=43100, lr=0.000304643, gnorm=0.167, clip=0, loss_scale=2, train_wall=96, gb_free=21.3, wall=40033 epoch 026: 941 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=404845, ups=0.82, wpb=495842, bsz=16548.6, num_updates=43100, lr=0.000304643, gnorm=0.167, clip=0, loss_scale=2, train_wall=96, gb_free=21.3, wall=40033 epoch 026: 941 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=404845, ups=0.82, wpb=495842, bsz=16548.6, num_updates=43100, lr=0.000304643, gnorm=0.167, clip=0, loss_scale=2, train_wall=96, gb_free=21.3, wall=40033 epoch 026: 941 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=404845, ups=0.82, wpb=495842, bsz=16548.6, num_updates=43100, lr=0.000304643, gnorm=0.167, clip=0, loss_scale=2, train_wall=96, gb_free=21.3, wall=40033 epoch 026: 941 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=404845, ups=0.82, wpb=495842, bsz=16548.6, num_updates=43100, lr=0.000304643, gnorm=0.167, clip=0, loss_scale=2, train_wall=96, gb_free=21.3, wall=40033 epoch 026: 1041 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=552474, ups=1.12, wpb=495088, bsz=16314.5, num_updates=43200, lr=0.00030429, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=40123 epoch 026: 1041 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=552474, ups=1.12, wpb=495088, bsz=16314.5, num_updates=43200, lr=0.00030429, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=40123 epoch 026: 1041 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=552474, ups=1.12, wpb=495088, bsz=16314.5, num_updates=43200, lr=0.00030429, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=40123 epoch 026: 1041 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=552474, ups=1.12, wpb=495088, bsz=16314.5, num_updates=43200, lr=0.00030429, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=40123 epoch 026: 1041 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=552474, ups=1.12, wpb=495088, bsz=16314.5, num_updates=43200, lr=0.00030429, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=40123 epoch 026: 1041 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=552474, ups=1.12, wpb=495088, bsz=16314.5, num_updates=43200, lr=0.00030429, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=40123 epoch 026: 1041 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=552474, ups=1.12, wpb=495088, bsz=16314.5, num_updates=43200, lr=0.00030429, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=40123 epoch 026: 1041 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=552474, ups=1.12, wpb=495088, bsz=16314.5, num_updates=43200, lr=0.00030429, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=40123 epoch 026: 1041 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=552474, ups=1.12, wpb=495088, bsz=16314.5, num_updates=43200, lr=0.00030429, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=40123 epoch 026: 1041 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=552474, ups=1.12, wpb=495088, bsz=16314.5, num_updates=43200, lr=0.00030429, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=40123 epoch 026: 1041 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=552474, ups=1.12, wpb=495088, bsz=16314.5, num_updates=43200, lr=0.00030429, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=40123 epoch 026: 1041 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=552474, ups=1.12, wpb=495088, bsz=16314.5, num_updates=43200, lr=0.00030429, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=40123 epoch 026: 1041 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=552474, ups=1.12, wpb=495088, bsz=16314.5, num_updates=43200, lr=0.00030429, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=40123 epoch 026: 1041 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=552474, ups=1.12, wpb=495088, bsz=16314.5, num_updates=43200, lr=0.00030429, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=40123 epoch 026: 1041 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=552474, ups=1.12, wpb=495088, bsz=16314.5, num_updates=43200, lr=0.00030429, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=40123 epoch 026: 1041 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=552474, ups=1.12, wpb=495088, bsz=16314.5, num_updates=43200, lr=0.00030429, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=40123 epoch 026: 1041 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=552474, ups=1.12, wpb=495088, bsz=16314.5, num_updates=43200, lr=0.00030429, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=40123 epoch 026: 1041 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=552474, ups=1.12, wpb=495088, bsz=16314.5, num_updates=43200, lr=0.00030429, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=40123 epoch 026: 1041 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=552474, ups=1.12, wpb=495088, bsz=16314.5, num_updates=43200, lr=0.00030429, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=40123 epoch 026: 1041 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=552474, ups=1.12, wpb=495088, bsz=16314.5, num_updates=43200, lr=0.00030429, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=40123 epoch 026: 1041 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=552474, ups=1.12, wpb=495088, bsz=16314.5, num_updates=43200, lr=0.00030429, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=40123 epoch 026: 1041 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=552474, ups=1.12, wpb=495088, bsz=16314.5, num_updates=43200, lr=0.00030429, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=40123 epoch 026: 1041 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=552474, ups=1.12, wpb=495088, bsz=16314.5, num_updates=43200, lr=0.00030429, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=40123 epoch 026: 1041 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=552474, ups=1.12, wpb=495088, bsz=16314.5, num_updates=43200, lr=0.00030429, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=40123 epoch 026: 1041 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=552474, ups=1.12, wpb=495088, bsz=16314.5, num_updates=43200, lr=0.00030429, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=40123 epoch 026: 1041 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=552474, ups=1.12, wpb=495088, bsz=16314.5, num_updates=43200, lr=0.00030429, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=40123 epoch 026: 1141 / 1689 loss=3.691, nll_loss=2.156, ppl=4.46, wps=549836, ups=1.11, wpb=495857, bsz=16995.5, num_updates=43300, lr=0.000303939, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=40213 epoch 026: 1141 / 1689 loss=3.691, nll_loss=2.156, ppl=4.46, wps=549836, ups=1.11, wpb=495857, bsz=16995.5, num_updates=43300, lr=0.000303939, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=40213 epoch 026: 1141 / 1689 loss=3.691, nll_loss=2.156, ppl=4.46, wps=549836, ups=1.11, wpb=495857, bsz=16995.5, num_updates=43300, lr=0.000303939, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=40213 epoch 026: 1141 / 1689 loss=3.691, nll_loss=2.156, ppl=4.46, wps=549836, ups=1.11, wpb=495857, bsz=16995.5, num_updates=43300, lr=0.000303939, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=40213 epoch 026: 1141 / 1689 loss=3.691, nll_loss=2.156, ppl=4.46, wps=549836, ups=1.11, wpb=495857, bsz=16995.5, num_updates=43300, lr=0.000303939, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=40213 epoch 026: 1141 / 1689 loss=3.691, nll_loss=2.156, ppl=4.46, wps=549836, ups=1.11, wpb=495857, bsz=16995.5, num_updates=43300, lr=0.000303939, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=40213 epoch 026: 1141 / 1689 loss=3.691, nll_loss=2.156, ppl=4.46, wps=549836, ups=1.11, wpb=495857, bsz=16995.5, num_updates=43300, lr=0.000303939, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=40213 epoch 026: 1141 / 1689 loss=3.691, nll_loss=2.156, ppl=4.46, wps=549836, ups=1.11, wpb=495857, bsz=16995.5, num_updates=43300, lr=0.000303939, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=40213 epoch 026: 1141 / 1689 loss=3.691, nll_loss=2.156, ppl=4.46, wps=549836, ups=1.11, wpb=495857, bsz=16995.5, num_updates=43300, lr=0.000303939, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=40213 epoch 026: 1141 / 1689 loss=3.691, nll_loss=2.156, ppl=4.46, wps=549836, ups=1.11, wpb=495857, bsz=16995.5, num_updates=43300, lr=0.000303939, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=40213 epoch 026: 1141 / 1689 loss=3.691, nll_loss=2.156, ppl=4.46, wps=549836, ups=1.11, wpb=495857, bsz=16995.5, num_updates=43300, lr=0.000303939, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=40213 epoch 026: 1141 / 1689 loss=3.691, nll_loss=2.156, ppl=4.46, wps=549836, ups=1.11, wpb=495857, bsz=16995.5, num_updates=43300, lr=0.000303939, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=40213 epoch 026: 1141 / 1689 loss=3.691, nll_loss=2.156, ppl=4.46, wps=549836, ups=1.11, wpb=495857, bsz=16995.5, num_updates=43300, lr=0.000303939, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=40213 epoch 026: 1141 / 1689 loss=3.691, nll_loss=2.156, ppl=4.46, wps=549836, ups=1.11, wpb=495857, bsz=16995.5, num_updates=43300, lr=0.000303939, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=40213 epoch 026: 1141 / 1689 loss=3.691, nll_loss=2.156, ppl=4.46, wps=549836, ups=1.11, wpb=495857, bsz=16995.5, num_updates=43300, lr=0.000303939, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=40213 epoch 026: 1141 / 1689 loss=3.691, nll_loss=2.156, ppl=4.46, wps=549836, ups=1.11, wpb=495857, bsz=16995.5, num_updates=43300, lr=0.000303939, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=40213 epoch 026: 1141 / 1689 loss=3.691, nll_loss=2.156, ppl=4.46, wps=549836, ups=1.11, wpb=495857, bsz=16995.5, num_updates=43300, lr=0.000303939, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=40213 epoch 026: 1141 / 1689 loss=3.691, nll_loss=2.156, ppl=4.46, wps=549836, ups=1.11, wpb=495857, bsz=16995.5, num_updates=43300, lr=0.000303939, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=40213 epoch 026: 1141 / 1689 loss=3.691, nll_loss=2.156, ppl=4.46, wps=549836, ups=1.11, wpb=495857, bsz=16995.5, num_updates=43300, lr=0.000303939, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=40213 epoch 026: 1141 / 1689 loss=3.691, nll_loss=2.156, ppl=4.46, wps=549836, ups=1.11, wpb=495857, bsz=16995.5, num_updates=43300, lr=0.000303939, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=40213 epoch 026: 1141 / 1689 loss=3.691, nll_loss=2.156, ppl=4.46, wps=549836, ups=1.11, wpb=495857, bsz=16995.5, num_updates=43300, lr=0.000303939, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=40213 epoch 026: 1141 / 1689 loss=3.691, nll_loss=2.156, ppl=4.46, wps=549836, ups=1.11, wpb=495857, bsz=16995.5, num_updates=43300, lr=0.000303939, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=40213 epoch 026: 1141 / 1689 loss=3.691, nll_loss=2.156, ppl=4.46, wps=549836, ups=1.11, wpb=495857, bsz=16995.5, num_updates=43300, lr=0.000303939, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=40213 epoch 026: 1141 / 1689 loss=3.691, nll_loss=2.156, ppl=4.46, wps=549836, ups=1.11, wpb=495857, bsz=16995.5, num_updates=43300, lr=0.000303939, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=40213 epoch 026: 1141 / 1689 loss=3.691, nll_loss=2.156, ppl=4.46, wps=549836, ups=1.11, wpb=495857, bsz=16995.5, num_updates=43300, lr=0.000303939, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=40213 epoch 026: 1141 / 1689 loss=3.691, nll_loss=2.156, ppl=4.46, wps=549836, ups=1.11, wpb=495857, bsz=16995.5, num_updates=43300, lr=0.000303939, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=40213 epoch 026: 1241 / 1689 loss=3.693, nll_loss=2.159, ppl=4.46, wps=553210, ups=1.12, wpb=495438, bsz=16234.1, num_updates=43400, lr=0.000303588, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=40303 epoch 026: 1241 / 1689 loss=3.693, nll_loss=2.159, ppl=4.46, wps=553210, ups=1.12, wpb=495438, bsz=16234.1, num_updates=43400, lr=0.000303588, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=40303 epoch 026: 1241 / 1689 loss=3.693, nll_loss=2.159, ppl=4.46, wps=553210, ups=1.12, wpb=495438, bsz=16234.1, num_updates=43400, lr=0.000303588, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=40303 epoch 026: 1241 / 1689 loss=3.693, nll_loss=2.159, ppl=4.46, wps=553210, ups=1.12, wpb=495438, bsz=16234.1, num_updates=43400, lr=0.000303588, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=40303 epoch 026: 1241 / 1689 loss=3.693, nll_loss=2.159, ppl=4.46, wps=553210, ups=1.12, wpb=495438, bsz=16234.1, num_updates=43400, lr=0.000303588, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=40303 epoch 026: 1241 / 1689 loss=3.693, nll_loss=2.159, ppl=4.46, wps=553210, ups=1.12, wpb=495438, bsz=16234.1, num_updates=43400, lr=0.000303588, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=40303 epoch 026: 1241 / 1689 loss=3.693, nll_loss=2.159, ppl=4.46, wps=553210, ups=1.12, wpb=495438, bsz=16234.1, num_updates=43400, lr=0.000303588, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=40303 epoch 026: 1241 / 1689 loss=3.693, nll_loss=2.159, ppl=4.46, wps=553210, ups=1.12, wpb=495438, bsz=16234.1, num_updates=43400, lr=0.000303588, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=40303 epoch 026: 1241 / 1689 loss=3.693, nll_loss=2.159, ppl=4.46, wps=553210, ups=1.12, wpb=495438, bsz=16234.1, num_updates=43400, lr=0.000303588, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=40303 epoch 026: 1241 / 1689 loss=3.693, nll_loss=2.159, ppl=4.46, wps=553210, ups=1.12, wpb=495438, bsz=16234.1, num_updates=43400, lr=0.000303588, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=40303 epoch 026: 1241 / 1689 loss=3.693, nll_loss=2.159, ppl=4.46, wps=553210, ups=1.12, wpb=495438, bsz=16234.1, num_updates=43400, lr=0.000303588, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=40303 epoch 026: 1241 / 1689 loss=3.693, nll_loss=2.159, ppl=4.46, wps=553210, ups=1.12, wpb=495438, bsz=16234.1, num_updates=43400, lr=0.000303588, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=40303 epoch 026: 1241 / 1689 loss=3.693, nll_loss=2.159, ppl=4.46, wps=553210, ups=1.12, wpb=495438, bsz=16234.1, num_updates=43400, lr=0.000303588, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=40303 epoch 026: 1241 / 1689 loss=3.693, nll_loss=2.159, ppl=4.46, wps=553210, ups=1.12, wpb=495438, bsz=16234.1, num_updates=43400, lr=0.000303588, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=40303 epoch 026: 1241 / 1689 loss=3.693, nll_loss=2.159, ppl=4.46, wps=553210, ups=1.12, wpb=495438, bsz=16234.1, num_updates=43400, lr=0.000303588, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=40303 epoch 026: 1241 / 1689 loss=3.693, nll_loss=2.159, ppl=4.46, wps=553210, ups=1.12, wpb=495438, bsz=16234.1, num_updates=43400, lr=0.000303588, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=40303 epoch 026: 1241 / 1689 loss=3.693, nll_loss=2.159, ppl=4.46, wps=553210, ups=1.12, wpb=495438, bsz=16234.1, num_updates=43400, lr=0.000303588, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=40303 epoch 026: 1241 / 1689 loss=3.693, nll_loss=2.159, ppl=4.46, wps=553210, ups=1.12, wpb=495438, bsz=16234.1, num_updates=43400, lr=0.000303588, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=40303 epoch 026: 1241 / 1689 loss=3.693, nll_loss=2.159, ppl=4.46, wps=553210, ups=1.12, wpb=495438, bsz=16234.1, num_updates=43400, lr=0.000303588, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=40303 epoch 026: 1241 / 1689 loss=3.693, nll_loss=2.159, ppl=4.46, wps=553210, ups=1.12, wpb=495438, bsz=16234.1, num_updates=43400, lr=0.000303588, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=40303 epoch 026: 1241 / 1689 loss=3.693, nll_loss=2.159, ppl=4.46, wps=553210, ups=1.12, wpb=495438, bsz=16234.1, num_updates=43400, lr=0.000303588, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=40303 epoch 026: 1241 / 1689 loss=3.693, nll_loss=2.159, ppl=4.46, wps=553210, ups=1.12, wpb=495438, bsz=16234.1, num_updates=43400, lr=0.000303588, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=40303 epoch 026: 1241 / 1689 loss=3.693, nll_loss=2.159, ppl=4.46, wps=553210, ups=1.12, wpb=495438, bsz=16234.1, num_updates=43400, lr=0.000303588, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=40303 epoch 026: 1241 / 1689 loss=3.693, nll_loss=2.159, ppl=4.46, wps=553210, ups=1.12, wpb=495438, bsz=16234.1, num_updates=43400, lr=0.000303588, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=40303 epoch 026: 1241 / 1689 loss=3.693, nll_loss=2.159, ppl=4.46, wps=553210, ups=1.12, wpb=495438, bsz=16234.1, num_updates=43400, lr=0.000303588, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=40303 epoch 026: 1241 / 1689 loss=3.693, nll_loss=2.159, ppl=4.46, wps=553210, ups=1.12, wpb=495438, bsz=16234.1, num_updates=43400, lr=0.000303588, gnorm=0.17, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=40303 epoch 026: 1341 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=550963, ups=1.11, wpb=495567, bsz=16483.4, num_updates=43500, lr=0.000303239, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40392 epoch 026: 1341 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=550963, ups=1.11, wpb=495567, bsz=16483.4, num_updates=43500, lr=0.000303239, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40392 epoch 026: 1341 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=550963, ups=1.11, wpb=495567, bsz=16483.4, num_updates=43500, lr=0.000303239, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40392 epoch 026: 1341 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=550963, ups=1.11, wpb=495567, bsz=16483.4, num_updates=43500, lr=0.000303239, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40392 epoch 026: 1341 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=550963, ups=1.11, wpb=495567, bsz=16483.4, num_updates=43500, lr=0.000303239, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40392 epoch 026: 1341 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=550963, ups=1.11, wpb=495567, bsz=16483.4, num_updates=43500, lr=0.000303239, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40392 epoch 026: 1341 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=550963, ups=1.11, wpb=495567, bsz=16483.4, num_updates=43500, lr=0.000303239, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40392 epoch 026: 1341 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=550963, ups=1.11, wpb=495567, bsz=16483.4, num_updates=43500, lr=0.000303239, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40392 epoch 026: 1341 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=550963, ups=1.11, wpb=495567, bsz=16483.4, num_updates=43500, lr=0.000303239, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40392 epoch 026: 1341 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=550963, ups=1.11, wpb=495567, bsz=16483.4, num_updates=43500, lr=0.000303239, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40392 epoch 026: 1341 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=550963, ups=1.11, wpb=495567, bsz=16483.4, num_updates=43500, lr=0.000303239, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40392 epoch 026: 1341 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=550963, ups=1.11, wpb=495567, bsz=16483.4, num_updates=43500, lr=0.000303239, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40392 epoch 026: 1341 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=550963, ups=1.11, wpb=495567, bsz=16483.4, num_updates=43500, lr=0.000303239, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40392 epoch 026: 1341 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=550963, ups=1.11, wpb=495567, bsz=16483.4, num_updates=43500, lr=0.000303239, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40392 epoch 026: 1341 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=550963, ups=1.11, wpb=495567, bsz=16483.4, num_updates=43500, lr=0.000303239, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40392 epoch 026: 1341 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=550963, ups=1.11, wpb=495567, bsz=16483.4, num_updates=43500, lr=0.000303239, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40392 epoch 026: 1341 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=550963, ups=1.11, wpb=495567, bsz=16483.4, num_updates=43500, lr=0.000303239, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40392 epoch 026: 1341 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=550963, ups=1.11, wpb=495567, bsz=16483.4, num_updates=43500, lr=0.000303239, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40392 epoch 026: 1341 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=550963, ups=1.11, wpb=495567, bsz=16483.4, num_updates=43500, lr=0.000303239, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40392 epoch 026: 1341 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=550963, ups=1.11, wpb=495567, bsz=16483.4, num_updates=43500, lr=0.000303239, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40392 epoch 026: 1341 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=550963, ups=1.11, wpb=495567, bsz=16483.4, num_updates=43500, lr=0.000303239, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40392 epoch 026: 1341 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=550963, ups=1.11, wpb=495567, bsz=16483.4, num_updates=43500, lr=0.000303239, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40392 epoch 026: 1341 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=550963, ups=1.11, wpb=495567, bsz=16483.4, num_updates=43500, lr=0.000303239, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40392 epoch 026: 1341 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=550963, ups=1.11, wpb=495567, bsz=16483.4, num_updates=43500, lr=0.000303239, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40392 epoch 026: 1341 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=550963, ups=1.11, wpb=495567, bsz=16483.4, num_updates=43500, lr=0.000303239, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40392 epoch 026: 1341 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=550963, ups=1.11, wpb=495567, bsz=16483.4, num_updates=43500, lr=0.000303239, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=40392 epoch 026: 1441 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=552422, ups=1.11, wpb=495576, bsz=16606, num_updates=43600, lr=0.000302891, gnorm=0.168, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=40482 epoch 026: 1441 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=552422, ups=1.11, wpb=495576, bsz=16606, num_updates=43600, lr=0.000302891, gnorm=0.168, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=40482 epoch 026: 1441 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=552422, ups=1.11, wpb=495576, bsz=16606, num_updates=43600, lr=0.000302891, gnorm=0.168, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=40482 epoch 026: 1441 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=552422, ups=1.11, wpb=495576, bsz=16606, num_updates=43600, lr=0.000302891, gnorm=0.168, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=40482 epoch 026: 1441 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=552422, ups=1.11, wpb=495576, bsz=16606, num_updates=43600, lr=0.000302891, gnorm=0.168, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=40482 epoch 026: 1441 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=552422, ups=1.11, wpb=495576, bsz=16606, num_updates=43600, lr=0.000302891, gnorm=0.168, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=40482 epoch 026: 1441 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=552422, ups=1.11, wpb=495576, bsz=16606, num_updates=43600, lr=0.000302891, gnorm=0.168, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=40482 epoch 026: 1441 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=552422, ups=1.11, wpb=495576, bsz=16606, num_updates=43600, lr=0.000302891, gnorm=0.168, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=40482 epoch 026: 1441 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=552422, ups=1.11, wpb=495576, bsz=16606, num_updates=43600, lr=0.000302891, gnorm=0.168, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=40482 epoch 026: 1441 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=552422, ups=1.11, wpb=495576, bsz=16606, num_updates=43600, lr=0.000302891, gnorm=0.168, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=40482 epoch 026: 1441 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=552422, ups=1.11, wpb=495576, bsz=16606, num_updates=43600, lr=0.000302891, gnorm=0.168, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=40482 epoch 026: 1441 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=552422, ups=1.11, wpb=495576, bsz=16606, num_updates=43600, lr=0.000302891, gnorm=0.168, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=40482 epoch 026: 1441 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=552422, ups=1.11, wpb=495576, bsz=16606, num_updates=43600, lr=0.000302891, gnorm=0.168, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=40482 epoch 026: 1441 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=552422, ups=1.11, wpb=495576, bsz=16606, num_updates=43600, lr=0.000302891, gnorm=0.168, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=40482 epoch 026: 1441 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=552422, ups=1.11, wpb=495576, bsz=16606, num_updates=43600, lr=0.000302891, gnorm=0.168, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=40482 epoch 026: 1441 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=552422, ups=1.11, wpb=495576, bsz=16606, num_updates=43600, lr=0.000302891, gnorm=0.168, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=40482 epoch 026: 1441 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=552422, ups=1.11, wpb=495576, bsz=16606, num_updates=43600, lr=0.000302891, gnorm=0.168, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=40482 epoch 026: 1441 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=552422, ups=1.11, wpb=495576, bsz=16606, num_updates=43600, lr=0.000302891, gnorm=0.168, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=40482 epoch 026: 1441 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=552422, ups=1.11, wpb=495576, bsz=16606, num_updates=43600, lr=0.000302891, gnorm=0.168, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=40482 epoch 026: 1441 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=552422, ups=1.11, wpb=495576, bsz=16606, num_updates=43600, lr=0.000302891, gnorm=0.168, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=40482 epoch 026: 1441 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=552422, ups=1.11, wpb=495576, bsz=16606, num_updates=43600, lr=0.000302891, gnorm=0.168, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=40482 epoch 026: 1441 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=552422, ups=1.11, wpb=495576, bsz=16606, num_updates=43600, lr=0.000302891, gnorm=0.168, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=40482 epoch 026: 1441 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=552422, ups=1.11, wpb=495576, bsz=16606, num_updates=43600, lr=0.000302891, gnorm=0.168, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=40482 epoch 026: 1441 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=552422, ups=1.11, wpb=495576, bsz=16606, num_updates=43600, lr=0.000302891, gnorm=0.168, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=40482 epoch 026: 1441 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=552422, ups=1.11, wpb=495576, bsz=16606, num_updates=43600, lr=0.000302891, gnorm=0.168, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=40482 epoch 026: 1441 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=552422, ups=1.11, wpb=495576, bsz=16606, num_updates=43600, lr=0.000302891, gnorm=0.168, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=40482 epoch 026: 1541 / 1689 loss=3.697, nll_loss=2.162, ppl=4.48, wps=547424, ups=1.11, wpb=494680, bsz=16358.2, num_updates=43700, lr=0.000302545, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=40573 epoch 026: 1541 / 1689 loss=3.697, nll_loss=2.162, ppl=4.48, wps=547424, ups=1.11, wpb=494680, bsz=16358.2, num_updates=43700, lr=0.000302545, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=40573 epoch 026: 1541 / 1689 loss=3.697, nll_loss=2.162, ppl=4.48, wps=547424, ups=1.11, wpb=494680, bsz=16358.2, num_updates=43700, lr=0.000302545, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=40573 epoch 026: 1541 / 1689 loss=3.697, nll_loss=2.162, ppl=4.48, wps=547424, ups=1.11, wpb=494680, bsz=16358.2, num_updates=43700, lr=0.000302545, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=40573 epoch 026: 1541 / 1689 loss=3.697, nll_loss=2.162, ppl=4.48, wps=547424, ups=1.11, wpb=494680, bsz=16358.2, num_updates=43700, lr=0.000302545, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=40573 epoch 026: 1541 / 1689 loss=3.697, nll_loss=2.162, ppl=4.48, wps=547424, ups=1.11, wpb=494680, bsz=16358.2, num_updates=43700, lr=0.000302545, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=40573 epoch 026: 1541 / 1689 loss=3.697, nll_loss=2.162, ppl=4.48, wps=547424, ups=1.11, wpb=494680, bsz=16358.2, num_updates=43700, lr=0.000302545, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=40573 epoch 026: 1541 / 1689 loss=3.697, nll_loss=2.162, ppl=4.48, wps=547424, ups=1.11, wpb=494680, bsz=16358.2, num_updates=43700, lr=0.000302545, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=40573 epoch 026: 1541 / 1689 loss=3.697, nll_loss=2.162, ppl=4.48, wps=547424, ups=1.11, wpb=494680, bsz=16358.2, num_updates=43700, lr=0.000302545, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=40573 epoch 026: 1541 / 1689 loss=3.697, nll_loss=2.162, ppl=4.48, wps=547424, ups=1.11, wpb=494680, bsz=16358.2, num_updates=43700, lr=0.000302545, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=40573 epoch 026: 1541 / 1689 loss=3.697, nll_loss=2.162, ppl=4.48, wps=547424, ups=1.11, wpb=494680, bsz=16358.2, num_updates=43700, lr=0.000302545, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=40573 epoch 026: 1541 / 1689 loss=3.697, nll_loss=2.162, ppl=4.48, wps=547424, ups=1.11, wpb=494680, bsz=16358.2, num_updates=43700, lr=0.000302545, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=40573 epoch 026: 1541 / 1689 loss=3.697, nll_loss=2.162, ppl=4.48, wps=547424, ups=1.11, wpb=494680, bsz=16358.2, num_updates=43700, lr=0.000302545, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=40573 epoch 026: 1541 / 1689 loss=3.697, nll_loss=2.162, ppl=4.48, wps=547424, ups=1.11, wpb=494680, bsz=16358.2, num_updates=43700, lr=0.000302545, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=40573 epoch 026: 1541 / 1689 loss=3.697, nll_loss=2.162, ppl=4.48, wps=547424, ups=1.11, wpb=494680, bsz=16358.2, num_updates=43700, lr=0.000302545, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=40573 epoch 026: 1541 / 1689 loss=3.697, nll_loss=2.162, ppl=4.48, wps=547424, ups=1.11, wpb=494680, bsz=16358.2, num_updates=43700, lr=0.000302545, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=40573 epoch 026: 1541 / 1689 loss=3.697, nll_loss=2.162, ppl=4.48, wps=547424, ups=1.11, wpb=494680, bsz=16358.2, num_updates=43700, lr=0.000302545, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=40573 epoch 026: 1541 / 1689 loss=3.697, nll_loss=2.162, ppl=4.48, wps=547424, ups=1.11, wpb=494680, bsz=16358.2, num_updates=43700, lr=0.000302545, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=40573 epoch 026: 1541 / 1689 loss=3.697, nll_loss=2.162, ppl=4.48, wps=547424, ups=1.11, wpb=494680, bsz=16358.2, num_updates=43700, lr=0.000302545, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=40573 epoch 026: 1541 / 1689 loss=3.697, nll_loss=2.162, ppl=4.48, wps=547424, ups=1.11, wpb=494680, bsz=16358.2, num_updates=43700, lr=0.000302545, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=40573 epoch 026: 1541 / 1689 loss=3.697, nll_loss=2.162, ppl=4.48, wps=547424, ups=1.11, wpb=494680, bsz=16358.2, num_updates=43700, lr=0.000302545, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=40573 epoch 026: 1541 / 1689 loss=3.697, nll_loss=2.162, ppl=4.48, wps=547424, ups=1.11, wpb=494680, bsz=16358.2, num_updates=43700, lr=0.000302545, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=40573 epoch 026: 1541 / 1689 loss=3.697, nll_loss=2.162, ppl=4.48, wps=547424, ups=1.11, wpb=494680, bsz=16358.2, num_updates=43700, lr=0.000302545, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=40573 epoch 026: 1541 / 1689 loss=3.697, nll_loss=2.162, ppl=4.48, wps=547424, ups=1.11, wpb=494680, bsz=16358.2, num_updates=43700, lr=0.000302545, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=40573 epoch 026: 1541 / 1689 loss=3.697, nll_loss=2.162, ppl=4.48, wps=547424, ups=1.11, wpb=494680, bsz=16358.2, num_updates=43700, lr=0.000302545, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=40573 epoch 026: 1541 / 1689 loss=3.697, nll_loss=2.162, ppl=4.48, wps=547424, ups=1.11, wpb=494680, bsz=16358.2, num_updates=43700, lr=0.000302545, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=40573 epoch 026: 1641 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=546185, ups=1.1, wpb=495303, bsz=16472.3, num_updates=43800, lr=0.000302199, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=40663 epoch 026: 1641 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=546185, ups=1.1, wpb=495303, bsz=16472.3, num_updates=43800, lr=0.000302199, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=40663 epoch 026: 1641 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=546185, ups=1.1, wpb=495303, bsz=16472.3, num_updates=43800, lr=0.000302199, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=40663 epoch 026: 1641 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=546185, ups=1.1, wpb=495303, bsz=16472.3, num_updates=43800, lr=0.000302199, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=40663 epoch 026: 1641 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=546185, ups=1.1, wpb=495303, bsz=16472.3, num_updates=43800, lr=0.000302199, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=40663 epoch 026: 1641 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=546185, ups=1.1, wpb=495303, bsz=16472.3, num_updates=43800, lr=0.000302199, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=40663 epoch 026: 1641 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=546185, ups=1.1, wpb=495303, bsz=16472.3, num_updates=43800, lr=0.000302199, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=40663 epoch 026: 1641 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=546185, ups=1.1, wpb=495303, bsz=16472.3, num_updates=43800, lr=0.000302199, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=40663 epoch 026: 1641 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=546185, ups=1.1, wpb=495303, bsz=16472.3, num_updates=43800, lr=0.000302199, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=40663 epoch 026: 1641 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=546185, ups=1.1, wpb=495303, bsz=16472.3, num_updates=43800, lr=0.000302199, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=40663 epoch 026: 1641 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=546185, ups=1.1, wpb=495303, bsz=16472.3, num_updates=43800, lr=0.000302199, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=40663 epoch 026: 1641 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=546185, ups=1.1, wpb=495303, bsz=16472.3, num_updates=43800, lr=0.000302199, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=40663 epoch 026: 1641 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=546185, ups=1.1, wpb=495303, bsz=16472.3, num_updates=43800, lr=0.000302199, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=40663 epoch 026: 1641 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=546185, ups=1.1, wpb=495303, bsz=16472.3, num_updates=43800, lr=0.000302199, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=40663 epoch 026: 1641 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=546185, ups=1.1, wpb=495303, bsz=16472.3, num_updates=43800, lr=0.000302199, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=40663 epoch 026: 1641 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=546185, ups=1.1, wpb=495303, bsz=16472.3, num_updates=43800, lr=0.000302199, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=40663 epoch 026: 1641 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=546185, ups=1.1, wpb=495303, bsz=16472.3, num_updates=43800, lr=0.000302199, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=40663 epoch 026: 1641 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=546185, ups=1.1, wpb=495303, bsz=16472.3, num_updates=43800, lr=0.000302199, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=40663 epoch 026: 1641 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=546185, ups=1.1, wpb=495303, bsz=16472.3, num_updates=43800, lr=0.000302199, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=40663 epoch 026: 1641 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=546185, ups=1.1, wpb=495303, bsz=16472.3, num_updates=43800, lr=0.000302199, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=40663 epoch 026: 1641 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=546185, ups=1.1, wpb=495303, bsz=16472.3, num_updates=43800, lr=0.000302199, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=40663 epoch 026: 1641 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=546185, ups=1.1, wpb=495303, bsz=16472.3, num_updates=43800, lr=0.000302199, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=40663 epoch 026: 1641 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=546185, ups=1.1, wpb=495303, bsz=16472.3, num_updates=43800, lr=0.000302199, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=40663 epoch 026: 1641 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=546185, ups=1.1, wpb=495303, bsz=16472.3, num_updates=43800, lr=0.000302199, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=40663 epoch 026: 1641 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=546185, ups=1.1, wpb=495303, bsz=16472.3, num_updates=43800, lr=0.000302199, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=40663 epoch 026: 1641 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=546185, ups=1.1, wpb=495303, bsz=16472.3, num_updates=43800, lr=0.000302199, gnorm=0.173, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=40663 end of epoch 26 (average epoch stats below) epoch 026 | loss 3.689 | nll_loss 2.153 | ppl 4.45 | wps 537470 | ups 1.09 | wpb 495107 | bsz 16507.8 | num_updates 43848 | lr 0.000302033 | gnorm 0.17 | clip 0 | loss_scale 4 | train_wall 1501 | gb_free 22.8 | wall 40706 epoch 026 | loss 3.689 | nll_loss 2.153 | ppl 4.45 | wps 537470 | ups 1.09 | wpb 495107 | bsz 16507.8 | num_updates 43848 | lr 0.000302033 | gnorm 0.17 | clip 0 | loss_scale 4 | train_wall 1501 | gb_free 22.8 | wall 40706 epoch 026 | loss 3.689 | nll_loss 2.153 | ppl 4.45 | wps 537470 | ups 1.09 | wpb 495107 | bsz 16507.8 | num_updates 43848 | lr 0.000302033 | gnorm 0.17 | clip 0 | loss_scale 4 | train_wall 1501 | gb_free 22.8 | wall 40706 epoch 026 | loss 3.689 | nll_loss 2.153 | ppl 4.45 | wps 537470 | ups 1.09 | wpb 495107 | bsz 16507.8 | num_updates 43848 | lr 0.000302033 | gnorm 0.17 | clip 0 | loss_scale 4 | train_wall 1501 | gb_free 22.8 | wall 40706 epoch 026 | loss 3.689 | nll_loss 2.153 | ppl 4.45 | wps 537470 | ups 1.09 | wpb 495107 | bsz 16507.8 | num_updates 43848 | lr 0.000302033 | gnorm 0.17 | clip 0 | loss_scale 4 | train_wall 1501 | gb_free 22.8 | wall 40706 epoch 026 | loss 3.689 | nll_loss 2.153 | ppl 4.45 | wps 537470 | ups 1.09 | wpb 495107 | bsz 16507.8 | num_updates 43848 | lr 0.000302033 | gnorm 0.17 | clip 0 | loss_scale 4 | train_wall 1501 | gb_free 22.8 | wall 40706 epoch 026 | loss 3.689 | nll_loss 2.153 | ppl 4.45 | wps 537470 | ups 1.09 | wpb 495107 | bsz 16507.8 | num_updates 43848 | lr 0.000302033 | gnorm 0.17 | clip 0 | loss_scale 4 | train_wall 1501 | gb_free 22.8 | wall 40706 epoch 026 | loss 3.689 | nll_loss 2.153 | ppl 4.45 | wps 537470 | ups 1.09 | wpb 495107 | bsz 16507.8 | num_updates 43848 | lr 0.000302033 | gnorm 0.17 | clip 0 | loss_scale 4 | train_wall 1501 | gb_free 22.8 | wall 40706 epoch 026 | loss 3.689 | nll_loss 2.153 | ppl 4.45 | wps 537470 | ups 1.09 | wpb 495107 | bsz 16507.8 | num_updates 43848 | lr 0.000302033 | gnorm 0.17 | clip 0 | loss_scale 4 | train_wall 1501 | gb_free 22.8 | wall 40706 epoch 026 | loss 3.689 | nll_loss 2.153 | ppl 4.45 | wps 537470 | ups 1.09 | wpb 495107 | bsz 16507.8 | num_updates 43848 | lr 0.000302033 | gnorm 0.17 | clip 0 | loss_scale 4 | train_wall 1501 | gb_free 22.8 | wall 40706 epoch 026 | loss 3.689 | nll_loss 2.153 | ppl 4.45 | wps 537470 | ups 1.09 | wpb 495107 | bsz 16507.8 | num_updates 43848 | lr 0.000302033 | gnorm 0.17 | clip 0 | loss_scale 4 | train_wall 1501 | gb_free 22.8 | wall 40706 epoch 026 | loss 3.689 | nll_loss 2.153 | ppl 4.45 | wps 537470 | ups 1.09 | wpb 495107 | bsz 16507.8 | num_updates 43848 | lr 0.000302033 | gnorm 0.17 | clip 0 | loss_scale 4 | train_wall 1501 | gb_free 22.8 | wall 40706 epoch 026 | loss 3.689 | nll_loss 2.153 | ppl 4.45 | wps 537470 | ups 1.09 | wpb 495107 | bsz 16507.8 | num_updates 43848 | lr 0.000302033 | gnorm 0.17 | clip 0 | loss_scale 4 | train_wall 1501 | gb_free 22.8 | wall 40706 epoch 026 | loss 3.689 | nll_loss 2.153 | ppl 4.45 | wps 537470 | ups 1.09 | wpb 495107 | bsz 16507.8 | num_updates 43848 | lr 0.000302033 | gnorm 0.17 | clip 0 | loss_scale 4 | train_wall 1501 | gb_free 22.8 | wall 40706 epoch 026 | loss 3.689 | nll_loss 2.153 | ppl 4.45 | wps 537470 | ups 1.09 | wpb 495107 | bsz 16507.8 | num_updates 43848 | lr 0.000302033 | gnorm 0.17 | clip 0 | loss_scale 4 | train_wall 1501 | gb_free 22.8 | wall 40706 epoch 026 | loss 3.689 | nll_loss 2.153 | ppl 4.45 | wps 537470 | ups 1.09 | wpb 495107 | bsz 16507.8 | num_updates 43848 | lr 0.000302033 | gnorm 0.17 | clip 0 | loss_scale 4 | train_wall 1501 | gb_free 22.8 | wall 40706 epoch 026 | loss 3.689 | nll_loss 2.153 | ppl 4.45 | wps 537470 | ups 1.09 | wpb 495107 | bsz 16507.8 | num_updates 43848 | lr 0.000302033 | gnorm 0.17 | clip 0 | loss_scale 4 | train_wall 1501 | gb_free 22.8 | wall 40706 epoch 026 | loss 3.689 | nll_loss 2.153 | ppl 4.45 | wps 537470 | ups 1.09 | wpb 495107 | bsz 16507.8 | num_updates 43848 | lr 0.000302033 | gnorm 0.17 | clip 0 | loss_scale 4 | train_wall 1501 | gb_free 22.8 | wall 40706 epoch 026 | loss 3.689 | nll_loss 2.153 | ppl 4.45 | wps 537470 | ups 1.09 | wpb 495107 | bsz 16507.8 | num_updates 43848 | lr 0.000302033 | gnorm 0.17 | clip 0 | loss_scale 4 | train_wall 1501 | gb_free 22.8 | wall 40706 epoch 026 | loss 3.689 | nll_loss 2.153 | ppl 4.45 | wps 537470 | ups 1.09 | wpb 495107 | bsz 16507.8 | num_updates 43848 | lr 0.000302033 | gnorm 0.17 | clip 0 | loss_scale 4 | train_wall 1501 | gb_free 22.8 | wall 40706 epoch 026 | loss 3.689 | nll_loss 2.153 | ppl 4.45 | wps 537470 | ups 1.09 | wpb 495107 | bsz 16507.8 | num_updates 43848 | lr 0.000302033 | gnorm 0.17 | clip 0 | loss_scale 4 | train_wall 1501 | gb_free 22.8 | wall 40706 epoch 026 | loss 3.689 | nll_loss 2.153 | ppl 4.45 | wps 537470 | ups 1.09 | wpb 495107 | bsz 16507.8 | num_updates 43848 | lr 0.000302033 | gnorm 0.17 | clip 0 | loss_scale 4 | train_wall 1501 | gb_free 22.8 | wall 40706 epoch 026 | loss 3.689 | nll_loss 2.153 | ppl 4.45 | wps 537470 | ups 1.09 | wpb 495107 | bsz 16507.8 | num_updates 43848 | lr 0.000302033 | gnorm 0.17 | clip 0 | loss_scale 4 | train_wall 1501 | gb_free 22.8 | wall 40706 epoch 026 | loss 3.689 | nll_loss 2.153 | ppl 4.45 | wps 537470 | ups 1.09 | wpb 495107 | bsz 16507.8 | num_updates 43848 | lr 0.000302033 | gnorm 0.17 | clip 0 | loss_scale 4 | train_wall 1501 | gb_free 22.8 | wall 40706 epoch 026 | loss 3.689 | nll_loss 2.153 | ppl 4.45 | wps 537470 | ups 1.09 | wpb 495107 | bsz 16507.8 | num_updates 43848 | lr 0.000302033 | gnorm 0.17 | clip 0 | loss_scale 4 | train_wall 1501 | gb_free 22.8 | wall 40706 epoch 026 | loss 3.689 | nll_loss 2.153 | ppl 4.45 | wps 537470 | ups 1.09 | wpb 495107 | bsz 16507.8 | num_updates 43848 | lr 0.000302033 | gnorm 0.17 | clip 0 | loss_scale 4 | train_wall 1501 | gb_free 22.8 | wall 40706 Start iterating over samples epoch 027: 53 / 1689 loss=3.677, nll_loss=2.14, ppl=4.41, wps=531564, ups=1.08, wpb=490307, bsz=16049, num_updates=43900, lr=0.000301855, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=40755 epoch 027: 53 / 1689 loss=3.677, nll_loss=2.14, ppl=4.41, wps=531564, ups=1.08, wpb=490307, bsz=16049, num_updates=43900, lr=0.000301855, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=40755 epoch 027: 53 / 1689 loss=3.677, nll_loss=2.14, ppl=4.41, wps=531564, ups=1.08, wpb=490307, bsz=16049, num_updates=43900, lr=0.000301855, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=40755 epoch 027: 53 / 1689 loss=3.677, nll_loss=2.14, ppl=4.41, wps=531564, ups=1.08, wpb=490307, bsz=16049, num_updates=43900, lr=0.000301855, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=40755 epoch 027: 53 / 1689 loss=3.677, nll_loss=2.14, ppl=4.41, wps=531564, ups=1.08, wpb=490307, bsz=16049, num_updates=43900, lr=0.000301855, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=40755 epoch 027: 53 / 1689 loss=3.677, nll_loss=2.14, ppl=4.41, wps=531564, ups=1.08, wpb=490307, bsz=16049, num_updates=43900, lr=0.000301855, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=40755 epoch 027: 53 / 1689 loss=3.677, nll_loss=2.14, ppl=4.41, wps=531564, ups=1.08, wpb=490307, bsz=16049, num_updates=43900, lr=0.000301855, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=40755 epoch 027: 53 / 1689 loss=3.677, nll_loss=2.14, ppl=4.41, wps=531564, ups=1.08, wpb=490307, bsz=16049, num_updates=43900, lr=0.000301855, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=40755 epoch 027: 53 / 1689 loss=3.677, nll_loss=2.14, ppl=4.41, wps=531564, ups=1.08, wpb=490307, bsz=16049, num_updates=43900, lr=0.000301855, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=40755 epoch 027: 53 / 1689 loss=3.677, nll_loss=2.14, ppl=4.41, wps=531564, ups=1.08, wpb=490307, bsz=16049, num_updates=43900, lr=0.000301855, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=40755 epoch 027: 53 / 1689 loss=3.677, nll_loss=2.14, ppl=4.41, wps=531564, ups=1.08, wpb=490307, bsz=16049, num_updates=43900, lr=0.000301855, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=40755 epoch 027: 53 / 1689 loss=3.677, nll_loss=2.14, ppl=4.41, wps=531564, ups=1.08, wpb=490307, bsz=16049, num_updates=43900, lr=0.000301855, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=40755 epoch 027: 53 / 1689 loss=3.677, nll_loss=2.14, ppl=4.41, wps=531564, ups=1.08, wpb=490307, bsz=16049, num_updates=43900, lr=0.000301855, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=40755 epoch 027: 53 / 1689 loss=3.677, nll_loss=2.14, ppl=4.41, wps=531564, ups=1.08, wpb=490307, bsz=16049, num_updates=43900, lr=0.000301855, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=40755 epoch 027: 53 / 1689 loss=3.677, nll_loss=2.14, ppl=4.41, wps=531564, ups=1.08, wpb=490307, bsz=16049, num_updates=43900, lr=0.000301855, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=40755 epoch 027: 53 / 1689 loss=3.677, nll_loss=2.14, ppl=4.41, wps=531564, ups=1.08, wpb=490307, bsz=16049, num_updates=43900, lr=0.000301855, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=40755 epoch 027: 53 / 1689 loss=3.677, nll_loss=2.14, ppl=4.41, wps=531564, ups=1.08, wpb=490307, bsz=16049, num_updates=43900, lr=0.000301855, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=40755 epoch 027: 53 / 1689 loss=3.677, nll_loss=2.14, ppl=4.41, wps=531564, ups=1.08, wpb=490307, bsz=16049, num_updates=43900, lr=0.000301855, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=40755 epoch 027: 53 / 1689 loss=3.677, nll_loss=2.14, ppl=4.41, wps=531564, ups=1.08, wpb=490307, bsz=16049, num_updates=43900, lr=0.000301855, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=40755 epoch 027: 53 / 1689 loss=3.677, nll_loss=2.14, ppl=4.41, wps=531564, ups=1.08, wpb=490307, bsz=16049, num_updates=43900, lr=0.000301855, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=40755 epoch 027: 53 / 1689 loss=3.677, nll_loss=2.14, ppl=4.41, wps=531564, ups=1.08, wpb=490307, bsz=16049, num_updates=43900, lr=0.000301855, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=40755 epoch 027: 53 / 1689 loss=3.677, nll_loss=2.14, ppl=4.41, wps=531564, ups=1.08, wpb=490307, bsz=16049, num_updates=43900, lr=0.000301855, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=40755 epoch 027: 53 / 1689 loss=3.677, nll_loss=2.14, ppl=4.41, wps=531564, ups=1.08, wpb=490307, bsz=16049, num_updates=43900, lr=0.000301855, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=40755 epoch 027: 53 / 1689 loss=3.677, nll_loss=2.14, ppl=4.41, wps=531564, ups=1.08, wpb=490307, bsz=16049, num_updates=43900, lr=0.000301855, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=40755 epoch 027: 53 / 1689 loss=3.677, nll_loss=2.14, ppl=4.41, wps=531564, ups=1.08, wpb=490307, bsz=16049, num_updates=43900, lr=0.000301855, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=40755 epoch 027: 53 / 1689 loss=3.677, nll_loss=2.14, ppl=4.41, wps=531564, ups=1.08, wpb=490307, bsz=16049, num_updates=43900, lr=0.000301855, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=40755 epoch 027: 53 / 1689 loss=3.677, nll_loss=2.14, ppl=4.41, wps=531564, ups=1.08, wpb=490307, bsz=16049, num_updates=43900, lr=0.000301855, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=40755 epoch 027: 153 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=550309, ups=1.11, wpb=495877, bsz=16485.4, num_updates=44000, lr=0.000301511, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=40846 epoch 027: 153 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=550309, ups=1.11, wpb=495877, bsz=16485.4, num_updates=44000, lr=0.000301511, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=40846 epoch 027: 153 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=550309, ups=1.11, wpb=495877, bsz=16485.4, num_updates=44000, lr=0.000301511, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=40846 epoch 027: 153 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=550309, ups=1.11, wpb=495877, bsz=16485.4, num_updates=44000, lr=0.000301511, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=40846 epoch 027: 153 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=550309, ups=1.11, wpb=495877, bsz=16485.4, num_updates=44000, lr=0.000301511, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=40846 epoch 027: 153 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=550309, ups=1.11, wpb=495877, bsz=16485.4, num_updates=44000, lr=0.000301511, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=40846 epoch 027: 153 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=550309, ups=1.11, wpb=495877, bsz=16485.4, num_updates=44000, lr=0.000301511, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=40846 epoch 027: 153 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=550309, ups=1.11, wpb=495877, bsz=16485.4, num_updates=44000, lr=0.000301511, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=40846 epoch 027: 153 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=550309, ups=1.11, wpb=495877, bsz=16485.4, num_updates=44000, lr=0.000301511, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=40846 epoch 027: 153 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=550309, ups=1.11, wpb=495877, bsz=16485.4, num_updates=44000, lr=0.000301511, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=40846 epoch 027: 153 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=550309, ups=1.11, wpb=495877, bsz=16485.4, num_updates=44000, lr=0.000301511, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=40846 epoch 027: 153 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=550309, ups=1.11, wpb=495877, bsz=16485.4, num_updates=44000, lr=0.000301511, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=40846 epoch 027: 153 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=550309, ups=1.11, wpb=495877, bsz=16485.4, num_updates=44000, lr=0.000301511, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=40846 epoch 027: 153 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=550309, ups=1.11, wpb=495877, bsz=16485.4, num_updates=44000, lr=0.000301511, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=40846 epoch 027: 153 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=550309, ups=1.11, wpb=495877, bsz=16485.4, num_updates=44000, lr=0.000301511, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=40846 epoch 027: 153 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=550309, ups=1.11, wpb=495877, bsz=16485.4, num_updates=44000, lr=0.000301511, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=40846 epoch 027: 153 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=550309, ups=1.11, wpb=495877, bsz=16485.4, num_updates=44000, lr=0.000301511, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=40846 epoch 027: 153 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=550309, ups=1.11, wpb=495877, bsz=16485.4, num_updates=44000, lr=0.000301511, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=40846 epoch 027: 153 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=550309, ups=1.11, wpb=495877, bsz=16485.4, num_updates=44000, lr=0.000301511, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=40846 epoch 027: 153 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=550309, ups=1.11, wpb=495877, bsz=16485.4, num_updates=44000, lr=0.000301511, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=40846 epoch 027: 153 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=550309, ups=1.11, wpb=495877, bsz=16485.4, num_updates=44000, lr=0.000301511, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=40846 epoch 027: 153 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=550309, ups=1.11, wpb=495877, bsz=16485.4, num_updates=44000, lr=0.000301511, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=40846 epoch 027: 153 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=550309, ups=1.11, wpb=495877, bsz=16485.4, num_updates=44000, lr=0.000301511, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=40846 epoch 027: 153 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=550309, ups=1.11, wpb=495877, bsz=16485.4, num_updates=44000, lr=0.000301511, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=40846 epoch 027: 153 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=550309, ups=1.11, wpb=495877, bsz=16485.4, num_updates=44000, lr=0.000301511, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=40846 epoch 027: 153 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=550309, ups=1.11, wpb=495877, bsz=16485.4, num_updates=44000, lr=0.000301511, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=40846 epoch 027: 153 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=550309, ups=1.11, wpb=495877, bsz=16485.4, num_updates=44000, lr=0.000301511, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=40846 begin validation on "valid" subset epoch 027 | valid on 'valid' subset | loss 3.716 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.716 epoch 027 | valid on 'valid' subset | loss 3.716 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.716 epoch 027 | valid on 'valid' subset | loss 3.716 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.716 epoch 027 | valid on 'valid' subset | loss 3.716 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.716 epoch 027 | valid on 'valid' subset | loss 3.716 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.716 epoch 027 | valid on 'valid' subset | loss 3.716 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.716 epoch 027 | valid on 'valid' subset | loss 3.716 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.716 epoch 027 | valid on 'valid' subset | loss 3.716 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.716 epoch 027 | valid on 'valid' subset | loss 3.716 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.716 epoch 027 | valid on 'valid' subset | loss 3.716 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.716 epoch 027 | valid on 'valid' subset | loss 3.716 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.716 epoch 027 | valid on 'valid' subset | loss 3.716 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.716 epoch 027 | valid on 'valid' subset | loss 3.716 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.716 epoch 027 | valid on 'valid' subset | loss 3.716 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.716 epoch 027 | valid on 'valid' subset | loss 3.716 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.716 epoch 027 | valid on 'valid' subset | loss 3.716 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.716 epoch 027 | valid on 'valid' subset | loss 3.716 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.716 epoch 027 | valid on 'valid' subset | loss 3.716 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.716 epoch 027 | valid on 'valid' subset | loss 3.716 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.716 epoch 027 | valid on 'valid' subset | loss 3.716 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.716 epoch 027 | valid on 'valid' subset | loss 3.716 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.716 epoch 027 | valid on 'valid' subset | loss 3.716 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.716 epoch 027 | valid on 'valid' subset | loss 3.716 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.716 epoch 027 | valid on 'valid' subset | loss 3.716 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.716 epoch 027 | valid on 'valid' subset | loss 3.716 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.716 epoch 027 | valid on 'valid' subset | loss 3.716 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.716 epoch 027 | valid on 'valid' subset | loss 3.716 | nll_loss 2.156 | ppl 4.46 | wps 0 | wpb 44526 | bsz 2008 | num_updates 44000 | best_loss 3.716 epoch 027: 254 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=453986, ups=0.92, wpb=495924, bsz=16078.3, num_updates=44100, lr=0.000301169, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=19.5, wall=40955 epoch 027: 254 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=453986, ups=0.92, wpb=495924, bsz=16078.3, num_updates=44100, lr=0.000301169, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=19.5, wall=40955 epoch 027: 254 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=453986, ups=0.92, wpb=495924, bsz=16078.3, num_updates=44100, lr=0.000301169, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=19.5, wall=40955 epoch 027: 254 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=453986, ups=0.92, wpb=495924, bsz=16078.3, num_updates=44100, lr=0.000301169, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=19.5, wall=40955 epoch 027: 254 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=453986, ups=0.92, wpb=495924, bsz=16078.3, num_updates=44100, lr=0.000301169, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=19.5, wall=40955 epoch 027: 254 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=453986, ups=0.92, wpb=495924, bsz=16078.3, num_updates=44100, lr=0.000301169, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=19.5, wall=40955 epoch 027: 254 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=453986, ups=0.92, wpb=495924, bsz=16078.3, num_updates=44100, lr=0.000301169, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=19.5, wall=40955 epoch 027: 254 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=453986, ups=0.92, wpb=495924, bsz=16078.3, num_updates=44100, lr=0.000301169, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=19.5, wall=40955 epoch 027: 254 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=453986, ups=0.92, wpb=495924, bsz=16078.3, num_updates=44100, lr=0.000301169, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=19.5, wall=40955 epoch 027: 254 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=453986, ups=0.92, wpb=495924, bsz=16078.3, num_updates=44100, lr=0.000301169, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=19.5, wall=40955 epoch 027: 254 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=453986, ups=0.92, wpb=495924, bsz=16078.3, num_updates=44100, lr=0.000301169, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=19.5, wall=40955 epoch 027: 254 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=453986, ups=0.92, wpb=495924, bsz=16078.3, num_updates=44100, lr=0.000301169, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=19.5, wall=40955 epoch 027: 254 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=453986, ups=0.92, wpb=495924, bsz=16078.3, num_updates=44100, lr=0.000301169, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=19.5, wall=40955 epoch 027: 254 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=453986, ups=0.92, wpb=495924, bsz=16078.3, num_updates=44100, lr=0.000301169, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=19.5, wall=40955 epoch 027: 254 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=453986, ups=0.92, wpb=495924, bsz=16078.3, num_updates=44100, lr=0.000301169, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=19.5, wall=40955 epoch 027: 254 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=453986, ups=0.92, wpb=495924, bsz=16078.3, num_updates=44100, lr=0.000301169, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=19.5, wall=40955 epoch 027: 254 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=453986, ups=0.92, wpb=495924, bsz=16078.3, num_updates=44100, lr=0.000301169, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=19.5, wall=40955 epoch 027: 254 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=453986, ups=0.92, wpb=495924, bsz=16078.3, num_updates=44100, lr=0.000301169, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=19.5, wall=40955 epoch 027: 254 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=453986, ups=0.92, wpb=495924, bsz=16078.3, num_updates=44100, lr=0.000301169, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=19.5, wall=40955 epoch 027: 254 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=453986, ups=0.92, wpb=495924, bsz=16078.3, num_updates=44100, lr=0.000301169, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=19.5, wall=40955 epoch 027: 254 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=453986, ups=0.92, wpb=495924, bsz=16078.3, num_updates=44100, lr=0.000301169, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=19.5, wall=40955 epoch 027: 254 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=453986, ups=0.92, wpb=495924, bsz=16078.3, num_updates=44100, lr=0.000301169, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=19.5, wall=40955 epoch 027: 254 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=453986, ups=0.92, wpb=495924, bsz=16078.3, num_updates=44100, lr=0.000301169, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=19.5, wall=40955 epoch 027: 254 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=453986, ups=0.92, wpb=495924, bsz=16078.3, num_updates=44100, lr=0.000301169, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=19.5, wall=40955 epoch 027: 254 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=453986, ups=0.92, wpb=495924, bsz=16078.3, num_updates=44100, lr=0.000301169, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=19.5, wall=40955 epoch 027: 254 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=453986, ups=0.92, wpb=495924, bsz=16078.3, num_updates=44100, lr=0.000301169, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=19.5, wall=40955 epoch 027: 254 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=453986, ups=0.92, wpb=495924, bsz=16078.3, num_updates=44100, lr=0.000301169, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=19.5, wall=40955 epoch 027: 354 / 1689 loss=3.678, nll_loss=2.141, ppl=4.41, wps=549672, ups=1.11, wpb=495599, bsz=16519.5, num_updates=44200, lr=0.000300828, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=41045 epoch 027: 354 / 1689 loss=3.678, nll_loss=2.141, ppl=4.41, wps=549672, ups=1.11, wpb=495599, bsz=16519.5, num_updates=44200, lr=0.000300828, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=41045 epoch 027: 354 / 1689 loss=3.678, nll_loss=2.141, ppl=4.41, wps=549672, ups=1.11, wpb=495599, bsz=16519.5, num_updates=44200, lr=0.000300828, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=41045 epoch 027: 354 / 1689 loss=3.678, nll_loss=2.141, ppl=4.41, wps=549672, ups=1.11, wpb=495599, bsz=16519.5, num_updates=44200, lr=0.000300828, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=41045 epoch 027: 354 / 1689 loss=3.678, nll_loss=2.141, ppl=4.41, wps=549672, ups=1.11, wpb=495599, bsz=16519.5, num_updates=44200, lr=0.000300828, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=41045 epoch 027: 354 / 1689 loss=3.678, nll_loss=2.141, ppl=4.41, wps=549672, ups=1.11, wpb=495599, bsz=16519.5, num_updates=44200, lr=0.000300828, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=41045 epoch 027: 354 / 1689 loss=3.678, nll_loss=2.141, ppl=4.41, wps=549672, ups=1.11, wpb=495599, bsz=16519.5, num_updates=44200, lr=0.000300828, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=41045 epoch 027: 354 / 1689 loss=3.678, nll_loss=2.141, ppl=4.41, wps=549672, ups=1.11, wpb=495599, bsz=16519.5, num_updates=44200, lr=0.000300828, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=41045 epoch 027: 354 / 1689 loss=3.678, nll_loss=2.141, ppl=4.41, wps=549672, ups=1.11, wpb=495599, bsz=16519.5, num_updates=44200, lr=0.000300828, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=41045 epoch 027: 354 / 1689 loss=3.678, nll_loss=2.141, ppl=4.41, wps=549672, ups=1.11, wpb=495599, bsz=16519.5, num_updates=44200, lr=0.000300828, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=41045 epoch 027: 354 / 1689 loss=3.678, nll_loss=2.141, ppl=4.41, wps=549672, ups=1.11, wpb=495599, bsz=16519.5, num_updates=44200, lr=0.000300828, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=41045 epoch 027: 354 / 1689 loss=3.678, nll_loss=2.141, ppl=4.41, wps=549672, ups=1.11, wpb=495599, bsz=16519.5, num_updates=44200, lr=0.000300828, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=41045 epoch 027: 354 / 1689 loss=3.678, nll_loss=2.141, ppl=4.41, wps=549672, ups=1.11, wpb=495599, bsz=16519.5, num_updates=44200, lr=0.000300828, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=41045 epoch 027: 354 / 1689 loss=3.678, nll_loss=2.141, ppl=4.41, wps=549672, ups=1.11, wpb=495599, bsz=16519.5, num_updates=44200, lr=0.000300828, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=41045 epoch 027: 354 / 1689 loss=3.678, nll_loss=2.141, ppl=4.41, wps=549672, ups=1.11, wpb=495599, bsz=16519.5, num_updates=44200, lr=0.000300828, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=41045 epoch 027: 354 / 1689 loss=3.678, nll_loss=2.141, ppl=4.41, wps=549672, ups=1.11, wpb=495599, bsz=16519.5, num_updates=44200, lr=0.000300828, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=41045 epoch 027: 354 / 1689 loss=3.678, nll_loss=2.141, ppl=4.41, wps=549672, ups=1.11, wpb=495599, bsz=16519.5, num_updates=44200, lr=0.000300828, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=41045 epoch 027: 354 / 1689 loss=3.678, nll_loss=2.141, ppl=4.41, wps=549672, ups=1.11, wpb=495599, bsz=16519.5, num_updates=44200, lr=0.000300828, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=41045 epoch 027: 354 / 1689 loss=3.678, nll_loss=2.141, ppl=4.41, wps=549672, ups=1.11, wpb=495599, bsz=16519.5, num_updates=44200, lr=0.000300828, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=41045 epoch 027: 354 / 1689 loss=3.678, nll_loss=2.141, ppl=4.41, wps=549672, ups=1.11, wpb=495599, bsz=16519.5, num_updates=44200, lr=0.000300828, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=41045 epoch 027: 354 / 1689 loss=3.678, nll_loss=2.141, ppl=4.41, wps=549672, ups=1.11, wpb=495599, bsz=16519.5, num_updates=44200, lr=0.000300828, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=41045 epoch 027: 354 / 1689 loss=3.678, nll_loss=2.141, ppl=4.41, wps=549672, ups=1.11, wpb=495599, bsz=16519.5, num_updates=44200, lr=0.000300828, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=41045 epoch 027: 354 / 1689 loss=3.678, nll_loss=2.141, ppl=4.41, wps=549672, ups=1.11, wpb=495599, bsz=16519.5, num_updates=44200, lr=0.000300828, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=41045 epoch 027: 354 / 1689 loss=3.678, nll_loss=2.141, ppl=4.41, wps=549672, ups=1.11, wpb=495599, bsz=16519.5, num_updates=44200, lr=0.000300828, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=41045 epoch 027: 354 / 1689 loss=3.678, nll_loss=2.141, ppl=4.41, wps=549672, ups=1.11, wpb=495599, bsz=16519.5, num_updates=44200, lr=0.000300828, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=41045 epoch 027: 354 / 1689 loss=3.678, nll_loss=2.141, ppl=4.41, wps=549672, ups=1.11, wpb=495599, bsz=16519.5, num_updates=44200, lr=0.000300828, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=41045 epoch 027: 354 / 1689 loss=3.678, nll_loss=2.141, ppl=4.41, wps=549672, ups=1.11, wpb=495599, bsz=16519.5, num_updates=44200, lr=0.000300828, gnorm=0.165, clip=0, loss_scale=1, train_wall=88, gb_free=21.5, wall=41045 epoch 027: 454 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=550414, ups=1.11, wpb=496151, bsz=16400.8, num_updates=44300, lr=0.000300489, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=41135 epoch 027: 454 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=550414, ups=1.11, wpb=496151, bsz=16400.8, num_updates=44300, lr=0.000300489, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=41135 epoch 027: 454 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=550414, ups=1.11, wpb=496151, bsz=16400.8, num_updates=44300, lr=0.000300489, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=41135 epoch 027: 454 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=550414, ups=1.11, wpb=496151, bsz=16400.8, num_updates=44300, lr=0.000300489, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=41135 epoch 027: 454 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=550414, ups=1.11, wpb=496151, bsz=16400.8, num_updates=44300, lr=0.000300489, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=41135 epoch 027: 454 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=550414, ups=1.11, wpb=496151, bsz=16400.8, num_updates=44300, lr=0.000300489, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=41135 epoch 027: 454 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=550414, ups=1.11, wpb=496151, bsz=16400.8, num_updates=44300, lr=0.000300489, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=41135 epoch 027: 454 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=550414, ups=1.11, wpb=496151, bsz=16400.8, num_updates=44300, lr=0.000300489, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=41135 epoch 027: 454 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=550414, ups=1.11, wpb=496151, bsz=16400.8, num_updates=44300, lr=0.000300489, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=41135 epoch 027: 454 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=550414, ups=1.11, wpb=496151, bsz=16400.8, num_updates=44300, lr=0.000300489, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=41135 epoch 027: 454 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=550414, ups=1.11, wpb=496151, bsz=16400.8, num_updates=44300, lr=0.000300489, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=41135 epoch 027: 454 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=550414, ups=1.11, wpb=496151, bsz=16400.8, num_updates=44300, lr=0.000300489, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=41135 epoch 027: 454 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=550414, ups=1.11, wpb=496151, bsz=16400.8, num_updates=44300, lr=0.000300489, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=41135 epoch 027: 454 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=550414, ups=1.11, wpb=496151, bsz=16400.8, num_updates=44300, lr=0.000300489, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=41135 epoch 027: 454 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=550414, ups=1.11, wpb=496151, bsz=16400.8, num_updates=44300, lr=0.000300489, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=41135 epoch 027: 454 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=550414, ups=1.11, wpb=496151, bsz=16400.8, num_updates=44300, lr=0.000300489, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=41135 epoch 027: 454 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=550414, ups=1.11, wpb=496151, bsz=16400.8, num_updates=44300, lr=0.000300489, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=41135 epoch 027: 454 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=550414, ups=1.11, wpb=496151, bsz=16400.8, num_updates=44300, lr=0.000300489, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=41135 epoch 027: 454 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=550414, ups=1.11, wpb=496151, bsz=16400.8, num_updates=44300, lr=0.000300489, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=41135 epoch 027: 454 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=550414, ups=1.11, wpb=496151, bsz=16400.8, num_updates=44300, lr=0.000300489, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=41135 epoch 027: 454 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=550414, ups=1.11, wpb=496151, bsz=16400.8, num_updates=44300, lr=0.000300489, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=41135 epoch 027: 454 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=550414, ups=1.11, wpb=496151, bsz=16400.8, num_updates=44300, lr=0.000300489, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=41135 epoch 027: 454 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=550414, ups=1.11, wpb=496151, bsz=16400.8, num_updates=44300, lr=0.000300489, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=41135 epoch 027: 454 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=550414, ups=1.11, wpb=496151, bsz=16400.8, num_updates=44300, lr=0.000300489, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=41135 epoch 027: 454 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=550414, ups=1.11, wpb=496151, bsz=16400.8, num_updates=44300, lr=0.000300489, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=41135 epoch 027: 454 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=550414, ups=1.11, wpb=496151, bsz=16400.8, num_updates=44300, lr=0.000300489, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=41135 epoch 027: 454 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=550414, ups=1.11, wpb=496151, bsz=16400.8, num_updates=44300, lr=0.000300489, gnorm=0.174, clip=0, loss_scale=1, train_wall=88, gb_free=21.7, wall=41135 epoch 027: 554 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=557460, ups=1.12, wpb=496654, bsz=16295.4, num_updates=44400, lr=0.00030015, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=41224 epoch 027: 554 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=557460, ups=1.12, wpb=496654, bsz=16295.4, num_updates=44400, lr=0.00030015, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=41224 epoch 027: 554 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=557460, ups=1.12, wpb=496654, bsz=16295.4, num_updates=44400, lr=0.00030015, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=41224 epoch 027: 554 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=557460, ups=1.12, wpb=496654, bsz=16295.4, num_updates=44400, lr=0.00030015, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=41224 epoch 027: 554 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=557460, ups=1.12, wpb=496654, bsz=16295.4, num_updates=44400, lr=0.00030015, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=41224 epoch 027: 554 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=557460, ups=1.12, wpb=496654, bsz=16295.4, num_updates=44400, lr=0.00030015, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=41224 epoch 027: 554 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=557460, ups=1.12, wpb=496654, bsz=16295.4, num_updates=44400, lr=0.00030015, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=41224 epoch 027: 554 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=557460, ups=1.12, wpb=496654, bsz=16295.4, num_updates=44400, lr=0.00030015, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=41224 epoch 027: 554 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=557460, ups=1.12, wpb=496654, bsz=16295.4, num_updates=44400, lr=0.00030015, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=41224 epoch 027: 554 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=557460, ups=1.12, wpb=496654, bsz=16295.4, num_updates=44400, lr=0.00030015, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=41224 epoch 027: 554 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=557460, ups=1.12, wpb=496654, bsz=16295.4, num_updates=44400, lr=0.00030015, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=41224 epoch 027: 554 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=557460, ups=1.12, wpb=496654, bsz=16295.4, num_updates=44400, lr=0.00030015, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=41224 epoch 027: 554 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=557460, ups=1.12, wpb=496654, bsz=16295.4, num_updates=44400, lr=0.00030015, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=41224 epoch 027: 554 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=557460, ups=1.12, wpb=496654, bsz=16295.4, num_updates=44400, lr=0.00030015, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=41224 epoch 027: 554 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=557460, ups=1.12, wpb=496654, bsz=16295.4, num_updates=44400, lr=0.00030015, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=41224 epoch 027: 554 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=557460, ups=1.12, wpb=496654, bsz=16295.4, num_updates=44400, lr=0.00030015, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=41224 epoch 027: 554 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=557460, ups=1.12, wpb=496654, bsz=16295.4, num_updates=44400, lr=0.00030015, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=41224 epoch 027: 554 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=557460, ups=1.12, wpb=496654, bsz=16295.4, num_updates=44400, lr=0.00030015, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=41224 epoch 027: 554 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=557460, ups=1.12, wpb=496654, bsz=16295.4, num_updates=44400, lr=0.00030015, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=41224 epoch 027: 554 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=557460, ups=1.12, wpb=496654, bsz=16295.4, num_updates=44400, lr=0.00030015, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=41224 epoch 027: 554 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=557460, ups=1.12, wpb=496654, bsz=16295.4, num_updates=44400, lr=0.00030015, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=41224 epoch 027: 554 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=557460, ups=1.12, wpb=496654, bsz=16295.4, num_updates=44400, lr=0.00030015, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=41224 epoch 027: 554 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=557460, ups=1.12, wpb=496654, bsz=16295.4, num_updates=44400, lr=0.00030015, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=41224 epoch 027: 554 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=557460, ups=1.12, wpb=496654, bsz=16295.4, num_updates=44400, lr=0.00030015, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=41224 epoch 027: 554 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=557460, ups=1.12, wpb=496654, bsz=16295.4, num_updates=44400, lr=0.00030015, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=41224 epoch 027: 554 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=557460, ups=1.12, wpb=496654, bsz=16295.4, num_updates=44400, lr=0.00030015, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=41224 epoch 027: 554 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=557460, ups=1.12, wpb=496654, bsz=16295.4, num_updates=44400, lr=0.00030015, gnorm=0.167, clip=0, loss_scale=1, train_wall=88, gb_free=21.3, wall=41224 epoch 027: 654 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=545678, ups=1.11, wpb=493767, bsz=16663.2, num_updates=44500, lr=0.000299813, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=41315 epoch 027: 654 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=545678, ups=1.11, wpb=493767, bsz=16663.2, num_updates=44500, lr=0.000299813, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=41315 epoch 027: 654 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=545678, ups=1.11, wpb=493767, bsz=16663.2, num_updates=44500, lr=0.000299813, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=41315 epoch 027: 654 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=545678, ups=1.11, wpb=493767, bsz=16663.2, num_updates=44500, lr=0.000299813, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=41315 epoch 027: 654 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=545678, ups=1.11, wpb=493767, bsz=16663.2, num_updates=44500, lr=0.000299813, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=41315 epoch 027: 654 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=545678, ups=1.11, wpb=493767, bsz=16663.2, num_updates=44500, lr=0.000299813, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=41315 epoch 027: 654 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=545678, ups=1.11, wpb=493767, bsz=16663.2, num_updates=44500, lr=0.000299813, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=41315 epoch 027: 654 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=545678, ups=1.11, wpb=493767, bsz=16663.2, num_updates=44500, lr=0.000299813, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=41315 epoch 027: 654 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=545678, ups=1.11, wpb=493767, bsz=16663.2, num_updates=44500, lr=0.000299813, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=41315 epoch 027: 654 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=545678, ups=1.11, wpb=493767, bsz=16663.2, num_updates=44500, lr=0.000299813, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=41315 epoch 027: 654 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=545678, ups=1.11, wpb=493767, bsz=16663.2, num_updates=44500, lr=0.000299813, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=41315 epoch 027: 654 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=545678, ups=1.11, wpb=493767, bsz=16663.2, num_updates=44500, lr=0.000299813, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=41315 epoch 027: 654 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=545678, ups=1.11, wpb=493767, bsz=16663.2, num_updates=44500, lr=0.000299813, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=41315 epoch 027: 654 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=545678, ups=1.11, wpb=493767, bsz=16663.2, num_updates=44500, lr=0.000299813, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=41315 epoch 027: 654 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=545678, ups=1.11, wpb=493767, bsz=16663.2, num_updates=44500, lr=0.000299813, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=41315 epoch 027: 654 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=545678, ups=1.11, wpb=493767, bsz=16663.2, num_updates=44500, lr=0.000299813, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=41315 epoch 027: 654 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=545678, ups=1.11, wpb=493767, bsz=16663.2, num_updates=44500, lr=0.000299813, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=41315 epoch 027: 654 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=545678, ups=1.11, wpb=493767, bsz=16663.2, num_updates=44500, lr=0.000299813, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=41315 epoch 027: 654 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=545678, ups=1.11, wpb=493767, bsz=16663.2, num_updates=44500, lr=0.000299813, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=41315 epoch 027: 654 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=545678, ups=1.11, wpb=493767, bsz=16663.2, num_updates=44500, lr=0.000299813, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=41315 epoch 027: 654 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=545678, ups=1.11, wpb=493767, bsz=16663.2, num_updates=44500, lr=0.000299813, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=41315 epoch 027: 654 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=545678, ups=1.11, wpb=493767, bsz=16663.2, num_updates=44500, lr=0.000299813, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=41315 epoch 027: 654 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=545678, ups=1.11, wpb=493767, bsz=16663.2, num_updates=44500, lr=0.000299813, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=41315 epoch 027: 654 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=545678, ups=1.11, wpb=493767, bsz=16663.2, num_updates=44500, lr=0.000299813, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=41315 epoch 027: 654 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=545678, ups=1.11, wpb=493767, bsz=16663.2, num_updates=44500, lr=0.000299813, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=41315 epoch 027: 654 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=545678, ups=1.11, wpb=493767, bsz=16663.2, num_updates=44500, lr=0.000299813, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=41315 epoch 027: 654 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=545678, ups=1.11, wpb=493767, bsz=16663.2, num_updates=44500, lr=0.000299813, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=41315 epoch 027: 754 / 1689 loss=3.685, nll_loss=2.15, ppl=4.44, wps=552392, ups=1.12, wpb=493912, bsz=16785.7, num_updates=44600, lr=0.000299476, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=41404 epoch 027: 754 / 1689 loss=3.685, nll_loss=2.15, ppl=4.44, wps=552392, ups=1.12, wpb=493912, bsz=16785.7, num_updates=44600, lr=0.000299476, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=41404 epoch 027: 754 / 1689 loss=3.685, nll_loss=2.15, ppl=4.44, wps=552392, ups=1.12, wpb=493912, bsz=16785.7, num_updates=44600, lr=0.000299476, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=41404 epoch 027: 754 / 1689 loss=3.685, nll_loss=2.15, ppl=4.44, wps=552392, ups=1.12, wpb=493912, bsz=16785.7, num_updates=44600, lr=0.000299476, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=41404 epoch 027: 754 / 1689 loss=3.685, nll_loss=2.15, ppl=4.44, wps=552392, ups=1.12, wpb=493912, bsz=16785.7, num_updates=44600, lr=0.000299476, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=41404 epoch 027: 754 / 1689 loss=3.685, nll_loss=2.15, ppl=4.44, wps=552392, ups=1.12, wpb=493912, bsz=16785.7, num_updates=44600, lr=0.000299476, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=41404 epoch 027: 754 / 1689 loss=3.685, nll_loss=2.15, ppl=4.44, wps=552392, ups=1.12, wpb=493912, bsz=16785.7, num_updates=44600, lr=0.000299476, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=41404 epoch 027: 754 / 1689 loss=3.685, nll_loss=2.15, ppl=4.44, wps=552392, ups=1.12, wpb=493912, bsz=16785.7, num_updates=44600, lr=0.000299476, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=41404 epoch 027: 754 / 1689 loss=3.685, nll_loss=2.15, ppl=4.44, wps=552392, ups=1.12, wpb=493912, bsz=16785.7, num_updates=44600, lr=0.000299476, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=41404 epoch 027: 754 / 1689 loss=3.685, nll_loss=2.15, ppl=4.44, wps=552392, ups=1.12, wpb=493912, bsz=16785.7, num_updates=44600, lr=0.000299476, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=41404 epoch 027: 754 / 1689 loss=3.685, nll_loss=2.15, ppl=4.44, wps=552392, ups=1.12, wpb=493912, bsz=16785.7, num_updates=44600, lr=0.000299476, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=41404 epoch 027: 754 / 1689 loss=3.685, nll_loss=2.15, ppl=4.44, wps=552392, ups=1.12, wpb=493912, bsz=16785.7, num_updates=44600, lr=0.000299476, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=41404 epoch 027: 754 / 1689 loss=3.685, nll_loss=2.15, ppl=4.44, wps=552392, ups=1.12, wpb=493912, bsz=16785.7, num_updates=44600, lr=0.000299476, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=41404 epoch 027: 754 / 1689 loss=3.685, nll_loss=2.15, ppl=4.44, wps=552392, ups=1.12, wpb=493912, bsz=16785.7, num_updates=44600, lr=0.000299476, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=41404 epoch 027: 754 / 1689 loss=3.685, nll_loss=2.15, ppl=4.44, wps=552392, ups=1.12, wpb=493912, bsz=16785.7, num_updates=44600, lr=0.000299476, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=41404 epoch 027: 754 / 1689 loss=3.685, nll_loss=2.15, ppl=4.44, wps=552392, ups=1.12, wpb=493912, bsz=16785.7, num_updates=44600, lr=0.000299476, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=41404 epoch 027: 754 / 1689 loss=3.685, nll_loss=2.15, ppl=4.44, wps=552392, ups=1.12, wpb=493912, bsz=16785.7, num_updates=44600, lr=0.000299476, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=41404 epoch 027: 754 / 1689 loss=3.685, nll_loss=2.15, ppl=4.44, wps=552392, ups=1.12, wpb=493912, bsz=16785.7, num_updates=44600, lr=0.000299476, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=41404 epoch 027: 754 / 1689 loss=3.685, nll_loss=2.15, ppl=4.44, wps=552392, ups=1.12, wpb=493912, bsz=16785.7, num_updates=44600, lr=0.000299476, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=41404 epoch 027: 754 / 1689 loss=3.685, nll_loss=2.15, ppl=4.44, wps=552392, ups=1.12, wpb=493912, bsz=16785.7, num_updates=44600, lr=0.000299476, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=41404 epoch 027: 754 / 1689 loss=3.685, nll_loss=2.15, ppl=4.44, wps=552392, ups=1.12, wpb=493912, bsz=16785.7, num_updates=44600, lr=0.000299476, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=41404 epoch 027: 754 / 1689 loss=3.685, nll_loss=2.15, ppl=4.44, wps=552392, ups=1.12, wpb=493912, bsz=16785.7, num_updates=44600, lr=0.000299476, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=41404 epoch 027: 754 / 1689 loss=3.685, nll_loss=2.15, ppl=4.44, wps=552392, ups=1.12, wpb=493912, bsz=16785.7, num_updates=44600, lr=0.000299476, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=41404 epoch 027: 754 / 1689 loss=3.685, nll_loss=2.15, ppl=4.44, wps=552392, ups=1.12, wpb=493912, bsz=16785.7, num_updates=44600, lr=0.000299476, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=41404 epoch 027: 754 / 1689 loss=3.685, nll_loss=2.15, ppl=4.44, wps=552392, ups=1.12, wpb=493912, bsz=16785.7, num_updates=44600, lr=0.000299476, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=41404 epoch 027: 754 / 1689 loss=3.685, nll_loss=2.15, ppl=4.44, wps=552392, ups=1.12, wpb=493912, bsz=16785.7, num_updates=44600, lr=0.000299476, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=41404 epoch 027: 754 / 1689 loss=3.685, nll_loss=2.15, ppl=4.44, wps=552392, ups=1.12, wpb=493912, bsz=16785.7, num_updates=44600, lr=0.000299476, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=41404 epoch 027: 854 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=551482, ups=1.11, wpb=494896, bsz=16742.2, num_updates=44700, lr=0.000299141, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=41494 epoch 027: 854 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=551482, ups=1.11, wpb=494896, bsz=16742.2, num_updates=44700, lr=0.000299141, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=41494 epoch 027: 854 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=551482, ups=1.11, wpb=494896, bsz=16742.2, num_updates=44700, lr=0.000299141, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=41494 epoch 027: 854 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=551482, ups=1.11, wpb=494896, bsz=16742.2, num_updates=44700, lr=0.000299141, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=41494 epoch 027: 854 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=551482, ups=1.11, wpb=494896, bsz=16742.2, num_updates=44700, lr=0.000299141, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=41494 epoch 027: 854 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=551482, ups=1.11, wpb=494896, bsz=16742.2, num_updates=44700, lr=0.000299141, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=41494 epoch 027: 854 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=551482, ups=1.11, wpb=494896, bsz=16742.2, num_updates=44700, lr=0.000299141, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=41494 epoch 027: 854 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=551482, ups=1.11, wpb=494896, bsz=16742.2, num_updates=44700, lr=0.000299141, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=41494 epoch 027: 854 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=551482, ups=1.11, wpb=494896, bsz=16742.2, num_updates=44700, lr=0.000299141, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=41494 epoch 027: 854 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=551482, ups=1.11, wpb=494896, bsz=16742.2, num_updates=44700, lr=0.000299141, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=41494 epoch 027: 854 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=551482, ups=1.11, wpb=494896, bsz=16742.2, num_updates=44700, lr=0.000299141, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=41494 epoch 027: 854 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=551482, ups=1.11, wpb=494896, bsz=16742.2, num_updates=44700, lr=0.000299141, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=41494 epoch 027: 854 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=551482, ups=1.11, wpb=494896, bsz=16742.2, num_updates=44700, lr=0.000299141, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=41494 epoch 027: 854 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=551482, ups=1.11, wpb=494896, bsz=16742.2, num_updates=44700, lr=0.000299141, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=41494 epoch 027: 854 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=551482, ups=1.11, wpb=494896, bsz=16742.2, num_updates=44700, lr=0.000299141, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=41494 epoch 027: 854 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=551482, ups=1.11, wpb=494896, bsz=16742.2, num_updates=44700, lr=0.000299141, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=41494 epoch 027: 854 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=551482, ups=1.11, wpb=494896, bsz=16742.2, num_updates=44700, lr=0.000299141, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=41494 epoch 027: 854 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=551482, ups=1.11, wpb=494896, bsz=16742.2, num_updates=44700, lr=0.000299141, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=41494 epoch 027: 854 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=551482, ups=1.11, wpb=494896, bsz=16742.2, num_updates=44700, lr=0.000299141, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=41494 epoch 027: 854 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=551482, ups=1.11, wpb=494896, bsz=16742.2, num_updates=44700, lr=0.000299141, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=41494 epoch 027: 854 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=551482, ups=1.11, wpb=494896, bsz=16742.2, num_updates=44700, lr=0.000299141, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=41494 epoch 027: 854 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=551482, ups=1.11, wpb=494896, bsz=16742.2, num_updates=44700, lr=0.000299141, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=41494 epoch 027: 854 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=551482, ups=1.11, wpb=494896, bsz=16742.2, num_updates=44700, lr=0.000299141, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=41494 epoch 027: 854 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=551482, ups=1.11, wpb=494896, bsz=16742.2, num_updates=44700, lr=0.000299141, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=41494 epoch 027: 854 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=551482, ups=1.11, wpb=494896, bsz=16742.2, num_updates=44700, lr=0.000299141, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=41494 epoch 027: 854 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=551482, ups=1.11, wpb=494896, bsz=16742.2, num_updates=44700, lr=0.000299141, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=41494 epoch 027: 854 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=551482, ups=1.11, wpb=494896, bsz=16742.2, num_updates=44700, lr=0.000299141, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=41494 epoch 027: 954 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=555841, ups=1.12, wpb=495320, bsz=16196.5, num_updates=44800, lr=0.000298807, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=41583 epoch 027: 954 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=555841, ups=1.12, wpb=495320, bsz=16196.5, num_updates=44800, lr=0.000298807, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=41583 epoch 027: 954 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=555841, ups=1.12, wpb=495320, bsz=16196.5, num_updates=44800, lr=0.000298807, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=41583 epoch 027: 954 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=555841, ups=1.12, wpb=495320, bsz=16196.5, num_updates=44800, lr=0.000298807, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=41583 epoch 027: 954 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=555841, ups=1.12, wpb=495320, bsz=16196.5, num_updates=44800, lr=0.000298807, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=41583 epoch 027: 954 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=555841, ups=1.12, wpb=495320, bsz=16196.5, num_updates=44800, lr=0.000298807, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=41583 epoch 027: 954 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=555841, ups=1.12, wpb=495320, bsz=16196.5, num_updates=44800, lr=0.000298807, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=41583 epoch 027: 954 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=555841, ups=1.12, wpb=495320, bsz=16196.5, num_updates=44800, lr=0.000298807, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=41583 epoch 027: 954 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=555841, ups=1.12, wpb=495320, bsz=16196.5, num_updates=44800, lr=0.000298807, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=41583 epoch 027: 954 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=555841, ups=1.12, wpb=495320, bsz=16196.5, num_updates=44800, lr=0.000298807, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=41583 epoch 027: 954 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=555841, ups=1.12, wpb=495320, bsz=16196.5, num_updates=44800, lr=0.000298807, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=41583 epoch 027: 954 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=555841, ups=1.12, wpb=495320, bsz=16196.5, num_updates=44800, lr=0.000298807, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=41583 epoch 027: 954 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=555841, ups=1.12, wpb=495320, bsz=16196.5, num_updates=44800, lr=0.000298807, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=41583 epoch 027: 954 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=555841, ups=1.12, wpb=495320, bsz=16196.5, num_updates=44800, lr=0.000298807, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=41583 epoch 027: 954 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=555841, ups=1.12, wpb=495320, bsz=16196.5, num_updates=44800, lr=0.000298807, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=41583 epoch 027: 954 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=555841, ups=1.12, wpb=495320, bsz=16196.5, num_updates=44800, lr=0.000298807, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=41583 epoch 027: 954 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=555841, ups=1.12, wpb=495320, bsz=16196.5, num_updates=44800, lr=0.000298807, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=41583 epoch 027: 954 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=555841, ups=1.12, wpb=495320, bsz=16196.5, num_updates=44800, lr=0.000298807, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=41583 epoch 027: 954 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=555841, ups=1.12, wpb=495320, bsz=16196.5, num_updates=44800, lr=0.000298807, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=41583 epoch 027: 954 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=555841, ups=1.12, wpb=495320, bsz=16196.5, num_updates=44800, lr=0.000298807, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=41583 epoch 027: 954 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=555841, ups=1.12, wpb=495320, bsz=16196.5, num_updates=44800, lr=0.000298807, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=41583 epoch 027: 954 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=555841, ups=1.12, wpb=495320, bsz=16196.5, num_updates=44800, lr=0.000298807, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=41583 epoch 027: 954 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=555841, ups=1.12, wpb=495320, bsz=16196.5, num_updates=44800, lr=0.000298807, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=41583 epoch 027: 954 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=555841, ups=1.12, wpb=495320, bsz=16196.5, num_updates=44800, lr=0.000298807, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=41583 epoch 027: 954 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=555841, ups=1.12, wpb=495320, bsz=16196.5, num_updates=44800, lr=0.000298807, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=41583 epoch 027: 954 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=555841, ups=1.12, wpb=495320, bsz=16196.5, num_updates=44800, lr=0.000298807, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=41583 epoch 027: 954 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=555841, ups=1.12, wpb=495320, bsz=16196.5, num_updates=44800, lr=0.000298807, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.7, wall=41583 epoch 027: 1054 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=552334, ups=1.12, wpb=494216, bsz=16841.9, num_updates=44900, lr=0.000298474, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=41672 epoch 027: 1054 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=552334, ups=1.12, wpb=494216, bsz=16841.9, num_updates=44900, lr=0.000298474, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=41672 epoch 027: 1054 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=552334, ups=1.12, wpb=494216, bsz=16841.9, num_updates=44900, lr=0.000298474, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=41672 epoch 027: 1054 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=552334, ups=1.12, wpb=494216, bsz=16841.9, num_updates=44900, lr=0.000298474, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=41672 epoch 027: 1054 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=552334, ups=1.12, wpb=494216, bsz=16841.9, num_updates=44900, lr=0.000298474, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=41672 epoch 027: 1054 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=552334, ups=1.12, wpb=494216, bsz=16841.9, num_updates=44900, lr=0.000298474, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=41672 epoch 027: 1054 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=552334, ups=1.12, wpb=494216, bsz=16841.9, num_updates=44900, lr=0.000298474, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=41672 epoch 027: 1054 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=552334, ups=1.12, wpb=494216, bsz=16841.9, num_updates=44900, lr=0.000298474, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=41672 epoch 027: 1054 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=552334, ups=1.12, wpb=494216, bsz=16841.9, num_updates=44900, lr=0.000298474, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=41672 epoch 027: 1054 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=552334, ups=1.12, wpb=494216, bsz=16841.9, num_updates=44900, lr=0.000298474, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=41672 epoch 027: 1054 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=552334, ups=1.12, wpb=494216, bsz=16841.9, num_updates=44900, lr=0.000298474, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=41672 epoch 027: 1054 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=552334, ups=1.12, wpb=494216, bsz=16841.9, num_updates=44900, lr=0.000298474, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=41672 epoch 027: 1054 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=552334, ups=1.12, wpb=494216, bsz=16841.9, num_updates=44900, lr=0.000298474, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=41672 epoch 027: 1054 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=552334, ups=1.12, wpb=494216, bsz=16841.9, num_updates=44900, lr=0.000298474, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=41672 epoch 027: 1054 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=552334, ups=1.12, wpb=494216, bsz=16841.9, num_updates=44900, lr=0.000298474, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=41672 epoch 027: 1054 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=552334, ups=1.12, wpb=494216, bsz=16841.9, num_updates=44900, lr=0.000298474, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=41672 epoch 027: 1054 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=552334, ups=1.12, wpb=494216, bsz=16841.9, num_updates=44900, lr=0.000298474, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=41672 epoch 027: 1054 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=552334, ups=1.12, wpb=494216, bsz=16841.9, num_updates=44900, lr=0.000298474, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=41672 epoch 027: 1054 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=552334, ups=1.12, wpb=494216, bsz=16841.9, num_updates=44900, lr=0.000298474, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=41672 epoch 027: 1054 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=552334, ups=1.12, wpb=494216, bsz=16841.9, num_updates=44900, lr=0.000298474, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=41672 epoch 027: 1054 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=552334, ups=1.12, wpb=494216, bsz=16841.9, num_updates=44900, lr=0.000298474, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=41672 epoch 027: 1054 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=552334, ups=1.12, wpb=494216, bsz=16841.9, num_updates=44900, lr=0.000298474, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=41672 epoch 027: 1054 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=552334, ups=1.12, wpb=494216, bsz=16841.9, num_updates=44900, lr=0.000298474, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=41672 epoch 027: 1054 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=552334, ups=1.12, wpb=494216, bsz=16841.9, num_updates=44900, lr=0.000298474, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=41672 epoch 027: 1054 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=552334, ups=1.12, wpb=494216, bsz=16841.9, num_updates=44900, lr=0.000298474, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=41672 epoch 027: 1054 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=552334, ups=1.12, wpb=494216, bsz=16841.9, num_updates=44900, lr=0.000298474, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=41672 epoch 027: 1054 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=552334, ups=1.12, wpb=494216, bsz=16841.9, num_updates=44900, lr=0.000298474, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=41672 epoch 027: 1154 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=551500, ups=1.11, wpb=494712, bsz=17057.3, num_updates=45000, lr=0.000298142, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=41762 epoch 027: 1154 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=551500, ups=1.11, wpb=494712, bsz=17057.3, num_updates=45000, lr=0.000298142, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=41762 epoch 027: 1154 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=551500, ups=1.11, wpb=494712, bsz=17057.3, num_updates=45000, lr=0.000298142, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=41762 epoch 027: 1154 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=551500, ups=1.11, wpb=494712, bsz=17057.3, num_updates=45000, lr=0.000298142, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=41762 epoch 027: 1154 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=551500, ups=1.11, wpb=494712, bsz=17057.3, num_updates=45000, lr=0.000298142, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=41762 epoch 027: 1154 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=551500, ups=1.11, wpb=494712, bsz=17057.3, num_updates=45000, lr=0.000298142, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=41762 epoch 027: 1154 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=551500, ups=1.11, wpb=494712, bsz=17057.3, num_updates=45000, lr=0.000298142, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=41762 epoch 027: 1154 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=551500, ups=1.11, wpb=494712, bsz=17057.3, num_updates=45000, lr=0.000298142, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=41762 epoch 027: 1154 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=551500, ups=1.11, wpb=494712, bsz=17057.3, num_updates=45000, lr=0.000298142, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=41762 epoch 027: 1154 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=551500, ups=1.11, wpb=494712, bsz=17057.3, num_updates=45000, lr=0.000298142, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=41762 epoch 027: 1154 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=551500, ups=1.11, wpb=494712, bsz=17057.3, num_updates=45000, lr=0.000298142, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=41762 epoch 027: 1154 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=551500, ups=1.11, wpb=494712, bsz=17057.3, num_updates=45000, lr=0.000298142, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=41762 epoch 027: 1154 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=551500, ups=1.11, wpb=494712, bsz=17057.3, num_updates=45000, lr=0.000298142, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=41762 epoch 027: 1154 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=551500, ups=1.11, wpb=494712, bsz=17057.3, num_updates=45000, lr=0.000298142, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=41762 epoch 027: 1154 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=551500, ups=1.11, wpb=494712, bsz=17057.3, num_updates=45000, lr=0.000298142, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=41762 epoch 027: 1154 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=551500, ups=1.11, wpb=494712, bsz=17057.3, num_updates=45000, lr=0.000298142, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=41762 epoch 027: 1154 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=551500, ups=1.11, wpb=494712, bsz=17057.3, num_updates=45000, lr=0.000298142, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=41762 epoch 027: 1154 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=551500, ups=1.11, wpb=494712, bsz=17057.3, num_updates=45000, lr=0.000298142, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=41762 epoch 027: 1154 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=551500, ups=1.11, wpb=494712, bsz=17057.3, num_updates=45000, lr=0.000298142, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=41762 epoch 027: 1154 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=551500, ups=1.11, wpb=494712, bsz=17057.3, num_updates=45000, lr=0.000298142, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=41762 epoch 027: 1154 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=551500, ups=1.11, wpb=494712, bsz=17057.3, num_updates=45000, lr=0.000298142, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=41762 epoch 027: 1154 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=551500, ups=1.11, wpb=494712, bsz=17057.3, num_updates=45000, lr=0.000298142, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=41762 epoch 027: 1154 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=551500, ups=1.11, wpb=494712, bsz=17057.3, num_updates=45000, lr=0.000298142, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=41762 epoch 027: 1154 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=551500, ups=1.11, wpb=494712, bsz=17057.3, num_updates=45000, lr=0.000298142, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=41762 epoch 027: 1154 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=551500, ups=1.11, wpb=494712, bsz=17057.3, num_updates=45000, lr=0.000298142, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=41762 epoch 027: 1154 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=551500, ups=1.11, wpb=494712, bsz=17057.3, num_updates=45000, lr=0.000298142, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=41762 epoch 027: 1154 / 1689 loss=3.687, nll_loss=2.151, ppl=4.44, wps=551500, ups=1.11, wpb=494712, bsz=17057.3, num_updates=45000, lr=0.000298142, gnorm=0.17, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=41762 begin validation on "valid" subset epoch 027 | valid on 'valid' subset | loss 3.709 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.709 epoch 027 | valid on 'valid' subset | loss 3.709 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.709 epoch 027 | valid on 'valid' subset | loss 3.709 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.709 epoch 027 | valid on 'valid' subset | loss 3.709 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.709 epoch 027 | valid on 'valid' subset | loss 3.709 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.709 epoch 027 | valid on 'valid' subset | loss 3.709 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.709 epoch 027 | valid on 'valid' subset | loss 3.709 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.709 epoch 027 | valid on 'valid' subset | loss 3.709 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.709 epoch 027 | valid on 'valid' subset | loss 3.709 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.709 epoch 027 | valid on 'valid' subset | loss 3.709 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.709 epoch 027 | valid on 'valid' subset | loss 3.709 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.709 epoch 027 | valid on 'valid' subset | loss 3.709 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.709 epoch 027 | valid on 'valid' subset | loss 3.709 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.709 epoch 027 | valid on 'valid' subset | loss 3.709 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.709 epoch 027 | valid on 'valid' subset | loss 3.709 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.709 epoch 027 | valid on 'valid' subset | loss 3.709 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.709 epoch 027 | valid on 'valid' subset | loss 3.709 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.709 epoch 027 | valid on 'valid' subset | loss 3.709 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.709 epoch 027 | valid on 'valid' subset | loss 3.709 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.709 epoch 027 | valid on 'valid' subset | loss 3.709 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.709 epoch 027 | valid on 'valid' subset | loss 3.709 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.709 epoch 027 | valid on 'valid' subset | loss 3.709 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.709 epoch 027 | valid on 'valid' subset | loss 3.709 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.709 epoch 027 | valid on 'valid' subset | loss 3.709 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.709 epoch 027 | valid on 'valid' subset | loss 3.709 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.709 epoch 027 | valid on 'valid' subset | loss 3.709 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.709 epoch 027 | valid on 'valid' subset | loss 3.709 | nll_loss 2.151 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 45000 | best_loss 3.709 epoch 027: 1254 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=406635, ups=0.82, wpb=495762, bsz=16263.4, num_updates=45100, lr=0.000297812, gnorm=0.174, clip=0, loss_scale=4, train_wall=93, gb_free=21.6, wall=41884 epoch 027: 1254 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=406635, ups=0.82, wpb=495762, bsz=16263.4, num_updates=45100, lr=0.000297812, gnorm=0.174, clip=0, loss_scale=4, train_wall=93, gb_free=21.6, wall=41884 epoch 027: 1254 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=406635, ups=0.82, wpb=495762, bsz=16263.4, num_updates=45100, lr=0.000297812, gnorm=0.174, clip=0, loss_scale=4, train_wall=93, gb_free=21.6, wall=41884 epoch 027: 1254 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=406635, ups=0.82, wpb=495762, bsz=16263.4, num_updates=45100, lr=0.000297812, gnorm=0.174, clip=0, loss_scale=4, train_wall=93, gb_free=21.6, wall=41884 epoch 027: 1254 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=406635, ups=0.82, wpb=495762, bsz=16263.4, num_updates=45100, lr=0.000297812, gnorm=0.174, clip=0, loss_scale=4, train_wall=93, gb_free=21.6, wall=41884 epoch 027: 1254 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=406635, ups=0.82, wpb=495762, bsz=16263.4, num_updates=45100, lr=0.000297812, gnorm=0.174, clip=0, loss_scale=4, train_wall=93, gb_free=21.6, wall=41884 epoch 027: 1254 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=406635, ups=0.82, wpb=495762, bsz=16263.4, num_updates=45100, lr=0.000297812, gnorm=0.174, clip=0, loss_scale=4, train_wall=93, gb_free=21.6, wall=41884 epoch 027: 1254 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=406635, ups=0.82, wpb=495762, bsz=16263.4, num_updates=45100, lr=0.000297812, gnorm=0.174, clip=0, loss_scale=4, train_wall=93, gb_free=21.6, wall=41884 epoch 027: 1254 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=406635, ups=0.82, wpb=495762, bsz=16263.4, num_updates=45100, lr=0.000297812, gnorm=0.174, clip=0, loss_scale=4, train_wall=93, gb_free=21.6, wall=41884 epoch 027: 1254 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=406635, ups=0.82, wpb=495762, bsz=16263.4, num_updates=45100, lr=0.000297812, gnorm=0.174, clip=0, loss_scale=4, train_wall=93, gb_free=21.6, wall=41884 epoch 027: 1254 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=406635, ups=0.82, wpb=495762, bsz=16263.4, num_updates=45100, lr=0.000297812, gnorm=0.174, clip=0, loss_scale=4, train_wall=93, gb_free=21.6, wall=41884 epoch 027: 1254 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=406635, ups=0.82, wpb=495762, bsz=16263.4, num_updates=45100, lr=0.000297812, gnorm=0.174, clip=0, loss_scale=4, train_wall=93, gb_free=21.6, wall=41884 epoch 027: 1254 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=406635, ups=0.82, wpb=495762, bsz=16263.4, num_updates=45100, lr=0.000297812, gnorm=0.174, clip=0, loss_scale=4, train_wall=93, gb_free=21.6, wall=41884 epoch 027: 1254 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=406635, ups=0.82, wpb=495762, bsz=16263.4, num_updates=45100, lr=0.000297812, gnorm=0.174, clip=0, loss_scale=4, train_wall=93, gb_free=21.6, wall=41884 epoch 027: 1254 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=406635, ups=0.82, wpb=495762, bsz=16263.4, num_updates=45100, lr=0.000297812, gnorm=0.174, clip=0, loss_scale=4, train_wall=93, gb_free=21.6, wall=41884 epoch 027: 1254 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=406635, ups=0.82, wpb=495762, bsz=16263.4, num_updates=45100, lr=0.000297812, gnorm=0.174, clip=0, loss_scale=4, train_wall=93, gb_free=21.6, wall=41884 epoch 027: 1254 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=406635, ups=0.82, wpb=495762, bsz=16263.4, num_updates=45100, lr=0.000297812, gnorm=0.174, clip=0, loss_scale=4, train_wall=93, gb_free=21.6, wall=41884 epoch 027: 1254 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=406635, ups=0.82, wpb=495762, bsz=16263.4, num_updates=45100, lr=0.000297812, gnorm=0.174, clip=0, loss_scale=4, train_wall=93, gb_free=21.6, wall=41884 epoch 027: 1254 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=406635, ups=0.82, wpb=495762, bsz=16263.4, num_updates=45100, lr=0.000297812, gnorm=0.174, clip=0, loss_scale=4, train_wall=93, gb_free=21.6, wall=41884 epoch 027: 1254 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=406635, ups=0.82, wpb=495762, bsz=16263.4, num_updates=45100, lr=0.000297812, gnorm=0.174, clip=0, loss_scale=4, train_wall=93, gb_free=21.6, wall=41884 epoch 027: 1254 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=406635, ups=0.82, wpb=495762, bsz=16263.4, num_updates=45100, lr=0.000297812, gnorm=0.174, clip=0, loss_scale=4, train_wall=93, gb_free=21.6, wall=41884 epoch 027: 1254 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=406635, ups=0.82, wpb=495762, bsz=16263.4, num_updates=45100, lr=0.000297812, gnorm=0.174, clip=0, loss_scale=4, train_wall=93, gb_free=21.6, wall=41884 epoch 027: 1254 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=406635, ups=0.82, wpb=495762, bsz=16263.4, num_updates=45100, lr=0.000297812, gnorm=0.174, clip=0, loss_scale=4, train_wall=93, gb_free=21.6, wall=41884 epoch 027: 1254 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=406635, ups=0.82, wpb=495762, bsz=16263.4, num_updates=45100, lr=0.000297812, gnorm=0.174, clip=0, loss_scale=4, train_wall=93, gb_free=21.6, wall=41884 epoch 027: 1254 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=406635, ups=0.82, wpb=495762, bsz=16263.4, num_updates=45100, lr=0.000297812, gnorm=0.174, clip=0, loss_scale=4, train_wall=93, gb_free=21.6, wall=41884 epoch 027: 1254 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=406635, ups=0.82, wpb=495762, bsz=16263.4, num_updates=45100, lr=0.000297812, gnorm=0.174, clip=0, loss_scale=4, train_wall=93, gb_free=21.6, wall=41884 epoch 027: 1254 / 1689 loss=3.692, nll_loss=2.157, ppl=4.46, wps=406635, ups=0.82, wpb=495762, bsz=16263.4, num_updates=45100, lr=0.000297812, gnorm=0.174, clip=0, loss_scale=4, train_wall=93, gb_free=21.6, wall=41884 epoch 027: 1355 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=553599, ups=1.12, wpb=496053, bsz=16483.9, num_updates=45200, lr=0.000297482, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=41974 epoch 027: 1355 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=553599, ups=1.12, wpb=496053, bsz=16483.9, num_updates=45200, lr=0.000297482, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=41974 epoch 027: 1355 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=553599, ups=1.12, wpb=496053, bsz=16483.9, num_updates=45200, lr=0.000297482, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=41974 epoch 027: 1355 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=553599, ups=1.12, wpb=496053, bsz=16483.9, num_updates=45200, lr=0.000297482, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=41974 epoch 027: 1355 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=553599, ups=1.12, wpb=496053, bsz=16483.9, num_updates=45200, lr=0.000297482, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=41974 epoch 027: 1355 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=553599, ups=1.12, wpb=496053, bsz=16483.9, num_updates=45200, lr=0.000297482, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=41974 epoch 027: 1355 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=553599, ups=1.12, wpb=496053, bsz=16483.9, num_updates=45200, lr=0.000297482, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=41974 epoch 027: 1355 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=553599, ups=1.12, wpb=496053, bsz=16483.9, num_updates=45200, lr=0.000297482, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=41974 epoch 027: 1355 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=553599, ups=1.12, wpb=496053, bsz=16483.9, num_updates=45200, lr=0.000297482, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=41974 epoch 027: 1355 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=553599, ups=1.12, wpb=496053, bsz=16483.9, num_updates=45200, lr=0.000297482, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=41974 epoch 027: 1355 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=553599, ups=1.12, wpb=496053, bsz=16483.9, num_updates=45200, lr=0.000297482, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=41974 epoch 027: 1355 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=553599, ups=1.12, wpb=496053, bsz=16483.9, num_updates=45200, lr=0.000297482, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=41974 epoch 027: 1355 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=553599, ups=1.12, wpb=496053, bsz=16483.9, num_updates=45200, lr=0.000297482, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=41974 epoch 027: 1355 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=553599, ups=1.12, wpb=496053, bsz=16483.9, num_updates=45200, lr=0.000297482, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=41974 epoch 027: 1355 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=553599, ups=1.12, wpb=496053, bsz=16483.9, num_updates=45200, lr=0.000297482, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=41974 epoch 027: 1355 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=553599, ups=1.12, wpb=496053, bsz=16483.9, num_updates=45200, lr=0.000297482, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=41974 epoch 027: 1355 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=553599, ups=1.12, wpb=496053, bsz=16483.9, num_updates=45200, lr=0.000297482, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=41974 epoch 027: 1355 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=553599, ups=1.12, wpb=496053, bsz=16483.9, num_updates=45200, lr=0.000297482, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=41974 epoch 027: 1355 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=553599, ups=1.12, wpb=496053, bsz=16483.9, num_updates=45200, lr=0.000297482, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=41974 epoch 027: 1355 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=553599, ups=1.12, wpb=496053, bsz=16483.9, num_updates=45200, lr=0.000297482, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=41974 epoch 027: 1355 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=553599, ups=1.12, wpb=496053, bsz=16483.9, num_updates=45200, lr=0.000297482, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=41974 epoch 027: 1355 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=553599, ups=1.12, wpb=496053, bsz=16483.9, num_updates=45200, lr=0.000297482, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=41974 epoch 027: 1355 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=553599, ups=1.12, wpb=496053, bsz=16483.9, num_updates=45200, lr=0.000297482, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=41974 epoch 027: 1355 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=553599, ups=1.12, wpb=496053, bsz=16483.9, num_updates=45200, lr=0.000297482, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=41974 epoch 027: 1355 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=553599, ups=1.12, wpb=496053, bsz=16483.9, num_updates=45200, lr=0.000297482, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=41974 epoch 027: 1355 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=553599, ups=1.12, wpb=496053, bsz=16483.9, num_updates=45200, lr=0.000297482, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=41974 epoch 027: 1355 / 1689 loss=3.688, nll_loss=2.153, ppl=4.45, wps=553599, ups=1.12, wpb=496053, bsz=16483.9, num_updates=45200, lr=0.000297482, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=41974 epoch 027: 1455 / 1689 loss=3.687, nll_loss=2.152, ppl=4.45, wps=556477, ups=1.12, wpb=496269, bsz=16355.1, num_updates=45300, lr=0.000297154, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=42063 epoch 027: 1455 / 1689 loss=3.687, nll_loss=2.152, ppl=4.45, wps=556477, ups=1.12, wpb=496269, bsz=16355.1, num_updates=45300, lr=0.000297154, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=42063 epoch 027: 1455 / 1689 loss=3.687, nll_loss=2.152, ppl=4.45, wps=556477, ups=1.12, wpb=496269, bsz=16355.1, num_updates=45300, lr=0.000297154, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=42063 epoch 027: 1455 / 1689 loss=3.687, nll_loss=2.152, ppl=4.45, wps=556477, ups=1.12, wpb=496269, bsz=16355.1, num_updates=45300, lr=0.000297154, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=42063 epoch 027: 1455 / 1689 loss=3.687, nll_loss=2.152, ppl=4.45, wps=556477, ups=1.12, wpb=496269, bsz=16355.1, num_updates=45300, lr=0.000297154, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=42063 epoch 027: 1455 / 1689 loss=3.687, nll_loss=2.152, ppl=4.45, wps=556477, ups=1.12, wpb=496269, bsz=16355.1, num_updates=45300, lr=0.000297154, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=42063 epoch 027: 1455 / 1689 loss=3.687, nll_loss=2.152, ppl=4.45, wps=556477, ups=1.12, wpb=496269, bsz=16355.1, num_updates=45300, lr=0.000297154, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=42063 epoch 027: 1455 / 1689 loss=3.687, nll_loss=2.152, ppl=4.45, wps=556477, ups=1.12, wpb=496269, bsz=16355.1, num_updates=45300, lr=0.000297154, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=42063 epoch 027: 1455 / 1689 loss=3.687, nll_loss=2.152, ppl=4.45, wps=556477, ups=1.12, wpb=496269, bsz=16355.1, num_updates=45300, lr=0.000297154, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=42063 epoch 027: 1455 / 1689 loss=3.687, nll_loss=2.152, ppl=4.45, wps=556477, ups=1.12, wpb=496269, bsz=16355.1, num_updates=45300, lr=0.000297154, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=42063 epoch 027: 1455 / 1689 loss=3.687, nll_loss=2.152, ppl=4.45, wps=556477, ups=1.12, wpb=496269, bsz=16355.1, num_updates=45300, lr=0.000297154, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=42063 epoch 027: 1455 / 1689 loss=3.687, nll_loss=2.152, ppl=4.45, wps=556477, ups=1.12, wpb=496269, bsz=16355.1, num_updates=45300, lr=0.000297154, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=42063 epoch 027: 1455 / 1689 loss=3.687, nll_loss=2.152, ppl=4.45, wps=556477, ups=1.12, wpb=496269, bsz=16355.1, num_updates=45300, lr=0.000297154, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=42063 epoch 027: 1455 / 1689 loss=3.687, nll_loss=2.152, ppl=4.45, wps=556477, ups=1.12, wpb=496269, bsz=16355.1, num_updates=45300, lr=0.000297154, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=42063 epoch 027: 1455 / 1689 loss=3.687, nll_loss=2.152, ppl=4.45, wps=556477, ups=1.12, wpb=496269, bsz=16355.1, num_updates=45300, lr=0.000297154, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=42063 epoch 027: 1455 / 1689 loss=3.687, nll_loss=2.152, ppl=4.45, wps=556477, ups=1.12, wpb=496269, bsz=16355.1, num_updates=45300, lr=0.000297154, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=42063 epoch 027: 1455 / 1689 loss=3.687, nll_loss=2.152, ppl=4.45, wps=556477, ups=1.12, wpb=496269, bsz=16355.1, num_updates=45300, lr=0.000297154, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=42063 epoch 027: 1455 / 1689 loss=3.687, nll_loss=2.152, ppl=4.45, wps=556477, ups=1.12, wpb=496269, bsz=16355.1, num_updates=45300, lr=0.000297154, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=42063 epoch 027: 1455 / 1689 loss=3.687, nll_loss=2.152, ppl=4.45, wps=556477, ups=1.12, wpb=496269, bsz=16355.1, num_updates=45300, lr=0.000297154, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=42063 epoch 027: 1455 / 1689 loss=3.687, nll_loss=2.152, ppl=4.45, wps=556477, ups=1.12, wpb=496269, bsz=16355.1, num_updates=45300, lr=0.000297154, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=42063 epoch 027: 1455 / 1689 loss=3.687, nll_loss=2.152, ppl=4.45, wps=556477, ups=1.12, wpb=496269, bsz=16355.1, num_updates=45300, lr=0.000297154, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=42063 epoch 027: 1455 / 1689 loss=3.687, nll_loss=2.152, ppl=4.45, wps=556477, ups=1.12, wpb=496269, bsz=16355.1, num_updates=45300, lr=0.000297154, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=42063 epoch 027: 1455 / 1689 loss=3.687, nll_loss=2.152, ppl=4.45, wps=556477, ups=1.12, wpb=496269, bsz=16355.1, num_updates=45300, lr=0.000297154, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=42063 epoch 027: 1455 / 1689 loss=3.687, nll_loss=2.152, ppl=4.45, wps=556477, ups=1.12, wpb=496269, bsz=16355.1, num_updates=45300, lr=0.000297154, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=42063 epoch 027: 1455 / 1689 loss=3.687, nll_loss=2.152, ppl=4.45, wps=556477, ups=1.12, wpb=496269, bsz=16355.1, num_updates=45300, lr=0.000297154, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=42063 epoch 027: 1455 / 1689 loss=3.687, nll_loss=2.152, ppl=4.45, wps=556477, ups=1.12, wpb=496269, bsz=16355.1, num_updates=45300, lr=0.000297154, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=42063 epoch 027: 1455 / 1689 loss=3.687, nll_loss=2.152, ppl=4.45, wps=556477, ups=1.12, wpb=496269, bsz=16355.1, num_updates=45300, lr=0.000297154, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=42063 epoch 027: 1555 / 1689 loss=3.693, nll_loss=2.159, ppl=4.46, wps=552602, ups=1.12, wpb=494668, bsz=16408.2, num_updates=45400, lr=0.000296826, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=20, wall=42152 epoch 027: 1555 / 1689 loss=3.693, nll_loss=2.159, ppl=4.46, wps=552602, ups=1.12, wpb=494668, bsz=16408.2, num_updates=45400, lr=0.000296826, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=20, wall=42152 epoch 027: 1555 / 1689 loss=3.693, nll_loss=2.159, ppl=4.46, wps=552602, ups=1.12, wpb=494668, bsz=16408.2, num_updates=45400, lr=0.000296826, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=20, wall=42152 epoch 027: 1555 / 1689 loss=3.693, nll_loss=2.159, ppl=4.46, wps=552602, ups=1.12, wpb=494668, bsz=16408.2, num_updates=45400, lr=0.000296826, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=20, wall=42152 epoch 027: 1555 / 1689 loss=3.693, nll_loss=2.159, ppl=4.46, wps=552602, ups=1.12, wpb=494668, bsz=16408.2, num_updates=45400, lr=0.000296826, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=20, wall=42152 epoch 027: 1555 / 1689 loss=3.693, nll_loss=2.159, ppl=4.46, wps=552602, ups=1.12, wpb=494668, bsz=16408.2, num_updates=45400, lr=0.000296826, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=20, wall=42152 epoch 027: 1555 / 1689 loss=3.693, nll_loss=2.159, ppl=4.46, wps=552602, ups=1.12, wpb=494668, bsz=16408.2, num_updates=45400, lr=0.000296826, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=20, wall=42152 epoch 027: 1555 / 1689 loss=3.693, nll_loss=2.159, ppl=4.46, wps=552602, ups=1.12, wpb=494668, bsz=16408.2, num_updates=45400, lr=0.000296826, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=20, wall=42152 epoch 027: 1555 / 1689 loss=3.693, nll_loss=2.159, ppl=4.46, wps=552602, ups=1.12, wpb=494668, bsz=16408.2, num_updates=45400, lr=0.000296826, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=20, wall=42152 epoch 027: 1555 / 1689 loss=3.693, nll_loss=2.159, ppl=4.46, wps=552602, ups=1.12, wpb=494668, bsz=16408.2, num_updates=45400, lr=0.000296826, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=20, wall=42152 epoch 027: 1555 / 1689 loss=3.693, nll_loss=2.159, ppl=4.46, wps=552602, ups=1.12, wpb=494668, bsz=16408.2, num_updates=45400, lr=0.000296826, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=20, wall=42152 epoch 027: 1555 / 1689 loss=3.693, nll_loss=2.159, ppl=4.46, wps=552602, ups=1.12, wpb=494668, bsz=16408.2, num_updates=45400, lr=0.000296826, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=20, wall=42152 epoch 027: 1555 / 1689 loss=3.693, nll_loss=2.159, ppl=4.46, wps=552602, ups=1.12, wpb=494668, bsz=16408.2, num_updates=45400, lr=0.000296826, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=20, wall=42152 epoch 027: 1555 / 1689 loss=3.693, nll_loss=2.159, ppl=4.46, wps=552602, ups=1.12, wpb=494668, bsz=16408.2, num_updates=45400, lr=0.000296826, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=20, wall=42152 epoch 027: 1555 / 1689 loss=3.693, nll_loss=2.159, ppl=4.46, wps=552602, ups=1.12, wpb=494668, bsz=16408.2, num_updates=45400, lr=0.000296826, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=20, wall=42152 epoch 027: 1555 / 1689 loss=3.693, nll_loss=2.159, ppl=4.46, wps=552602, ups=1.12, wpb=494668, bsz=16408.2, num_updates=45400, lr=0.000296826, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=20, wall=42152 epoch 027: 1555 / 1689 loss=3.693, nll_loss=2.159, ppl=4.46, wps=552602, ups=1.12, wpb=494668, bsz=16408.2, num_updates=45400, lr=0.000296826, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=20, wall=42152 epoch 027: 1555 / 1689 loss=3.693, nll_loss=2.159, ppl=4.46, wps=552602, ups=1.12, wpb=494668, bsz=16408.2, num_updates=45400, lr=0.000296826, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=20, wall=42152 epoch 027: 1555 / 1689 loss=3.693, nll_loss=2.159, ppl=4.46, wps=552602, ups=1.12, wpb=494668, bsz=16408.2, num_updates=45400, lr=0.000296826, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=20, wall=42152 epoch 027: 1555 / 1689 loss=3.693, nll_loss=2.159, ppl=4.46, wps=552602, ups=1.12, wpb=494668, bsz=16408.2, num_updates=45400, lr=0.000296826, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=20, wall=42152 epoch 027: 1555 / 1689 loss=3.693, nll_loss=2.159, ppl=4.46, wps=552602, ups=1.12, wpb=494668, bsz=16408.2, num_updates=45400, lr=0.000296826, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=20, wall=42152 epoch 027: 1555 / 1689 loss=3.693, nll_loss=2.159, ppl=4.46, wps=552602, ups=1.12, wpb=494668, bsz=16408.2, num_updates=45400, lr=0.000296826, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=20, wall=42152 epoch 027: 1555 / 1689 loss=3.693, nll_loss=2.159, ppl=4.46, wps=552602, ups=1.12, wpb=494668, bsz=16408.2, num_updates=45400, lr=0.000296826, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=20, wall=42152 epoch 027: 1555 / 1689 loss=3.693, nll_loss=2.159, ppl=4.46, wps=552602, ups=1.12, wpb=494668, bsz=16408.2, num_updates=45400, lr=0.000296826, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=20, wall=42152 epoch 027: 1555 / 1689 loss=3.693, nll_loss=2.159, ppl=4.46, wps=552602, ups=1.12, wpb=494668, bsz=16408.2, num_updates=45400, lr=0.000296826, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=20, wall=42152 epoch 027: 1555 / 1689 loss=3.693, nll_loss=2.159, ppl=4.46, wps=552602, ups=1.12, wpb=494668, bsz=16408.2, num_updates=45400, lr=0.000296826, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=20, wall=42152 epoch 027: 1555 / 1689 loss=3.693, nll_loss=2.159, ppl=4.46, wps=552602, ups=1.12, wpb=494668, bsz=16408.2, num_updates=45400, lr=0.000296826, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=20, wall=42152 epoch 027: 1655 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=548240, ups=1.11, wpb=495668, bsz=16804, num_updates=45500, lr=0.0002965, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=42243 epoch 027: 1655 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=548240, ups=1.11, wpb=495668, bsz=16804, num_updates=45500, lr=0.0002965, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=42243 epoch 027: 1655 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=548240, ups=1.11, wpb=495668, bsz=16804, num_updates=45500, lr=0.0002965, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=42243 epoch 027: 1655 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=548240, ups=1.11, wpb=495668, bsz=16804, num_updates=45500, lr=0.0002965, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=42243 epoch 027: 1655 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=548240, ups=1.11, wpb=495668, bsz=16804, num_updates=45500, lr=0.0002965, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=42243 epoch 027: 1655 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=548240, ups=1.11, wpb=495668, bsz=16804, num_updates=45500, lr=0.0002965, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=42243 epoch 027: 1655 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=548240, ups=1.11, wpb=495668, bsz=16804, num_updates=45500, lr=0.0002965, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=42243 epoch 027: 1655 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=548240, ups=1.11, wpb=495668, bsz=16804, num_updates=45500, lr=0.0002965, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=42243 epoch 027: 1655 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=548240, ups=1.11, wpb=495668, bsz=16804, num_updates=45500, lr=0.0002965, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=42243 epoch 027: 1655 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=548240, ups=1.11, wpb=495668, bsz=16804, num_updates=45500, lr=0.0002965, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=42243 epoch 027: 1655 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=548240, ups=1.11, wpb=495668, bsz=16804, num_updates=45500, lr=0.0002965, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=42243 epoch 027: 1655 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=548240, ups=1.11, wpb=495668, bsz=16804, num_updates=45500, lr=0.0002965, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=42243 epoch 027: 1655 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=548240, ups=1.11, wpb=495668, bsz=16804, num_updates=45500, lr=0.0002965, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=42243 epoch 027: 1655 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=548240, ups=1.11, wpb=495668, bsz=16804, num_updates=45500, lr=0.0002965, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=42243 epoch 027: 1655 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=548240, ups=1.11, wpb=495668, bsz=16804, num_updates=45500, lr=0.0002965, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=42243 epoch 027: 1655 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=548240, ups=1.11, wpb=495668, bsz=16804, num_updates=45500, lr=0.0002965, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=42243 epoch 027: 1655 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=548240, ups=1.11, wpb=495668, bsz=16804, num_updates=45500, lr=0.0002965, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=42243 epoch 027: 1655 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=548240, ups=1.11, wpb=495668, bsz=16804, num_updates=45500, lr=0.0002965, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=42243 epoch 027: 1655 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=548240, ups=1.11, wpb=495668, bsz=16804, num_updates=45500, lr=0.0002965, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=42243 epoch 027: 1655 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=548240, ups=1.11, wpb=495668, bsz=16804, num_updates=45500, lr=0.0002965, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=42243 epoch 027: 1655 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=548240, ups=1.11, wpb=495668, bsz=16804, num_updates=45500, lr=0.0002965, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=42243 epoch 027: 1655 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=548240, ups=1.11, wpb=495668, bsz=16804, num_updates=45500, lr=0.0002965, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=42243 epoch 027: 1655 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=548240, ups=1.11, wpb=495668, bsz=16804, num_updates=45500, lr=0.0002965, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=42243 epoch 027: 1655 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=548240, ups=1.11, wpb=495668, bsz=16804, num_updates=45500, lr=0.0002965, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=42243 epoch 027: 1655 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=548240, ups=1.11, wpb=495668, bsz=16804, num_updates=45500, lr=0.0002965, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=42243 epoch 027: 1655 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=548240, ups=1.11, wpb=495668, bsz=16804, num_updates=45500, lr=0.0002965, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=42243 epoch 027: 1655 / 1689 loss=3.686, nll_loss=2.151, ppl=4.44, wps=548240, ups=1.11, wpb=495668, bsz=16804, num_updates=45500, lr=0.0002965, gnorm=0.17, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=42243 end of epoch 27 (average epoch stats below) epoch 027 | loss 3.684 | nll_loss 2.149 | ppl 4.43 | wps 532914 | ups 1.08 | wpb 495120 | bsz 16505.9 | num_updates 45534 | lr 0.000296389 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1498 | gb_free 23.1 | wall 42272 epoch 027 | loss 3.684 | nll_loss 2.149 | ppl 4.43 | wps 532914 | ups 1.08 | wpb 495120 | bsz 16505.9 | num_updates 45534 | lr 0.000296389 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1498 | gb_free 23.1 | wall 42272 epoch 027 | loss 3.684 | nll_loss 2.149 | ppl 4.43 | wps 532914 | ups 1.08 | wpb 495120 | bsz 16505.9 | num_updates 45534 | lr 0.000296389 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1498 | gb_free 23.1 | wall 42272 epoch 027 | loss 3.684 | nll_loss 2.149 | ppl 4.43 | wps 532914 | ups 1.08 | wpb 495120 | bsz 16505.9 | num_updates 45534 | lr 0.000296389 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1498 | gb_free 23.1 | wall 42272 epoch 027 | loss 3.684 | nll_loss 2.149 | ppl 4.43 | wps 532914 | ups 1.08 | wpb 495120 | bsz 16505.9 | num_updates 45534 | lr 0.000296389 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1498 | gb_free 23.1 | wall 42272 epoch 027 | loss 3.684 | nll_loss 2.149 | ppl 4.43 | wps 532914 | ups 1.08 | wpb 495120 | bsz 16505.9 | num_updates 45534 | lr 0.000296389 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1498 | gb_free 23.1 | wall 42272 epoch 027 | loss 3.684 | nll_loss 2.149 | ppl 4.43 | wps 532914 | ups 1.08 | wpb 495120 | bsz 16505.9 | num_updates 45534 | lr 0.000296389 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1498 | gb_free 23.1 | wall 42272 epoch 027 | loss 3.684 | nll_loss 2.149 | ppl 4.43 | wps 532914 | ups 1.08 | wpb 495120 | bsz 16505.9 | num_updates 45534 | lr 0.000296389 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1498 | gb_free 23.1 | wall 42272 epoch 027 | loss 3.684 | nll_loss 2.149 | ppl 4.43 | wps 532914 | ups 1.08 | wpb 495120 | bsz 16505.9 | num_updates 45534 | lr 0.000296389 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1498 | gb_free 23.1 | wall 42272 epoch 027 | loss 3.684 | nll_loss 2.149 | ppl 4.43 | wps 532914 | ups 1.08 | wpb 495120 | bsz 16505.9 | num_updates 45534 | lr 0.000296389 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1498 | gb_free 23.1 | wall 42272 epoch 027 | loss 3.684 | nll_loss 2.149 | ppl 4.43 | wps 532914 | ups 1.08 | wpb 495120 | bsz 16505.9 | num_updates 45534 | lr 0.000296389 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1498 | gb_free 23.1 | wall 42272 epoch 027 | loss 3.684 | nll_loss 2.149 | ppl 4.43 | wps 532914 | ups 1.08 | wpb 495120 | bsz 16505.9 | num_updates 45534 | lr 0.000296389 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1498 | gb_free 23.1 | wall 42272 epoch 027 | loss 3.684 | nll_loss 2.149 | ppl 4.43 | wps 532914 | ups 1.08 | wpb 495120 | bsz 16505.9 | num_updates 45534 | lr 0.000296389 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1498 | gb_free 23.1 | wall 42272 epoch 027 | loss 3.684 | nll_loss 2.149 | ppl 4.43 | wps 532914 | ups 1.08 | wpb 495120 | bsz 16505.9 | num_updates 45534 | lr 0.000296389 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1498 | gb_free 23.1 | wall 42272 epoch 027 | loss 3.684 | nll_loss 2.149 | ppl 4.43 | wps 532914 | ups 1.08 | wpb 495120 | bsz 16505.9 | num_updates 45534 | lr 0.000296389 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1498 | gb_free 23.1 | wall 42272 epoch 027 | loss 3.684 | nll_loss 2.149 | ppl 4.43 | wps 532914 | ups 1.08 | wpb 495120 | bsz 16505.9 | num_updates 45534 | lr 0.000296389 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1498 | gb_free 23.1 | wall 42272 epoch 027 | loss 3.684 | nll_loss 2.149 | ppl 4.43 | wps 532914 | ups 1.08 | wpb 495120 | bsz 16505.9 | num_updates 45534 | lr 0.000296389 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1498 | gb_free 23.1 | wall 42272 epoch 027 | loss 3.684 | nll_loss 2.149 | ppl 4.43 | wps 532914 | ups 1.08 | wpb 495120 | bsz 16505.9 | num_updates 45534 | lr 0.000296389 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1498 | gb_free 23.1 | wall 42272 epoch 027 | loss 3.684 | nll_loss 2.149 | ppl 4.43 | wps 532914 | ups 1.08 | wpb 495120 | bsz 16505.9 | num_updates 45534 | lr 0.000296389 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1498 | gb_free 23.1 | wall 42272 epoch 027 | loss 3.684 | nll_loss 2.149 | ppl 4.43 | wps 532914 | ups 1.08 | wpb 495120 | bsz 16505.9 | num_updates 45534 | lr 0.000296389 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1498 | gb_free 23.1 | wall 42272 epoch 027 | loss 3.684 | nll_loss 2.149 | ppl 4.43 | wps 532914 | ups 1.08 | wpb 495120 | bsz 16505.9 | num_updates 45534 | lr 0.000296389 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1498 | gb_free 23.1 | wall 42272 epoch 027 | loss 3.684 | nll_loss 2.149 | ppl 4.43 | wps 532914 | ups 1.08 | wpb 495120 | bsz 16505.9 | num_updates 45534 | lr 0.000296389 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1498 | gb_free 23.1 | wall 42272 epoch 027 | loss 3.684 | nll_loss 2.149 | ppl 4.43 | wps 532914 | ups 1.08 | wpb 495120 | bsz 16505.9 | num_updates 45534 | lr 0.000296389 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1498 | gb_free 23.1 | wall 42272 epoch 027 | loss 3.684 | nll_loss 2.149 | ppl 4.43 | wps 532914 | ups 1.08 | wpb 495120 | bsz 16505.9 | num_updates 45534 | lr 0.000296389 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1498 | gb_free 23.1 | wall 42272 epoch 027 | loss 3.684 | nll_loss 2.149 | ppl 4.43 | wps 532914 | ups 1.08 | wpb 495120 | bsz 16505.9 | num_updates 45534 | lr 0.000296389 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1498 | gb_free 23.1 | wall 42272 epoch 027 | loss 3.684 | nll_loss 2.149 | ppl 4.43 | wps 532914 | ups 1.08 | wpb 495120 | bsz 16505.9 | num_updates 45534 | lr 0.000296389 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1498 | gb_free 23.1 | wall 42272 epoch 027 | loss 3.684 | nll_loss 2.149 | ppl 4.43 | wps 532914 | ups 1.08 | wpb 495120 | bsz 16505.9 | num_updates 45534 | lr 0.000296389 | gnorm 0.167 | clip 0 | loss_scale 2 | train_wall 1498 | gb_free 23.1 | wall 42272 Start iterating over samples epoch 028: 66 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=541719, ups=1.1, wpb=490911, bsz=16644.7, num_updates=45600, lr=0.000296174, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=42333 epoch 028: 66 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=541719, ups=1.1, wpb=490911, bsz=16644.7, num_updates=45600, lr=0.000296174, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=42333 epoch 028: 66 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=541719, ups=1.1, wpb=490911, bsz=16644.7, num_updates=45600, lr=0.000296174, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=42333 epoch 028: 66 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=541719, ups=1.1, wpb=490911, bsz=16644.7, num_updates=45600, lr=0.000296174, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=42333 epoch 028: 66 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=541719, ups=1.1, wpb=490911, bsz=16644.7, num_updates=45600, lr=0.000296174, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=42333 epoch 028: 66 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=541719, ups=1.1, wpb=490911, bsz=16644.7, num_updates=45600, lr=0.000296174, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=42333 epoch 028: 66 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=541719, ups=1.1, wpb=490911, bsz=16644.7, num_updates=45600, lr=0.000296174, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=42333 epoch 028: 66 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=541719, ups=1.1, wpb=490911, bsz=16644.7, num_updates=45600, lr=0.000296174, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=42333 epoch 028: 66 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=541719, ups=1.1, wpb=490911, bsz=16644.7, num_updates=45600, lr=0.000296174, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=42333 epoch 028: 66 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=541719, ups=1.1, wpb=490911, bsz=16644.7, num_updates=45600, lr=0.000296174, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=42333 epoch 028: 66 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=541719, ups=1.1, wpb=490911, bsz=16644.7, num_updates=45600, lr=0.000296174, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=42333 epoch 028: 66 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=541719, ups=1.1, wpb=490911, bsz=16644.7, num_updates=45600, lr=0.000296174, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=42333 epoch 028: 66 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=541719, ups=1.1, wpb=490911, bsz=16644.7, num_updates=45600, lr=0.000296174, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=42333 epoch 028: 66 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=541719, ups=1.1, wpb=490911, bsz=16644.7, num_updates=45600, lr=0.000296174, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=42333 epoch 028: 66 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=541719, ups=1.1, wpb=490911, bsz=16644.7, num_updates=45600, lr=0.000296174, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=42333 epoch 028: 66 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=541719, ups=1.1, wpb=490911, bsz=16644.7, num_updates=45600, lr=0.000296174, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=42333 epoch 028: 66 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=541719, ups=1.1, wpb=490911, bsz=16644.7, num_updates=45600, lr=0.000296174, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=42333 epoch 028: 66 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=541719, ups=1.1, wpb=490911, bsz=16644.7, num_updates=45600, lr=0.000296174, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=42333 epoch 028: 66 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=541719, ups=1.1, wpb=490911, bsz=16644.7, num_updates=45600, lr=0.000296174, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=42333 epoch 028: 66 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=541719, ups=1.1, wpb=490911, bsz=16644.7, num_updates=45600, lr=0.000296174, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=42333 epoch 028: 66 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=541719, ups=1.1, wpb=490911, bsz=16644.7, num_updates=45600, lr=0.000296174, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=42333 epoch 028: 66 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=541719, ups=1.1, wpb=490911, bsz=16644.7, num_updates=45600, lr=0.000296174, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=42333 epoch 028: 66 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=541719, ups=1.1, wpb=490911, bsz=16644.7, num_updates=45600, lr=0.000296174, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=42333 epoch 028: 66 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=541719, ups=1.1, wpb=490911, bsz=16644.7, num_updates=45600, lr=0.000296174, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=42333 epoch 028: 66 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=541719, ups=1.1, wpb=490911, bsz=16644.7, num_updates=45600, lr=0.000296174, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=42333 epoch 028: 66 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=541719, ups=1.1, wpb=490911, bsz=16644.7, num_updates=45600, lr=0.000296174, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=42333 epoch 028: 66 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=541719, ups=1.1, wpb=490911, bsz=16644.7, num_updates=45600, lr=0.000296174, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=42333 epoch 028: 66 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=541719, ups=1.1, wpb=490911, bsz=16644.7, num_updates=45600, lr=0.000296174, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=42333 epoch 028: 166 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=553364, ups=1.11, wpb=497428, bsz=16306.6, num_updates=45700, lr=0.00029585, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=42423 epoch 028: 166 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=553364, ups=1.11, wpb=497428, bsz=16306.6, num_updates=45700, lr=0.00029585, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=42423 epoch 028: 166 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=553364, ups=1.11, wpb=497428, bsz=16306.6, num_updates=45700, lr=0.00029585, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=42423 epoch 028: 166 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=553364, ups=1.11, wpb=497428, bsz=16306.6, num_updates=45700, lr=0.00029585, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=42423 epoch 028: 166 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=553364, ups=1.11, wpb=497428, bsz=16306.6, num_updates=45700, lr=0.00029585, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=42423 epoch 028: 166 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=553364, ups=1.11, wpb=497428, bsz=16306.6, num_updates=45700, lr=0.00029585, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=42423 epoch 028: 166 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=553364, ups=1.11, wpb=497428, bsz=16306.6, num_updates=45700, lr=0.00029585, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=42423 epoch 028: 166 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=553364, ups=1.11, wpb=497428, bsz=16306.6, num_updates=45700, lr=0.00029585, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=42423 epoch 028: 166 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=553364, ups=1.11, wpb=497428, bsz=16306.6, num_updates=45700, lr=0.00029585, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=42423 epoch 028: 166 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=553364, ups=1.11, wpb=497428, bsz=16306.6, num_updates=45700, lr=0.00029585, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=42423 epoch 028: 166 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=553364, ups=1.11, wpb=497428, bsz=16306.6, num_updates=45700, lr=0.00029585, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=42423 epoch 028: 166 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=553364, ups=1.11, wpb=497428, bsz=16306.6, num_updates=45700, lr=0.00029585, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=42423 epoch 028: 166 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=553364, ups=1.11, wpb=497428, bsz=16306.6, num_updates=45700, lr=0.00029585, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=42423 epoch 028: 166 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=553364, ups=1.11, wpb=497428, bsz=16306.6, num_updates=45700, lr=0.00029585, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=42423 epoch 028: 166 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=553364, ups=1.11, wpb=497428, bsz=16306.6, num_updates=45700, lr=0.00029585, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=42423 epoch 028: 166 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=553364, ups=1.11, wpb=497428, bsz=16306.6, num_updates=45700, lr=0.00029585, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=42423 epoch 028: 166 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=553364, ups=1.11, wpb=497428, bsz=16306.6, num_updates=45700, lr=0.00029585, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=42423 epoch 028: 166 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=553364, ups=1.11, wpb=497428, bsz=16306.6, num_updates=45700, lr=0.00029585, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=42423 epoch 028: 166 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=553364, ups=1.11, wpb=497428, bsz=16306.6, num_updates=45700, lr=0.00029585, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=42423 epoch 028: 166 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=553364, ups=1.11, wpb=497428, bsz=16306.6, num_updates=45700, lr=0.00029585, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=42423 epoch 028: 166 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=553364, ups=1.11, wpb=497428, bsz=16306.6, num_updates=45700, lr=0.00029585, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=42423 epoch 028: 166 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=553364, ups=1.11, wpb=497428, bsz=16306.6, num_updates=45700, lr=0.00029585, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=42423 epoch 028: 166 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=553364, ups=1.11, wpb=497428, bsz=16306.6, num_updates=45700, lr=0.00029585, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=42423 epoch 028: 166 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=553364, ups=1.11, wpb=497428, bsz=16306.6, num_updates=45700, lr=0.00029585, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=42423 epoch 028: 166 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=553364, ups=1.11, wpb=497428, bsz=16306.6, num_updates=45700, lr=0.00029585, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=42423 epoch 028: 166 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=553364, ups=1.11, wpb=497428, bsz=16306.6, num_updates=45700, lr=0.00029585, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=42423 epoch 028: 166 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=553364, ups=1.11, wpb=497428, bsz=16306.6, num_updates=45700, lr=0.00029585, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=42423 epoch 028: 166 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=553364, ups=1.11, wpb=497428, bsz=16306.6, num_updates=45700, lr=0.00029585, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=42423 epoch 028: 266 / 1689 loss=3.676, nll_loss=2.138, ppl=4.4, wps=550543, ups=1.11, wpb=495028, bsz=16492.3, num_updates=45800, lr=0.000295527, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=42513 epoch 028: 266 / 1689 loss=3.676, nll_loss=2.138, ppl=4.4, wps=550543, ups=1.11, wpb=495028, bsz=16492.3, num_updates=45800, lr=0.000295527, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=42513 epoch 028: 266 / 1689 loss=3.676, nll_loss=2.138, ppl=4.4, wps=550543, ups=1.11, wpb=495028, bsz=16492.3, num_updates=45800, lr=0.000295527, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=42513 epoch 028: 266 / 1689 loss=3.676, nll_loss=2.138, ppl=4.4, wps=550543, ups=1.11, wpb=495028, bsz=16492.3, num_updates=45800, lr=0.000295527, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=42513 epoch 028: 266 / 1689 loss=3.676, nll_loss=2.138, ppl=4.4, wps=550543, ups=1.11, wpb=495028, bsz=16492.3, num_updates=45800, lr=0.000295527, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=42513 epoch 028: 266 / 1689 loss=3.676, nll_loss=2.138, ppl=4.4, wps=550543, ups=1.11, wpb=495028, bsz=16492.3, num_updates=45800, lr=0.000295527, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=42513 epoch 028: 266 / 1689 loss=3.676, nll_loss=2.138, ppl=4.4, wps=550543, ups=1.11, wpb=495028, bsz=16492.3, num_updates=45800, lr=0.000295527, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=42513 epoch 028: 266 / 1689 loss=3.676, nll_loss=2.138, ppl=4.4, wps=550543, ups=1.11, wpb=495028, bsz=16492.3, num_updates=45800, lr=0.000295527, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=42513 epoch 028: 266 / 1689 loss=3.676, nll_loss=2.138, ppl=4.4, wps=550543, ups=1.11, wpb=495028, bsz=16492.3, num_updates=45800, lr=0.000295527, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=42513 epoch 028: 266 / 1689 loss=3.676, nll_loss=2.138, ppl=4.4, wps=550543, ups=1.11, wpb=495028, bsz=16492.3, num_updates=45800, lr=0.000295527, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=42513 epoch 028: 266 / 1689 loss=3.676, nll_loss=2.138, ppl=4.4, wps=550543, ups=1.11, wpb=495028, bsz=16492.3, num_updates=45800, lr=0.000295527, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=42513 epoch 028: 266 / 1689 loss=3.676, nll_loss=2.138, ppl=4.4, wps=550543, ups=1.11, wpb=495028, bsz=16492.3, num_updates=45800, lr=0.000295527, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=42513 epoch 028: 266 / 1689 loss=3.676, nll_loss=2.138, ppl=4.4, wps=550543, ups=1.11, wpb=495028, bsz=16492.3, num_updates=45800, lr=0.000295527, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=42513 epoch 028: 266 / 1689 loss=3.676, nll_loss=2.138, ppl=4.4, wps=550543, ups=1.11, wpb=495028, bsz=16492.3, num_updates=45800, lr=0.000295527, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=42513 epoch 028: 266 / 1689 loss=3.676, nll_loss=2.138, ppl=4.4, wps=550543, ups=1.11, wpb=495028, bsz=16492.3, num_updates=45800, lr=0.000295527, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=42513 epoch 028: 266 / 1689 loss=3.676, nll_loss=2.138, ppl=4.4, wps=550543, ups=1.11, wpb=495028, bsz=16492.3, num_updates=45800, lr=0.000295527, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=42513 epoch 028: 266 / 1689 loss=3.676, nll_loss=2.138, ppl=4.4, wps=550543, ups=1.11, wpb=495028, bsz=16492.3, num_updates=45800, lr=0.000295527, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=42513 epoch 028: 266 / 1689 loss=3.676, nll_loss=2.138, ppl=4.4, wps=550543, ups=1.11, wpb=495028, bsz=16492.3, num_updates=45800, lr=0.000295527, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=42513 epoch 028: 266 / 1689 loss=3.676, nll_loss=2.138, ppl=4.4, wps=550543, ups=1.11, wpb=495028, bsz=16492.3, num_updates=45800, lr=0.000295527, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=42513 epoch 028: 266 / 1689 loss=3.676, nll_loss=2.138, ppl=4.4, wps=550543, ups=1.11, wpb=495028, bsz=16492.3, num_updates=45800, lr=0.000295527, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=42513 epoch 028: 266 / 1689 loss=3.676, nll_loss=2.138, ppl=4.4, wps=550543, ups=1.11, wpb=495028, bsz=16492.3, num_updates=45800, lr=0.000295527, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=42513 epoch 028: 266 / 1689 loss=3.676, nll_loss=2.138, ppl=4.4, wps=550543, ups=1.11, wpb=495028, bsz=16492.3, num_updates=45800, lr=0.000295527, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=42513 epoch 028: 266 / 1689 loss=3.676, nll_loss=2.138, ppl=4.4, wps=550543, ups=1.11, wpb=495028, bsz=16492.3, num_updates=45800, lr=0.000295527, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=42513 epoch 028: 266 / 1689 loss=3.676, nll_loss=2.138, ppl=4.4, wps=550543, ups=1.11, wpb=495028, bsz=16492.3, num_updates=45800, lr=0.000295527, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=42513 epoch 028: 266 / 1689 loss=3.676, nll_loss=2.138, ppl=4.4, wps=550543, ups=1.11, wpb=495028, bsz=16492.3, num_updates=45800, lr=0.000295527, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=42513 epoch 028: 266 / 1689 loss=3.676, nll_loss=2.138, ppl=4.4, wps=550543, ups=1.11, wpb=495028, bsz=16492.3, num_updates=45800, lr=0.000295527, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=42513 epoch 028: 266 / 1689 loss=3.676, nll_loss=2.138, ppl=4.4, wps=550543, ups=1.11, wpb=495028, bsz=16492.3, num_updates=45800, lr=0.000295527, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=42513 epoch 028: 266 / 1689 loss=3.676, nll_loss=2.138, ppl=4.4, wps=550543, ups=1.11, wpb=495028, bsz=16492.3, num_updates=45800, lr=0.000295527, gnorm=0.172, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=42513 epoch 028: 367 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=545108, ups=1.1, wpb=494556, bsz=16480.7, num_updates=45900, lr=0.000295205, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=42604 epoch 028: 367 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=545108, ups=1.1, wpb=494556, bsz=16480.7, num_updates=45900, lr=0.000295205, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=42604 epoch 028: 367 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=545108, ups=1.1, wpb=494556, bsz=16480.7, num_updates=45900, lr=0.000295205, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=42604 epoch 028: 367 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=545108, ups=1.1, wpb=494556, bsz=16480.7, num_updates=45900, lr=0.000295205, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=42604 epoch 028: 367 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=545108, ups=1.1, wpb=494556, bsz=16480.7, num_updates=45900, lr=0.000295205, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=42604 epoch 028: 367 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=545108, ups=1.1, wpb=494556, bsz=16480.7, num_updates=45900, lr=0.000295205, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=42604 epoch 028: 367 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=545108, ups=1.1, wpb=494556, bsz=16480.7, num_updates=45900, lr=0.000295205, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=42604 epoch 028: 367 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=545108, ups=1.1, wpb=494556, bsz=16480.7, num_updates=45900, lr=0.000295205, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=42604 epoch 028: 367 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=545108, ups=1.1, wpb=494556, bsz=16480.7, num_updates=45900, lr=0.000295205, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=42604 epoch 028: 367 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=545108, ups=1.1, wpb=494556, bsz=16480.7, num_updates=45900, lr=0.000295205, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=42604 epoch 028: 367 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=545108, ups=1.1, wpb=494556, bsz=16480.7, num_updates=45900, lr=0.000295205, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=42604 epoch 028: 367 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=545108, ups=1.1, wpb=494556, bsz=16480.7, num_updates=45900, lr=0.000295205, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=42604 epoch 028: 367 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=545108, ups=1.1, wpb=494556, bsz=16480.7, num_updates=45900, lr=0.000295205, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=42604 epoch 028: 367 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=545108, ups=1.1, wpb=494556, bsz=16480.7, num_updates=45900, lr=0.000295205, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=42604 epoch 028: 367 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=545108, ups=1.1, wpb=494556, bsz=16480.7, num_updates=45900, lr=0.000295205, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=42604 epoch 028: 367 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=545108, ups=1.1, wpb=494556, bsz=16480.7, num_updates=45900, lr=0.000295205, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=42604 epoch 028: 367 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=545108, ups=1.1, wpb=494556, bsz=16480.7, num_updates=45900, lr=0.000295205, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=42604 epoch 028: 367 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=545108, ups=1.1, wpb=494556, bsz=16480.7, num_updates=45900, lr=0.000295205, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=42604 epoch 028: 367 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=545108, ups=1.1, wpb=494556, bsz=16480.7, num_updates=45900, lr=0.000295205, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=42604 epoch 028: 367 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=545108, ups=1.1, wpb=494556, bsz=16480.7, num_updates=45900, lr=0.000295205, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=42604 epoch 028: 367 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=545108, ups=1.1, wpb=494556, bsz=16480.7, num_updates=45900, lr=0.000295205, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=42604 epoch 028: 367 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=545108, ups=1.1, wpb=494556, bsz=16480.7, num_updates=45900, lr=0.000295205, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=42604 epoch 028: 367 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=545108, ups=1.1, wpb=494556, bsz=16480.7, num_updates=45900, lr=0.000295205, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=42604 epoch 028: 367 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=545108, ups=1.1, wpb=494556, bsz=16480.7, num_updates=45900, lr=0.000295205, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=42604 epoch 028: 367 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=545108, ups=1.1, wpb=494556, bsz=16480.7, num_updates=45900, lr=0.000295205, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=42604 epoch 028: 367 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=545108, ups=1.1, wpb=494556, bsz=16480.7, num_updates=45900, lr=0.000295205, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=42604 epoch 028: 367 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=545108, ups=1.1, wpb=494556, bsz=16480.7, num_updates=45900, lr=0.000295205, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=42604 epoch 028: 367 / 1689 loss=3.679, nll_loss=2.142, ppl=4.41, wps=545108, ups=1.1, wpb=494556, bsz=16480.7, num_updates=45900, lr=0.000295205, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=42604 epoch 028: 467 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=547481, ups=1.1, wpb=496460, bsz=16696.2, num_updates=46000, lr=0.000294884, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=42695 epoch 028: 467 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=547481, ups=1.1, wpb=496460, bsz=16696.2, num_updates=46000, lr=0.000294884, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=42695 epoch 028: 467 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=547481, ups=1.1, wpb=496460, bsz=16696.2, num_updates=46000, lr=0.000294884, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=42695 epoch 028: 467 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=547481, ups=1.1, wpb=496460, bsz=16696.2, num_updates=46000, lr=0.000294884, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=42695 epoch 028: 467 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=547481, ups=1.1, wpb=496460, bsz=16696.2, num_updates=46000, lr=0.000294884, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=42695 epoch 028: 467 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=547481, ups=1.1, wpb=496460, bsz=16696.2, num_updates=46000, lr=0.000294884, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=42695 epoch 028: 467 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=547481, ups=1.1, wpb=496460, bsz=16696.2, num_updates=46000, lr=0.000294884, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=42695 epoch 028: 467 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=547481, ups=1.1, wpb=496460, bsz=16696.2, num_updates=46000, lr=0.000294884, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=42695 epoch 028: 467 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=547481, ups=1.1, wpb=496460, bsz=16696.2, num_updates=46000, lr=0.000294884, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=42695 epoch 028: 467 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=547481, ups=1.1, wpb=496460, bsz=16696.2, num_updates=46000, lr=0.000294884, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=42695 epoch 028: 467 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=547481, ups=1.1, wpb=496460, bsz=16696.2, num_updates=46000, lr=0.000294884, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=42695 epoch 028: 467 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=547481, ups=1.1, wpb=496460, bsz=16696.2, num_updates=46000, lr=0.000294884, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=42695 epoch 028: 467 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=547481, ups=1.1, wpb=496460, bsz=16696.2, num_updates=46000, lr=0.000294884, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=42695 epoch 028: 467 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=547481, ups=1.1, wpb=496460, bsz=16696.2, num_updates=46000, lr=0.000294884, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=42695 epoch 028: 467 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=547481, ups=1.1, wpb=496460, bsz=16696.2, num_updates=46000, lr=0.000294884, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=42695 epoch 028: 467 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=547481, ups=1.1, wpb=496460, bsz=16696.2, num_updates=46000, lr=0.000294884, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=42695 epoch 028: 467 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=547481, ups=1.1, wpb=496460, bsz=16696.2, num_updates=46000, lr=0.000294884, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=42695 epoch 028: 467 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=547481, ups=1.1, wpb=496460, bsz=16696.2, num_updates=46000, lr=0.000294884, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=42695 epoch 028: 467 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=547481, ups=1.1, wpb=496460, bsz=16696.2, num_updates=46000, lr=0.000294884, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=42695 epoch 028: 467 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=547481, ups=1.1, wpb=496460, bsz=16696.2, num_updates=46000, lr=0.000294884, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=42695 epoch 028: 467 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=547481, ups=1.1, wpb=496460, bsz=16696.2, num_updates=46000, lr=0.000294884, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=42695 epoch 028: 467 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=547481, ups=1.1, wpb=496460, bsz=16696.2, num_updates=46000, lr=0.000294884, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=42695 epoch 028: 467 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=547481, ups=1.1, wpb=496460, bsz=16696.2, num_updates=46000, lr=0.000294884, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=42695 epoch 028: 467 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=547481, ups=1.1, wpb=496460, bsz=16696.2, num_updates=46000, lr=0.000294884, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=42695 epoch 028: 467 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=547481, ups=1.1, wpb=496460, bsz=16696.2, num_updates=46000, lr=0.000294884, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=42695 epoch 028: 467 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=547481, ups=1.1, wpb=496460, bsz=16696.2, num_updates=46000, lr=0.000294884, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=42695 epoch 028: 467 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=547481, ups=1.1, wpb=496460, bsz=16696.2, num_updates=46000, lr=0.000294884, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=42695 epoch 028: 467 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=547481, ups=1.1, wpb=496460, bsz=16696.2, num_updates=46000, lr=0.000294884, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=42695 begin validation on "valid" subset epoch 028 | valid on 'valid' subset | loss 3.727 | nll_loss 2.17 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.709 epoch 028 | valid on 'valid' subset | loss 3.727 | nll_loss 2.17 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.709 epoch 028 | valid on 'valid' subset | loss 3.727 | nll_loss 2.17 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.709 epoch 028 | valid on 'valid' subset | loss 3.727 | nll_loss 2.17 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.709 epoch 028 | valid on 'valid' subset | loss 3.727 | nll_loss 2.17 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.709 epoch 028 | valid on 'valid' subset | loss 3.727 | nll_loss 2.17 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.709 epoch 028 | valid on 'valid' subset | loss 3.727 | nll_loss 2.17 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.709 epoch 028 | valid on 'valid' subset | loss 3.727 | nll_loss 2.17 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.709 epoch 028 | valid on 'valid' subset | loss 3.727 | nll_loss 2.17 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.709 epoch 028 | valid on 'valid' subset | loss 3.727 | nll_loss 2.17 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.709 epoch 028 | valid on 'valid' subset | loss 3.727 | nll_loss 2.17 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.709 epoch 028 | valid on 'valid' subset | loss 3.727 | nll_loss 2.17 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.709 epoch 028 | valid on 'valid' subset | loss 3.727 | nll_loss 2.17 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.709 epoch 028 | valid on 'valid' subset | loss 3.727 | nll_loss 2.17 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.709 epoch 028 | valid on 'valid' subset | loss 3.727 | nll_loss 2.17 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.709 epoch 028 | valid on 'valid' subset | loss 3.727 | nll_loss 2.17 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.709 epoch 028 | valid on 'valid' subset | loss 3.727 | nll_loss 2.17 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.709 epoch 028 | valid on 'valid' subset | loss 3.727 | nll_loss 2.17 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.709 epoch 028 | valid on 'valid' subset | loss 3.727 | nll_loss 2.17 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.709 epoch 028 | valid on 'valid' subset | loss 3.727 | nll_loss 2.17 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.709 epoch 028 | valid on 'valid' subset | loss 3.727 | nll_loss 2.17 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.709 epoch 028 | valid on 'valid' subset | loss 3.727 | nll_loss 2.17 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.709 epoch 028 | valid on 'valid' subset | loss 3.727 | nll_loss 2.17 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.709 epoch 028 | valid on 'valid' subset | loss 3.727 | nll_loss 2.17 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.709 epoch 028 | valid on 'valid' subset | loss 3.727 | nll_loss 2.17 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.709 epoch 028 | valid on 'valid' subset | loss 3.727 | nll_loss 2.17 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.709 epoch 028 | valid on 'valid' subset | loss 3.727 | nll_loss 2.17 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.709 epoch 028 | valid on 'valid' subset | loss 3.727 | nll_loss 2.17 | ppl 4.5 | wps 0 | wpb 44526 | bsz 2008 | num_updates 46000 | best_loss 3.709 epoch 028: 567 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=476764, ups=0.96, wpb=494300, bsz=17089.4, num_updates=46100, lr=0.000294564, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=42798 epoch 028: 567 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=476764, ups=0.96, wpb=494300, bsz=17089.4, num_updates=46100, lr=0.000294564, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=42798 epoch 028: 567 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=476764, ups=0.96, wpb=494300, bsz=17089.4, num_updates=46100, lr=0.000294564, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=42798 epoch 028: 567 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=476764, ups=0.96, wpb=494300, bsz=17089.4, num_updates=46100, lr=0.000294564, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=42798 epoch 028: 567 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=476764, ups=0.96, wpb=494300, bsz=17089.4, num_updates=46100, lr=0.000294564, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=42798 epoch 028: 567 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=476764, ups=0.96, wpb=494300, bsz=17089.4, num_updates=46100, lr=0.000294564, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=42798 epoch 028: 567 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=476764, ups=0.96, wpb=494300, bsz=17089.4, num_updates=46100, lr=0.000294564, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=42798 epoch 028: 567 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=476764, ups=0.96, wpb=494300, bsz=17089.4, num_updates=46100, lr=0.000294564, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=42798 epoch 028: 567 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=476764, ups=0.96, wpb=494300, bsz=17089.4, num_updates=46100, lr=0.000294564, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=42798 epoch 028: 567 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=476764, ups=0.96, wpb=494300, bsz=17089.4, num_updates=46100, lr=0.000294564, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=42798 epoch 028: 567 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=476764, ups=0.96, wpb=494300, bsz=17089.4, num_updates=46100, lr=0.000294564, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=42798 epoch 028: 567 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=476764, ups=0.96, wpb=494300, bsz=17089.4, num_updates=46100, lr=0.000294564, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=42798 epoch 028: 567 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=476764, ups=0.96, wpb=494300, bsz=17089.4, num_updates=46100, lr=0.000294564, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=42798 epoch 028: 567 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=476764, ups=0.96, wpb=494300, bsz=17089.4, num_updates=46100, lr=0.000294564, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=42798 epoch 028: 567 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=476764, ups=0.96, wpb=494300, bsz=17089.4, num_updates=46100, lr=0.000294564, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=42798 epoch 028: 567 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=476764, ups=0.96, wpb=494300, bsz=17089.4, num_updates=46100, lr=0.000294564, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=42798 epoch 028: 567 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=476764, ups=0.96, wpb=494300, bsz=17089.4, num_updates=46100, lr=0.000294564, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=42798 epoch 028: 567 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=476764, ups=0.96, wpb=494300, bsz=17089.4, num_updates=46100, lr=0.000294564, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=42798 epoch 028: 567 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=476764, ups=0.96, wpb=494300, bsz=17089.4, num_updates=46100, lr=0.000294564, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=42798 epoch 028: 567 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=476764, ups=0.96, wpb=494300, bsz=17089.4, num_updates=46100, lr=0.000294564, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=42798 epoch 028: 567 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=476764, ups=0.96, wpb=494300, bsz=17089.4, num_updates=46100, lr=0.000294564, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=42798 epoch 028: 567 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=476764, ups=0.96, wpb=494300, bsz=17089.4, num_updates=46100, lr=0.000294564, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=42798 epoch 028: 567 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=476764, ups=0.96, wpb=494300, bsz=17089.4, num_updates=46100, lr=0.000294564, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=42798 epoch 028: 567 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=476764, ups=0.96, wpb=494300, bsz=17089.4, num_updates=46100, lr=0.000294564, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=42798 epoch 028: 567 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=476764, ups=0.96, wpb=494300, bsz=17089.4, num_updates=46100, lr=0.000294564, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=42798 epoch 028: 567 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=476764, ups=0.96, wpb=494300, bsz=17089.4, num_updates=46100, lr=0.000294564, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=42798 epoch 028: 567 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=476764, ups=0.96, wpb=494300, bsz=17089.4, num_updates=46100, lr=0.000294564, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=42798 epoch 028: 567 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=476764, ups=0.96, wpb=494300, bsz=17089.4, num_updates=46100, lr=0.000294564, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=42798 epoch 028: 667 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=550294, ups=1.11, wpb=495755, bsz=16555.3, num_updates=46200, lr=0.000294245, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=42888 epoch 028: 667 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=550294, ups=1.11, wpb=495755, bsz=16555.3, num_updates=46200, lr=0.000294245, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=42888 epoch 028: 667 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=550294, ups=1.11, wpb=495755, bsz=16555.3, num_updates=46200, lr=0.000294245, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=42888 epoch 028: 667 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=550294, ups=1.11, wpb=495755, bsz=16555.3, num_updates=46200, lr=0.000294245, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=42888 epoch 028: 667 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=550294, ups=1.11, wpb=495755, bsz=16555.3, num_updates=46200, lr=0.000294245, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=42888 epoch 028: 667 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=550294, ups=1.11, wpb=495755, bsz=16555.3, num_updates=46200, lr=0.000294245, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=42888 epoch 028: 667 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=550294, ups=1.11, wpb=495755, bsz=16555.3, num_updates=46200, lr=0.000294245, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=42888 epoch 028: 667 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=550294, ups=1.11, wpb=495755, bsz=16555.3, num_updates=46200, lr=0.000294245, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=42888 epoch 028: 667 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=550294, ups=1.11, wpb=495755, bsz=16555.3, num_updates=46200, lr=0.000294245, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=42888 epoch 028: 667 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=550294, ups=1.11, wpb=495755, bsz=16555.3, num_updates=46200, lr=0.000294245, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=42888 epoch 028: 667 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=550294, ups=1.11, wpb=495755, bsz=16555.3, num_updates=46200, lr=0.000294245, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=42888 epoch 028: 667 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=550294, ups=1.11, wpb=495755, bsz=16555.3, num_updates=46200, lr=0.000294245, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=42888 epoch 028: 667 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=550294, ups=1.11, wpb=495755, bsz=16555.3, num_updates=46200, lr=0.000294245, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=42888 epoch 028: 667 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=550294, ups=1.11, wpb=495755, bsz=16555.3, num_updates=46200, lr=0.000294245, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=42888 epoch 028: 667 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=550294, ups=1.11, wpb=495755, bsz=16555.3, num_updates=46200, lr=0.000294245, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=42888 epoch 028: 667 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=550294, ups=1.11, wpb=495755, bsz=16555.3, num_updates=46200, lr=0.000294245, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=42888 epoch 028: 667 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=550294, ups=1.11, wpb=495755, bsz=16555.3, num_updates=46200, lr=0.000294245, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=42888 epoch 028: 667 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=550294, ups=1.11, wpb=495755, bsz=16555.3, num_updates=46200, lr=0.000294245, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=42888 epoch 028: 667 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=550294, ups=1.11, wpb=495755, bsz=16555.3, num_updates=46200, lr=0.000294245, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=42888 epoch 028: 667 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=550294, ups=1.11, wpb=495755, bsz=16555.3, num_updates=46200, lr=0.000294245, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=42888 epoch 028: 667 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=550294, ups=1.11, wpb=495755, bsz=16555.3, num_updates=46200, lr=0.000294245, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=42888 epoch 028: 667 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=550294, ups=1.11, wpb=495755, bsz=16555.3, num_updates=46200, lr=0.000294245, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=42888 epoch 028: 667 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=550294, ups=1.11, wpb=495755, bsz=16555.3, num_updates=46200, lr=0.000294245, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=42888 epoch 028: 667 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=550294, ups=1.11, wpb=495755, bsz=16555.3, num_updates=46200, lr=0.000294245, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=42888 epoch 028: 667 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=550294, ups=1.11, wpb=495755, bsz=16555.3, num_updates=46200, lr=0.000294245, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=42888 epoch 028: 667 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=550294, ups=1.11, wpb=495755, bsz=16555.3, num_updates=46200, lr=0.000294245, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=42888 epoch 028: 667 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=550294, ups=1.11, wpb=495755, bsz=16555.3, num_updates=46200, lr=0.000294245, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=42888 epoch 028: 667 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=550294, ups=1.11, wpb=495755, bsz=16555.3, num_updates=46200, lr=0.000294245, gnorm=0.177, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=42888 epoch 028: 767 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=551139, ups=1.11, wpb=496659, bsz=16416.1, num_updates=46300, lr=0.000293927, gnorm=0.156, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=42979 epoch 028: 767 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=551139, ups=1.11, wpb=496659, bsz=16416.1, num_updates=46300, lr=0.000293927, gnorm=0.156, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=42979 epoch 028: 767 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=551139, ups=1.11, wpb=496659, bsz=16416.1, num_updates=46300, lr=0.000293927, gnorm=0.156, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=42979 epoch 028: 767 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=551139, ups=1.11, wpb=496659, bsz=16416.1, num_updates=46300, lr=0.000293927, gnorm=0.156, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=42979 epoch 028: 767 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=551139, ups=1.11, wpb=496659, bsz=16416.1, num_updates=46300, lr=0.000293927, gnorm=0.156, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=42979 epoch 028: 767 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=551139, ups=1.11, wpb=496659, bsz=16416.1, num_updates=46300, lr=0.000293927, gnorm=0.156, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=42979 epoch 028: 767 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=551139, ups=1.11, wpb=496659, bsz=16416.1, num_updates=46300, lr=0.000293927, gnorm=0.156, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=42979 epoch 028: 767 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=551139, ups=1.11, wpb=496659, bsz=16416.1, num_updates=46300, lr=0.000293927, gnorm=0.156, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=42979 epoch 028: 767 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=551139, ups=1.11, wpb=496659, bsz=16416.1, num_updates=46300, lr=0.000293927, gnorm=0.156, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=42979 epoch 028: 767 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=551139, ups=1.11, wpb=496659, bsz=16416.1, num_updates=46300, lr=0.000293927, gnorm=0.156, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=42979 epoch 028: 767 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=551139, ups=1.11, wpb=496659, bsz=16416.1, num_updates=46300, lr=0.000293927, gnorm=0.156, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=42979 epoch 028: 767 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=551139, ups=1.11, wpb=496659, bsz=16416.1, num_updates=46300, lr=0.000293927, gnorm=0.156, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=42979 epoch 028: 767 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=551139, ups=1.11, wpb=496659, bsz=16416.1, num_updates=46300, lr=0.000293927, gnorm=0.156, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=42979 epoch 028: 767 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=551139, ups=1.11, wpb=496659, bsz=16416.1, num_updates=46300, lr=0.000293927, gnorm=0.156, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=42979 epoch 028: 767 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=551139, ups=1.11, wpb=496659, bsz=16416.1, num_updates=46300, lr=0.000293927, gnorm=0.156, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=42979 epoch 028: 767 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=551139, ups=1.11, wpb=496659, bsz=16416.1, num_updates=46300, lr=0.000293927, gnorm=0.156, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=42979 epoch 028: 767 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=551139, ups=1.11, wpb=496659, bsz=16416.1, num_updates=46300, lr=0.000293927, gnorm=0.156, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=42979 epoch 028: 767 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=551139, ups=1.11, wpb=496659, bsz=16416.1, num_updates=46300, lr=0.000293927, gnorm=0.156, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=42979 epoch 028: 767 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=551139, ups=1.11, wpb=496659, bsz=16416.1, num_updates=46300, lr=0.000293927, gnorm=0.156, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=42979 epoch 028: 767 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=551139, ups=1.11, wpb=496659, bsz=16416.1, num_updates=46300, lr=0.000293927, gnorm=0.156, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=42979 epoch 028: 767 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=551139, ups=1.11, wpb=496659, bsz=16416.1, num_updates=46300, lr=0.000293927, gnorm=0.156, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=42979 epoch 028: 767 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=551139, ups=1.11, wpb=496659, bsz=16416.1, num_updates=46300, lr=0.000293927, gnorm=0.156, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=42979 epoch 028: 767 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=551139, ups=1.11, wpb=496659, bsz=16416.1, num_updates=46300, lr=0.000293927, gnorm=0.156, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=42979 epoch 028: 767 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=551139, ups=1.11, wpb=496659, bsz=16416.1, num_updates=46300, lr=0.000293927, gnorm=0.156, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=42979 epoch 028: 767 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=551139, ups=1.11, wpb=496659, bsz=16416.1, num_updates=46300, lr=0.000293927, gnorm=0.156, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=42979 epoch 028: 767 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=551139, ups=1.11, wpb=496659, bsz=16416.1, num_updates=46300, lr=0.000293927, gnorm=0.156, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=42979 epoch 028: 767 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=551139, ups=1.11, wpb=496659, bsz=16416.1, num_updates=46300, lr=0.000293927, gnorm=0.156, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=42979 epoch 028: 767 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=551139, ups=1.11, wpb=496659, bsz=16416.1, num_updates=46300, lr=0.000293927, gnorm=0.156, clip=0, loss_scale=2, train_wall=89, gb_free=20.9, wall=42979 epoch 028: 867 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=551874, ups=1.11, wpb=496118, bsz=16491.4, num_updates=46400, lr=0.00029361, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=22.6, wall=43068 epoch 028: 867 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=551874, ups=1.11, wpb=496118, bsz=16491.4, num_updates=46400, lr=0.00029361, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=22.6, wall=43068 epoch 028: 867 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=551874, ups=1.11, wpb=496118, bsz=16491.4, num_updates=46400, lr=0.00029361, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=22.6, wall=43068 epoch 028: 867 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=551874, ups=1.11, wpb=496118, bsz=16491.4, num_updates=46400, lr=0.00029361, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=22.6, wall=43068 epoch 028: 867 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=551874, ups=1.11, wpb=496118, bsz=16491.4, num_updates=46400, lr=0.00029361, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=22.6, wall=43068 epoch 028: 867 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=551874, ups=1.11, wpb=496118, bsz=16491.4, num_updates=46400, lr=0.00029361, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=22.6, wall=43068 epoch 028: 867 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=551874, ups=1.11, wpb=496118, bsz=16491.4, num_updates=46400, lr=0.00029361, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=22.6, wall=43068 epoch 028: 867 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=551874, ups=1.11, wpb=496118, bsz=16491.4, num_updates=46400, lr=0.00029361, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=22.6, wall=43068 epoch 028: 867 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=551874, ups=1.11, wpb=496118, bsz=16491.4, num_updates=46400, lr=0.00029361, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=22.6, wall=43068 epoch 028: 867 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=551874, ups=1.11, wpb=496118, bsz=16491.4, num_updates=46400, lr=0.00029361, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=22.6, wall=43068 epoch 028: 867 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=551874, ups=1.11, wpb=496118, bsz=16491.4, num_updates=46400, lr=0.00029361, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=22.6, wall=43068 epoch 028: 867 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=551874, ups=1.11, wpb=496118, bsz=16491.4, num_updates=46400, lr=0.00029361, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=22.6, wall=43068 epoch 028: 867 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=551874, ups=1.11, wpb=496118, bsz=16491.4, num_updates=46400, lr=0.00029361, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=22.6, wall=43068 epoch 028: 867 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=551874, ups=1.11, wpb=496118, bsz=16491.4, num_updates=46400, lr=0.00029361, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=22.6, wall=43068 epoch 028: 867 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=551874, ups=1.11, wpb=496118, bsz=16491.4, num_updates=46400, lr=0.00029361, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=22.6, wall=43068 epoch 028: 867 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=551874, ups=1.11, wpb=496118, bsz=16491.4, num_updates=46400, lr=0.00029361, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=22.6, wall=43068 epoch 028: 867 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=551874, ups=1.11, wpb=496118, bsz=16491.4, num_updates=46400, lr=0.00029361, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=22.6, wall=43068 epoch 028: 867 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=551874, ups=1.11, wpb=496118, bsz=16491.4, num_updates=46400, lr=0.00029361, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=22.6, wall=43068 epoch 028: 867 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=551874, ups=1.11, wpb=496118, bsz=16491.4, num_updates=46400, lr=0.00029361, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=22.6, wall=43068 epoch 028: 867 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=551874, ups=1.11, wpb=496118, bsz=16491.4, num_updates=46400, lr=0.00029361, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=22.6, wall=43068 epoch 028: 867 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=551874, ups=1.11, wpb=496118, bsz=16491.4, num_updates=46400, lr=0.00029361, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=22.6, wall=43068 epoch 028: 867 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=551874, ups=1.11, wpb=496118, bsz=16491.4, num_updates=46400, lr=0.00029361, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=22.6, wall=43068 epoch 028: 867 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=551874, ups=1.11, wpb=496118, bsz=16491.4, num_updates=46400, lr=0.00029361, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=22.6, wall=43068 epoch 028: 867 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=551874, ups=1.11, wpb=496118, bsz=16491.4, num_updates=46400, lr=0.00029361, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=22.6, wall=43068 epoch 028: 867 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=551874, ups=1.11, wpb=496118, bsz=16491.4, num_updates=46400, lr=0.00029361, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=22.6, wall=43068 epoch 028: 867 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=551874, ups=1.11, wpb=496118, bsz=16491.4, num_updates=46400, lr=0.00029361, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=22.6, wall=43068 epoch 028: 867 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=551874, ups=1.11, wpb=496118, bsz=16491.4, num_updates=46400, lr=0.00029361, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=22.6, wall=43068 epoch 028: 867 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=551874, ups=1.11, wpb=496118, bsz=16491.4, num_updates=46400, lr=0.00029361, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=22.6, wall=43068 epoch 028: 967 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=550172, ups=1.11, wpb=495111, bsz=16363.5, num_updates=46500, lr=0.000293294, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=20.7, wall=43158 epoch 028: 967 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=550172, ups=1.11, wpb=495111, bsz=16363.5, num_updates=46500, lr=0.000293294, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=20.7, wall=43158 epoch 028: 967 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=550172, ups=1.11, wpb=495111, bsz=16363.5, num_updates=46500, lr=0.000293294, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=20.7, wall=43158 epoch 028: 967 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=550172, ups=1.11, wpb=495111, bsz=16363.5, num_updates=46500, lr=0.000293294, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=20.7, wall=43158 epoch 028: 967 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=550172, ups=1.11, wpb=495111, bsz=16363.5, num_updates=46500, lr=0.000293294, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=20.7, wall=43158 epoch 028: 967 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=550172, ups=1.11, wpb=495111, bsz=16363.5, num_updates=46500, lr=0.000293294, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=20.7, wall=43158 epoch 028: 967 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=550172, ups=1.11, wpb=495111, bsz=16363.5, num_updates=46500, lr=0.000293294, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=20.7, wall=43158 epoch 028: 967 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=550172, ups=1.11, wpb=495111, bsz=16363.5, num_updates=46500, lr=0.000293294, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=20.7, wall=43158 epoch 028: 967 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=550172, ups=1.11, wpb=495111, bsz=16363.5, num_updates=46500, lr=0.000293294, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=20.7, wall=43158 epoch 028: 967 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=550172, ups=1.11, wpb=495111, bsz=16363.5, num_updates=46500, lr=0.000293294, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=20.7, wall=43158 epoch 028: 967 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=550172, ups=1.11, wpb=495111, bsz=16363.5, num_updates=46500, lr=0.000293294, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=20.7, wall=43158 epoch 028: 967 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=550172, ups=1.11, wpb=495111, bsz=16363.5, num_updates=46500, lr=0.000293294, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=20.7, wall=43158 epoch 028: 967 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=550172, ups=1.11, wpb=495111, bsz=16363.5, num_updates=46500, lr=0.000293294, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=20.7, wall=43158 epoch 028: 967 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=550172, ups=1.11, wpb=495111, bsz=16363.5, num_updates=46500, lr=0.000293294, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=20.7, wall=43158 epoch 028: 967 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=550172, ups=1.11, wpb=495111, bsz=16363.5, num_updates=46500, lr=0.000293294, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=20.7, wall=43158 epoch 028: 967 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=550172, ups=1.11, wpb=495111, bsz=16363.5, num_updates=46500, lr=0.000293294, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=20.7, wall=43158 epoch 028: 967 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=550172, ups=1.11, wpb=495111, bsz=16363.5, num_updates=46500, lr=0.000293294, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=20.7, wall=43158 epoch 028: 967 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=550172, ups=1.11, wpb=495111, bsz=16363.5, num_updates=46500, lr=0.000293294, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=20.7, wall=43158 epoch 028: 967 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=550172, ups=1.11, wpb=495111, bsz=16363.5, num_updates=46500, lr=0.000293294, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=20.7, wall=43158 epoch 028: 967 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=550172, ups=1.11, wpb=495111, bsz=16363.5, num_updates=46500, lr=0.000293294, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=20.7, wall=43158 epoch 028: 967 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=550172, ups=1.11, wpb=495111, bsz=16363.5, num_updates=46500, lr=0.000293294, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=20.7, wall=43158 epoch 028: 967 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=550172, ups=1.11, wpb=495111, bsz=16363.5, num_updates=46500, lr=0.000293294, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=20.7, wall=43158 epoch 028: 967 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=550172, ups=1.11, wpb=495111, bsz=16363.5, num_updates=46500, lr=0.000293294, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=20.7, wall=43158 epoch 028: 967 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=550172, ups=1.11, wpb=495111, bsz=16363.5, num_updates=46500, lr=0.000293294, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=20.7, wall=43158 epoch 028: 967 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=550172, ups=1.11, wpb=495111, bsz=16363.5, num_updates=46500, lr=0.000293294, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=20.7, wall=43158 epoch 028: 967 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=550172, ups=1.11, wpb=495111, bsz=16363.5, num_updates=46500, lr=0.000293294, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=20.7, wall=43158 epoch 028: 967 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=550172, ups=1.11, wpb=495111, bsz=16363.5, num_updates=46500, lr=0.000293294, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=20.7, wall=43158 epoch 028: 967 / 1689 loss=3.686, nll_loss=2.15, ppl=4.44, wps=550172, ups=1.11, wpb=495111, bsz=16363.5, num_updates=46500, lr=0.000293294, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=20.7, wall=43158 epoch 028: 1067 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=549067, ups=1.11, wpb=496123, bsz=16220.1, num_updates=46600, lr=0.000292979, gnorm=0.166, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=43249 epoch 028: 1067 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=549067, ups=1.11, wpb=496123, bsz=16220.1, num_updates=46600, lr=0.000292979, gnorm=0.166, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=43249 epoch 028: 1067 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=549067, ups=1.11, wpb=496123, bsz=16220.1, num_updates=46600, lr=0.000292979, gnorm=0.166, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=43249 epoch 028: 1067 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=549067, ups=1.11, wpb=496123, bsz=16220.1, num_updates=46600, lr=0.000292979, gnorm=0.166, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=43249 epoch 028: 1067 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=549067, ups=1.11, wpb=496123, bsz=16220.1, num_updates=46600, lr=0.000292979, gnorm=0.166, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=43249 epoch 028: 1067 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=549067, ups=1.11, wpb=496123, bsz=16220.1, num_updates=46600, lr=0.000292979, gnorm=0.166, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=43249 epoch 028: 1067 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=549067, ups=1.11, wpb=496123, bsz=16220.1, num_updates=46600, lr=0.000292979, gnorm=0.166, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=43249 epoch 028: 1067 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=549067, ups=1.11, wpb=496123, bsz=16220.1, num_updates=46600, lr=0.000292979, gnorm=0.166, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=43249 epoch 028: 1067 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=549067, ups=1.11, wpb=496123, bsz=16220.1, num_updates=46600, lr=0.000292979, gnorm=0.166, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=43249 epoch 028: 1067 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=549067, ups=1.11, wpb=496123, bsz=16220.1, num_updates=46600, lr=0.000292979, gnorm=0.166, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=43249 epoch 028: 1067 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=549067, ups=1.11, wpb=496123, bsz=16220.1, num_updates=46600, lr=0.000292979, gnorm=0.166, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=43249 epoch 028: 1067 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=549067, ups=1.11, wpb=496123, bsz=16220.1, num_updates=46600, lr=0.000292979, gnorm=0.166, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=43249 epoch 028: 1067 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=549067, ups=1.11, wpb=496123, bsz=16220.1, num_updates=46600, lr=0.000292979, gnorm=0.166, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=43249 epoch 028: 1067 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=549067, ups=1.11, wpb=496123, bsz=16220.1, num_updates=46600, lr=0.000292979, gnorm=0.166, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=43249 epoch 028: 1067 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=549067, ups=1.11, wpb=496123, bsz=16220.1, num_updates=46600, lr=0.000292979, gnorm=0.166, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=43249 epoch 028: 1067 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=549067, ups=1.11, wpb=496123, bsz=16220.1, num_updates=46600, lr=0.000292979, gnorm=0.166, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=43249 epoch 028: 1067 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=549067, ups=1.11, wpb=496123, bsz=16220.1, num_updates=46600, lr=0.000292979, gnorm=0.166, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=43249 epoch 028: 1067 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=549067, ups=1.11, wpb=496123, bsz=16220.1, num_updates=46600, lr=0.000292979, gnorm=0.166, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=43249 epoch 028: 1067 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=549067, ups=1.11, wpb=496123, bsz=16220.1, num_updates=46600, lr=0.000292979, gnorm=0.166, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=43249 epoch 028: 1067 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=549067, ups=1.11, wpb=496123, bsz=16220.1, num_updates=46600, lr=0.000292979, gnorm=0.166, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=43249 epoch 028: 1067 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=549067, ups=1.11, wpb=496123, bsz=16220.1, num_updates=46600, lr=0.000292979, gnorm=0.166, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=43249 epoch 028: 1067 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=549067, ups=1.11, wpb=496123, bsz=16220.1, num_updates=46600, lr=0.000292979, gnorm=0.166, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=43249 epoch 028: 1067 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=549067, ups=1.11, wpb=496123, bsz=16220.1, num_updates=46600, lr=0.000292979, gnorm=0.166, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=43249 epoch 028: 1067 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=549067, ups=1.11, wpb=496123, bsz=16220.1, num_updates=46600, lr=0.000292979, gnorm=0.166, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=43249 epoch 028: 1067 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=549067, ups=1.11, wpb=496123, bsz=16220.1, num_updates=46600, lr=0.000292979, gnorm=0.166, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=43249 epoch 028: 1067 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=549067, ups=1.11, wpb=496123, bsz=16220.1, num_updates=46600, lr=0.000292979, gnorm=0.166, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=43249 epoch 028: 1067 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=549067, ups=1.11, wpb=496123, bsz=16220.1, num_updates=46600, lr=0.000292979, gnorm=0.166, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=43249 epoch 028: 1067 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=549067, ups=1.11, wpb=496123, bsz=16220.1, num_updates=46600, lr=0.000292979, gnorm=0.166, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=43249 epoch 028: 1167 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=550492, ups=1.11, wpb=496889, bsz=16300.3, num_updates=46700, lr=0.000292666, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=20.4, wall=43339 epoch 028: 1167 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=550492, ups=1.11, wpb=496889, bsz=16300.3, num_updates=46700, lr=0.000292666, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=20.4, wall=43339 epoch 028: 1167 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=550492, ups=1.11, wpb=496889, bsz=16300.3, num_updates=46700, lr=0.000292666, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=20.4, wall=43339 epoch 028: 1167 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=550492, ups=1.11, wpb=496889, bsz=16300.3, num_updates=46700, lr=0.000292666, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=20.4, wall=43339 epoch 028: 1167 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=550492, ups=1.11, wpb=496889, bsz=16300.3, num_updates=46700, lr=0.000292666, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=20.4, wall=43339 epoch 028: 1167 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=550492, ups=1.11, wpb=496889, bsz=16300.3, num_updates=46700, lr=0.000292666, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=20.4, wall=43339 epoch 028: 1167 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=550492, ups=1.11, wpb=496889, bsz=16300.3, num_updates=46700, lr=0.000292666, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=20.4, wall=43339 epoch 028: 1167 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=550492, ups=1.11, wpb=496889, bsz=16300.3, num_updates=46700, lr=0.000292666, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=20.4, wall=43339 epoch 028: 1167 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=550492, ups=1.11, wpb=496889, bsz=16300.3, num_updates=46700, lr=0.000292666, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=20.4, wall=43339 epoch 028: 1167 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=550492, ups=1.11, wpb=496889, bsz=16300.3, num_updates=46700, lr=0.000292666, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=20.4, wall=43339 epoch 028: 1167 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=550492, ups=1.11, wpb=496889, bsz=16300.3, num_updates=46700, lr=0.000292666, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=20.4, wall=43339 epoch 028: 1167 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=550492, ups=1.11, wpb=496889, bsz=16300.3, num_updates=46700, lr=0.000292666, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=20.4, wall=43339 epoch 028: 1167 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=550492, ups=1.11, wpb=496889, bsz=16300.3, num_updates=46700, lr=0.000292666, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=20.4, wall=43339 epoch 028: 1167 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=550492, ups=1.11, wpb=496889, bsz=16300.3, num_updates=46700, lr=0.000292666, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=20.4, wall=43339 epoch 028: 1167 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=550492, ups=1.11, wpb=496889, bsz=16300.3, num_updates=46700, lr=0.000292666, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=20.4, wall=43339 epoch 028: 1167 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=550492, ups=1.11, wpb=496889, bsz=16300.3, num_updates=46700, lr=0.000292666, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=20.4, wall=43339 epoch 028: 1167 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=550492, ups=1.11, wpb=496889, bsz=16300.3, num_updates=46700, lr=0.000292666, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=20.4, wall=43339 epoch 028: 1167 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=550492, ups=1.11, wpb=496889, bsz=16300.3, num_updates=46700, lr=0.000292666, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=20.4, wall=43339 epoch 028: 1167 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=550492, ups=1.11, wpb=496889, bsz=16300.3, num_updates=46700, lr=0.000292666, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=20.4, wall=43339 epoch 028: 1167 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=550492, ups=1.11, wpb=496889, bsz=16300.3, num_updates=46700, lr=0.000292666, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=20.4, wall=43339 epoch 028: 1167 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=550492, ups=1.11, wpb=496889, bsz=16300.3, num_updates=46700, lr=0.000292666, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=20.4, wall=43339 epoch 028: 1167 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=550492, ups=1.11, wpb=496889, bsz=16300.3, num_updates=46700, lr=0.000292666, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=20.4, wall=43339 epoch 028: 1167 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=550492, ups=1.11, wpb=496889, bsz=16300.3, num_updates=46700, lr=0.000292666, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=20.4, wall=43339 epoch 028: 1167 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=550492, ups=1.11, wpb=496889, bsz=16300.3, num_updates=46700, lr=0.000292666, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=20.4, wall=43339 epoch 028: 1167 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=550492, ups=1.11, wpb=496889, bsz=16300.3, num_updates=46700, lr=0.000292666, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=20.4, wall=43339 epoch 028: 1167 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=550492, ups=1.11, wpb=496889, bsz=16300.3, num_updates=46700, lr=0.000292666, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=20.4, wall=43339 epoch 028: 1167 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=550492, ups=1.11, wpb=496889, bsz=16300.3, num_updates=46700, lr=0.000292666, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=20.4, wall=43339 epoch 028: 1167 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=550492, ups=1.11, wpb=496889, bsz=16300.3, num_updates=46700, lr=0.000292666, gnorm=0.179, clip=0, loss_scale=4, train_wall=88, gb_free=20.4, wall=43339 epoch 028: 1268 / 1689 loss=3.684, nll_loss=2.149, ppl=4.43, wps=541647, ups=1.1, wpb=494584, bsz=16246.5, num_updates=46800, lr=0.000292353, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=43430 epoch 028: 1268 / 1689 loss=3.684, nll_loss=2.149, ppl=4.43, wps=541647, ups=1.1, wpb=494584, bsz=16246.5, num_updates=46800, lr=0.000292353, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=43430 epoch 028: 1268 / 1689 loss=3.684, nll_loss=2.149, ppl=4.43, wps=541647, ups=1.1, wpb=494584, bsz=16246.5, num_updates=46800, lr=0.000292353, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=43430 epoch 028: 1268 / 1689 loss=3.684, nll_loss=2.149, ppl=4.43, wps=541647, ups=1.1, wpb=494584, bsz=16246.5, num_updates=46800, lr=0.000292353, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=43430 epoch 028: 1268 / 1689 loss=3.684, nll_loss=2.149, ppl=4.43, wps=541647, ups=1.1, wpb=494584, bsz=16246.5, num_updates=46800, lr=0.000292353, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=43430 epoch 028: 1268 / 1689 loss=3.684, nll_loss=2.149, ppl=4.43, wps=541647, ups=1.1, wpb=494584, bsz=16246.5, num_updates=46800, lr=0.000292353, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=43430 epoch 028: 1268 / 1689 loss=3.684, nll_loss=2.149, ppl=4.43, wps=541647, ups=1.1, wpb=494584, bsz=16246.5, num_updates=46800, lr=0.000292353, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=43430 epoch 028: 1268 / 1689 loss=3.684, nll_loss=2.149, ppl=4.43, wps=541647, ups=1.1, wpb=494584, bsz=16246.5, num_updates=46800, lr=0.000292353, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=43430 epoch 028: 1268 / 1689 loss=3.684, nll_loss=2.149, ppl=4.43, wps=541647, ups=1.1, wpb=494584, bsz=16246.5, num_updates=46800, lr=0.000292353, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=43430 epoch 028: 1268 / 1689 loss=3.684, nll_loss=2.149, ppl=4.43, wps=541647, ups=1.1, wpb=494584, bsz=16246.5, num_updates=46800, lr=0.000292353, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=43430 epoch 028: 1268 / 1689 loss=3.684, nll_loss=2.149, ppl=4.43, wps=541647, ups=1.1, wpb=494584, bsz=16246.5, num_updates=46800, lr=0.000292353, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=43430 epoch 028: 1268 / 1689 loss=3.684, nll_loss=2.149, ppl=4.43, wps=541647, ups=1.1, wpb=494584, bsz=16246.5, num_updates=46800, lr=0.000292353, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=43430 epoch 028: 1268 / 1689 loss=3.684, nll_loss=2.149, ppl=4.43, wps=541647, ups=1.1, wpb=494584, bsz=16246.5, num_updates=46800, lr=0.000292353, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=43430 epoch 028: 1268 / 1689 loss=3.684, nll_loss=2.149, ppl=4.43, wps=541647, ups=1.1, wpb=494584, bsz=16246.5, num_updates=46800, lr=0.000292353, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=43430 epoch 028: 1268 / 1689 loss=3.684, nll_loss=2.149, ppl=4.43, wps=541647, ups=1.1, wpb=494584, bsz=16246.5, num_updates=46800, lr=0.000292353, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=43430 epoch 028: 1268 / 1689 loss=3.684, nll_loss=2.149, ppl=4.43, wps=541647, ups=1.1, wpb=494584, bsz=16246.5, num_updates=46800, lr=0.000292353, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=43430 epoch 028: 1268 / 1689 loss=3.684, nll_loss=2.149, ppl=4.43, wps=541647, ups=1.1, wpb=494584, bsz=16246.5, num_updates=46800, lr=0.000292353, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=43430 epoch 028: 1268 / 1689 loss=3.684, nll_loss=2.149, ppl=4.43, wps=541647, ups=1.1, wpb=494584, bsz=16246.5, num_updates=46800, lr=0.000292353, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=43430 epoch 028: 1268 / 1689 loss=3.684, nll_loss=2.149, ppl=4.43, wps=541647, ups=1.1, wpb=494584, bsz=16246.5, num_updates=46800, lr=0.000292353, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=43430 epoch 028: 1268 / 1689 loss=3.684, nll_loss=2.149, ppl=4.43, wps=541647, ups=1.1, wpb=494584, bsz=16246.5, num_updates=46800, lr=0.000292353, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=43430 epoch 028: 1268 / 1689 loss=3.684, nll_loss=2.149, ppl=4.43, wps=541647, ups=1.1, wpb=494584, bsz=16246.5, num_updates=46800, lr=0.000292353, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=43430 epoch 028: 1268 / 1689 loss=3.684, nll_loss=2.149, ppl=4.43, wps=541647, ups=1.1, wpb=494584, bsz=16246.5, num_updates=46800, lr=0.000292353, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=43430 epoch 028: 1268 / 1689 loss=3.684, nll_loss=2.149, ppl=4.43, wps=541647, ups=1.1, wpb=494584, bsz=16246.5, num_updates=46800, lr=0.000292353, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=43430 epoch 028: 1268 / 1689 loss=3.684, nll_loss=2.149, ppl=4.43, wps=541647, ups=1.1, wpb=494584, bsz=16246.5, num_updates=46800, lr=0.000292353, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=43430 epoch 028: 1268 / 1689 loss=3.684, nll_loss=2.149, ppl=4.43, wps=541647, ups=1.1, wpb=494584, bsz=16246.5, num_updates=46800, lr=0.000292353, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=43430 epoch 028: 1268 / 1689 loss=3.684, nll_loss=2.149, ppl=4.43, wps=541647, ups=1.1, wpb=494584, bsz=16246.5, num_updates=46800, lr=0.000292353, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=43430 epoch 028: 1268 / 1689 loss=3.684, nll_loss=2.149, ppl=4.43, wps=541647, ups=1.1, wpb=494584, bsz=16246.5, num_updates=46800, lr=0.000292353, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=43430 epoch 028: 1268 / 1689 loss=3.684, nll_loss=2.149, ppl=4.43, wps=541647, ups=1.1, wpb=494584, bsz=16246.5, num_updates=46800, lr=0.000292353, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=43430 epoch 028: 1368 / 1689 loss=3.68, nll_loss=2.145, ppl=4.42, wps=551767, ups=1.11, wpb=496661, bsz=16595, num_updates=46900, lr=0.000292041, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=43520 epoch 028: 1368 / 1689 loss=3.68, nll_loss=2.145, ppl=4.42, wps=551767, ups=1.11, wpb=496661, bsz=16595, num_updates=46900, lr=0.000292041, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=43520 epoch 028: 1368 / 1689 loss=3.68, nll_loss=2.145, ppl=4.42, wps=551767, ups=1.11, wpb=496661, bsz=16595, num_updates=46900, lr=0.000292041, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=43520 epoch 028: 1368 / 1689 loss=3.68, nll_loss=2.145, ppl=4.42, wps=551767, ups=1.11, wpb=496661, bsz=16595, num_updates=46900, lr=0.000292041, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=43520 epoch 028: 1368 / 1689 loss=3.68, nll_loss=2.145, ppl=4.42, wps=551767, ups=1.11, wpb=496661, bsz=16595, num_updates=46900, lr=0.000292041, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=43520 epoch 028: 1368 / 1689 loss=3.68, nll_loss=2.145, ppl=4.42, wps=551767, ups=1.11, wpb=496661, bsz=16595, num_updates=46900, lr=0.000292041, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=43520 epoch 028: 1368 / 1689 loss=3.68, nll_loss=2.145, ppl=4.42, wps=551767, ups=1.11, wpb=496661, bsz=16595, num_updates=46900, lr=0.000292041, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=43520 epoch 028: 1368 / 1689 loss=3.68, nll_loss=2.145, ppl=4.42, wps=551767, ups=1.11, wpb=496661, bsz=16595, num_updates=46900, lr=0.000292041, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=43520 epoch 028: 1368 / 1689 loss=3.68, nll_loss=2.145, ppl=4.42, wps=551767, ups=1.11, wpb=496661, bsz=16595, num_updates=46900, lr=0.000292041, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=43520 epoch 028: 1368 / 1689 loss=3.68, nll_loss=2.145, ppl=4.42, wps=551767, ups=1.11, wpb=496661, bsz=16595, num_updates=46900, lr=0.000292041, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=43520 epoch 028: 1368 / 1689 loss=3.68, nll_loss=2.145, ppl=4.42, wps=551767, ups=1.11, wpb=496661, bsz=16595, num_updates=46900, lr=0.000292041, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=43520 epoch 028: 1368 / 1689 loss=3.68, nll_loss=2.145, ppl=4.42, wps=551767, ups=1.11, wpb=496661, bsz=16595, num_updates=46900, lr=0.000292041, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=43520 epoch 028: 1368 / 1689 loss=3.68, nll_loss=2.145, ppl=4.42, wps=551767, ups=1.11, wpb=496661, bsz=16595, num_updates=46900, lr=0.000292041, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=43520 epoch 028: 1368 / 1689 loss=3.68, nll_loss=2.145, ppl=4.42, wps=551767, ups=1.11, wpb=496661, bsz=16595, num_updates=46900, lr=0.000292041, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=43520 epoch 028: 1368 / 1689 loss=3.68, nll_loss=2.145, ppl=4.42, wps=551767, ups=1.11, wpb=496661, bsz=16595, num_updates=46900, lr=0.000292041, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=43520 epoch 028: 1368 / 1689 loss=3.68, nll_loss=2.145, ppl=4.42, wps=551767, ups=1.11, wpb=496661, bsz=16595, num_updates=46900, lr=0.000292041, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=43520 epoch 028: 1368 / 1689 loss=3.68, nll_loss=2.145, ppl=4.42, wps=551767, ups=1.11, wpb=496661, bsz=16595, num_updates=46900, lr=0.000292041, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=43520 epoch 028: 1368 / 1689 loss=3.68, nll_loss=2.145, ppl=4.42, wps=551767, ups=1.11, wpb=496661, bsz=16595, num_updates=46900, lr=0.000292041, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=43520 epoch 028: 1368 / 1689 loss=3.68, nll_loss=2.145, ppl=4.42, wps=551767, ups=1.11, wpb=496661, bsz=16595, num_updates=46900, lr=0.000292041, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=43520 epoch 028: 1368 / 1689 loss=3.68, nll_loss=2.145, ppl=4.42, wps=551767, ups=1.11, wpb=496661, bsz=16595, num_updates=46900, lr=0.000292041, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=43520 epoch 028: 1368 / 1689 loss=3.68, nll_loss=2.145, ppl=4.42, wps=551767, ups=1.11, wpb=496661, bsz=16595, num_updates=46900, lr=0.000292041, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=43520 epoch 028: 1368 / 1689 loss=3.68, nll_loss=2.145, ppl=4.42, wps=551767, ups=1.11, wpb=496661, bsz=16595, num_updates=46900, lr=0.000292041, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=43520 epoch 028: 1368 / 1689 loss=3.68, nll_loss=2.145, ppl=4.42, wps=551767, ups=1.11, wpb=496661, bsz=16595, num_updates=46900, lr=0.000292041, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=43520 epoch 028: 1368 / 1689 loss=3.68, nll_loss=2.145, ppl=4.42, wps=551767, ups=1.11, wpb=496661, bsz=16595, num_updates=46900, lr=0.000292041, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=43520 epoch 028: 1368 / 1689 loss=3.68, nll_loss=2.145, ppl=4.42, wps=551767, ups=1.11, wpb=496661, bsz=16595, num_updates=46900, lr=0.000292041, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=43520 epoch 028: 1368 / 1689 loss=3.68, nll_loss=2.145, ppl=4.42, wps=551767, ups=1.11, wpb=496661, bsz=16595, num_updates=46900, lr=0.000292041, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=43520 epoch 028: 1368 / 1689 loss=3.68, nll_loss=2.145, ppl=4.42, wps=551767, ups=1.11, wpb=496661, bsz=16595, num_updates=46900, lr=0.000292041, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=43520 epoch 028: 1368 / 1689 loss=3.68, nll_loss=2.145, ppl=4.42, wps=551767, ups=1.11, wpb=496661, bsz=16595, num_updates=46900, lr=0.000292041, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=43520 epoch 028: 1468 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=549999, ups=1.11, wpb=494447, bsz=16655, num_updates=47000, lr=0.00029173, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=43610 epoch 028: 1468 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=549999, ups=1.11, wpb=494447, bsz=16655, num_updates=47000, lr=0.00029173, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=43610 epoch 028: 1468 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=549999, ups=1.11, wpb=494447, bsz=16655, num_updates=47000, lr=0.00029173, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=43610 epoch 028: 1468 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=549999, ups=1.11, wpb=494447, bsz=16655, num_updates=47000, lr=0.00029173, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=43610 epoch 028: 1468 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=549999, ups=1.11, wpb=494447, bsz=16655, num_updates=47000, lr=0.00029173, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=43610 epoch 028: 1468 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=549999, ups=1.11, wpb=494447, bsz=16655, num_updates=47000, lr=0.00029173, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=43610 epoch 028: 1468 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=549999, ups=1.11, wpb=494447, bsz=16655, num_updates=47000, lr=0.00029173, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=43610 epoch 028: 1468 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=549999, ups=1.11, wpb=494447, bsz=16655, num_updates=47000, lr=0.00029173, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=43610 epoch 028: 1468 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=549999, ups=1.11, wpb=494447, bsz=16655, num_updates=47000, lr=0.00029173, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=43610 epoch 028: 1468 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=549999, ups=1.11, wpb=494447, bsz=16655, num_updates=47000, lr=0.00029173, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=43610 epoch 028: 1468 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=549999, ups=1.11, wpb=494447, bsz=16655, num_updates=47000, lr=0.00029173, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=43610 epoch 028: 1468 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=549999, ups=1.11, wpb=494447, bsz=16655, num_updates=47000, lr=0.00029173, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=43610 epoch 028: 1468 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=549999, ups=1.11, wpb=494447, bsz=16655, num_updates=47000, lr=0.00029173, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=43610 epoch 028: 1468 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=549999, ups=1.11, wpb=494447, bsz=16655, num_updates=47000, lr=0.00029173, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=43610 epoch 028: 1468 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=549999, ups=1.11, wpb=494447, bsz=16655, num_updates=47000, lr=0.00029173, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=43610 epoch 028: 1468 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=549999, ups=1.11, wpb=494447, bsz=16655, num_updates=47000, lr=0.00029173, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=43610 epoch 028: 1468 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=549999, ups=1.11, wpb=494447, bsz=16655, num_updates=47000, lr=0.00029173, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=43610 epoch 028: 1468 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=549999, ups=1.11, wpb=494447, bsz=16655, num_updates=47000, lr=0.00029173, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=43610 epoch 028: 1468 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=549999, ups=1.11, wpb=494447, bsz=16655, num_updates=47000, lr=0.00029173, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=43610 epoch 028: 1468 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=549999, ups=1.11, wpb=494447, bsz=16655, num_updates=47000, lr=0.00029173, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=43610 epoch 028: 1468 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=549999, ups=1.11, wpb=494447, bsz=16655, num_updates=47000, lr=0.00029173, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=43610 epoch 028: 1468 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=549999, ups=1.11, wpb=494447, bsz=16655, num_updates=47000, lr=0.00029173, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=43610 epoch 028: 1468 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=549999, ups=1.11, wpb=494447, bsz=16655, num_updates=47000, lr=0.00029173, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=43610 epoch 028: 1468 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=549999, ups=1.11, wpb=494447, bsz=16655, num_updates=47000, lr=0.00029173, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=43610 epoch 028: 1468 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=549999, ups=1.11, wpb=494447, bsz=16655, num_updates=47000, lr=0.00029173, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=43610 epoch 028: 1468 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=549999, ups=1.11, wpb=494447, bsz=16655, num_updates=47000, lr=0.00029173, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=43610 epoch 028: 1468 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=549999, ups=1.11, wpb=494447, bsz=16655, num_updates=47000, lr=0.00029173, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=43610 epoch 028: 1468 / 1689 loss=3.687, nll_loss=2.152, ppl=4.44, wps=549999, ups=1.11, wpb=494447, bsz=16655, num_updates=47000, lr=0.00029173, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=43610 begin validation on "valid" subset epoch 028 | valid on 'valid' subset | loss 3.722 | nll_loss 2.163 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.709 epoch 028 | valid on 'valid' subset | loss 3.722 | nll_loss 2.163 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.709 epoch 028 | valid on 'valid' subset | loss 3.722 | nll_loss 2.163 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.709 epoch 028 | valid on 'valid' subset | loss 3.722 | nll_loss 2.163 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.709 epoch 028 | valid on 'valid' subset | loss 3.722 | nll_loss 2.163 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.709 epoch 028 | valid on 'valid' subset | loss 3.722 | nll_loss 2.163 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.709 epoch 028 | valid on 'valid' subset | loss 3.722 | nll_loss 2.163 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.709 epoch 028 | valid on 'valid' subset | loss 3.722 | nll_loss 2.163 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.709 epoch 028 | valid on 'valid' subset | loss 3.722 | nll_loss 2.163 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.709 epoch 028 | valid on 'valid' subset | loss 3.722 | nll_loss 2.163 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.709 epoch 028 | valid on 'valid' subset | loss 3.722 | nll_loss 2.163 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.709 epoch 028 | valid on 'valid' subset | loss 3.722 | nll_loss 2.163 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.709 epoch 028 | valid on 'valid' subset | loss 3.722 | nll_loss 2.163 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.709 epoch 028 | valid on 'valid' subset | loss 3.722 | nll_loss 2.163 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.709 epoch 028 | valid on 'valid' subset | loss 3.722 | nll_loss 2.163 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.709 epoch 028 | valid on 'valid' subset | loss 3.722 | nll_loss 2.163 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.709 epoch 028 | valid on 'valid' subset | loss 3.722 | nll_loss 2.163 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.709 epoch 028 | valid on 'valid' subset | loss 3.722 | nll_loss 2.163 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.709 epoch 028 | valid on 'valid' subset | loss 3.722 | nll_loss 2.163 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.709 epoch 028 | valid on 'valid' subset | loss 3.722 | nll_loss 2.163 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.709 epoch 028 | valid on 'valid' subset | loss 3.722 | nll_loss 2.163 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.709 epoch 028 | valid on 'valid' subset | loss 3.722 | nll_loss 2.163 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.709 epoch 028 | valid on 'valid' subset | loss 3.722 | nll_loss 2.163 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.709 epoch 028 | valid on 'valid' subset | loss 3.722 | nll_loss 2.163 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.709 epoch 028 | valid on 'valid' subset | loss 3.722 | nll_loss 2.163 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.709 epoch 028 | valid on 'valid' subset | loss 3.722 | nll_loss 2.163 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.709 epoch 028 | valid on 'valid' subset | loss 3.722 | nll_loss 2.163 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.709 epoch 028 | valid on 'valid' subset | loss 3.722 | nll_loss 2.163 | ppl 4.48 | wps 0 | wpb 44526 | bsz 2008 | num_updates 47000 | best_loss 3.709 epoch 028: 1568 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=394098, ups=0.8, wpb=494344, bsz=16402.9, num_updates=47100, lr=0.00029142, gnorm=0.169, clip=0, loss_scale=2, train_wall=95, gb_free=21.8, wall=43736 epoch 028: 1568 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=394098, ups=0.8, wpb=494344, bsz=16402.9, num_updates=47100, lr=0.00029142, gnorm=0.169, clip=0, loss_scale=2, train_wall=95, gb_free=21.8, wall=43736 epoch 028: 1568 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=394098, ups=0.8, wpb=494344, bsz=16402.9, num_updates=47100, lr=0.00029142, gnorm=0.169, clip=0, loss_scale=2, train_wall=95, gb_free=21.8, wall=43736 epoch 028: 1568 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=394098, ups=0.8, wpb=494344, bsz=16402.9, num_updates=47100, lr=0.00029142, gnorm=0.169, clip=0, loss_scale=2, train_wall=95, gb_free=21.8, wall=43736 epoch 028: 1568 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=394098, ups=0.8, wpb=494344, bsz=16402.9, num_updates=47100, lr=0.00029142, gnorm=0.169, clip=0, loss_scale=2, train_wall=95, gb_free=21.8, wall=43736 epoch 028: 1568 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=394098, ups=0.8, wpb=494344, bsz=16402.9, num_updates=47100, lr=0.00029142, gnorm=0.169, clip=0, loss_scale=2, train_wall=95, gb_free=21.8, wall=43736 epoch 028: 1568 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=394098, ups=0.8, wpb=494344, bsz=16402.9, num_updates=47100, lr=0.00029142, gnorm=0.169, clip=0, loss_scale=2, train_wall=95, gb_free=21.8, wall=43736 epoch 028: 1568 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=394098, ups=0.8, wpb=494344, bsz=16402.9, num_updates=47100, lr=0.00029142, gnorm=0.169, clip=0, loss_scale=2, train_wall=95, gb_free=21.8, wall=43736 epoch 028: 1568 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=394098, ups=0.8, wpb=494344, bsz=16402.9, num_updates=47100, lr=0.00029142, gnorm=0.169, clip=0, loss_scale=2, train_wall=95, gb_free=21.8, wall=43736 epoch 028: 1568 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=394098, ups=0.8, wpb=494344, bsz=16402.9, num_updates=47100, lr=0.00029142, gnorm=0.169, clip=0, loss_scale=2, train_wall=95, gb_free=21.8, wall=43736 epoch 028: 1568 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=394098, ups=0.8, wpb=494344, bsz=16402.9, num_updates=47100, lr=0.00029142, gnorm=0.169, clip=0, loss_scale=2, train_wall=95, gb_free=21.8, wall=43736 epoch 028: 1568 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=394098, ups=0.8, wpb=494344, bsz=16402.9, num_updates=47100, lr=0.00029142, gnorm=0.169, clip=0, loss_scale=2, train_wall=95, gb_free=21.8, wall=43736 epoch 028: 1568 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=394098, ups=0.8, wpb=494344, bsz=16402.9, num_updates=47100, lr=0.00029142, gnorm=0.169, clip=0, loss_scale=2, train_wall=95, gb_free=21.8, wall=43736 epoch 028: 1568 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=394098, ups=0.8, wpb=494344, bsz=16402.9, num_updates=47100, lr=0.00029142, gnorm=0.169, clip=0, loss_scale=2, train_wall=95, gb_free=21.8, wall=43736 epoch 028: 1568 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=394098, ups=0.8, wpb=494344, bsz=16402.9, num_updates=47100, lr=0.00029142, gnorm=0.169, clip=0, loss_scale=2, train_wall=95, gb_free=21.8, wall=43736 epoch 028: 1568 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=394098, ups=0.8, wpb=494344, bsz=16402.9, num_updates=47100, lr=0.00029142, gnorm=0.169, clip=0, loss_scale=2, train_wall=95, gb_free=21.8, wall=43736 epoch 028: 1568 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=394098, ups=0.8, wpb=494344, bsz=16402.9, num_updates=47100, lr=0.00029142, gnorm=0.169, clip=0, loss_scale=2, train_wall=95, gb_free=21.8, wall=43736 epoch 028: 1568 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=394098, ups=0.8, wpb=494344, bsz=16402.9, num_updates=47100, lr=0.00029142, gnorm=0.169, clip=0, loss_scale=2, train_wall=95, gb_free=21.8, wall=43736 epoch 028: 1568 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=394098, ups=0.8, wpb=494344, bsz=16402.9, num_updates=47100, lr=0.00029142, gnorm=0.169, clip=0, loss_scale=2, train_wall=95, gb_free=21.8, wall=43736 epoch 028: 1568 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=394098, ups=0.8, wpb=494344, bsz=16402.9, num_updates=47100, lr=0.00029142, gnorm=0.169, clip=0, loss_scale=2, train_wall=95, gb_free=21.8, wall=43736 epoch 028: 1568 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=394098, ups=0.8, wpb=494344, bsz=16402.9, num_updates=47100, lr=0.00029142, gnorm=0.169, clip=0, loss_scale=2, train_wall=95, gb_free=21.8, wall=43736 epoch 028: 1568 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=394098, ups=0.8, wpb=494344, bsz=16402.9, num_updates=47100, lr=0.00029142, gnorm=0.169, clip=0, loss_scale=2, train_wall=95, gb_free=21.8, wall=43736 epoch 028: 1568 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=394098, ups=0.8, wpb=494344, bsz=16402.9, num_updates=47100, lr=0.00029142, gnorm=0.169, clip=0, loss_scale=2, train_wall=95, gb_free=21.8, wall=43736 epoch 028: 1568 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=394098, ups=0.8, wpb=494344, bsz=16402.9, num_updates=47100, lr=0.00029142, gnorm=0.169, clip=0, loss_scale=2, train_wall=95, gb_free=21.8, wall=43736 epoch 028: 1568 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=394098, ups=0.8, wpb=494344, bsz=16402.9, num_updates=47100, lr=0.00029142, gnorm=0.169, clip=0, loss_scale=2, train_wall=95, gb_free=21.8, wall=43736 epoch 028: 1568 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=394098, ups=0.8, wpb=494344, bsz=16402.9, num_updates=47100, lr=0.00029142, gnorm=0.169, clip=0, loss_scale=2, train_wall=95, gb_free=21.8, wall=43736 epoch 028: 1568 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=394098, ups=0.8, wpb=494344, bsz=16402.9, num_updates=47100, lr=0.00029142, gnorm=0.169, clip=0, loss_scale=2, train_wall=95, gb_free=21.8, wall=43736 epoch 028: 1568 / 1689 loss=3.689, nll_loss=2.154, ppl=4.45, wps=394098, ups=0.8, wpb=494344, bsz=16402.9, num_updates=47100, lr=0.00029142, gnorm=0.169, clip=0, loss_scale=2, train_wall=95, gb_free=21.8, wall=43736 epoch 028: 1668 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=558442, ups=1.13, wpb=492731, bsz=16673.5, num_updates=47200, lr=0.000291111, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=43824 epoch 028: 1668 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=558442, ups=1.13, wpb=492731, bsz=16673.5, num_updates=47200, lr=0.000291111, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=43824 epoch 028: 1668 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=558442, ups=1.13, wpb=492731, bsz=16673.5, num_updates=47200, lr=0.000291111, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=43824 epoch 028: 1668 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=558442, ups=1.13, wpb=492731, bsz=16673.5, num_updates=47200, lr=0.000291111, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=43824 epoch 028: 1668 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=558442, ups=1.13, wpb=492731, bsz=16673.5, num_updates=47200, lr=0.000291111, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=43824 epoch 028: 1668 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=558442, ups=1.13, wpb=492731, bsz=16673.5, num_updates=47200, lr=0.000291111, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=43824 epoch 028: 1668 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=558442, ups=1.13, wpb=492731, bsz=16673.5, num_updates=47200, lr=0.000291111, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=43824 epoch 028: 1668 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=558442, ups=1.13, wpb=492731, bsz=16673.5, num_updates=47200, lr=0.000291111, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=43824 epoch 028: 1668 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=558442, ups=1.13, wpb=492731, bsz=16673.5, num_updates=47200, lr=0.000291111, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=43824 epoch 028: 1668 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=558442, ups=1.13, wpb=492731, bsz=16673.5, num_updates=47200, lr=0.000291111, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=43824 epoch 028: 1668 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=558442, ups=1.13, wpb=492731, bsz=16673.5, num_updates=47200, lr=0.000291111, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=43824 epoch 028: 1668 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=558442, ups=1.13, wpb=492731, bsz=16673.5, num_updates=47200, lr=0.000291111, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=43824 epoch 028: 1668 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=558442, ups=1.13, wpb=492731, bsz=16673.5, num_updates=47200, lr=0.000291111, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=43824 epoch 028: 1668 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=558442, ups=1.13, wpb=492731, bsz=16673.5, num_updates=47200, lr=0.000291111, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=43824 epoch 028: 1668 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=558442, ups=1.13, wpb=492731, bsz=16673.5, num_updates=47200, lr=0.000291111, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=43824 epoch 028: 1668 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=558442, ups=1.13, wpb=492731, bsz=16673.5, num_updates=47200, lr=0.000291111, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=43824 epoch 028: 1668 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=558442, ups=1.13, wpb=492731, bsz=16673.5, num_updates=47200, lr=0.000291111, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=43824 epoch 028: 1668 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=558442, ups=1.13, wpb=492731, bsz=16673.5, num_updates=47200, lr=0.000291111, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=43824 epoch 028: 1668 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=558442, ups=1.13, wpb=492731, bsz=16673.5, num_updates=47200, lr=0.000291111, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=43824 epoch 028: 1668 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=558442, ups=1.13, wpb=492731, bsz=16673.5, num_updates=47200, lr=0.000291111, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=43824 epoch 028: 1668 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=558442, ups=1.13, wpb=492731, bsz=16673.5, num_updates=47200, lr=0.000291111, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=43824 epoch 028: 1668 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=558442, ups=1.13, wpb=492731, bsz=16673.5, num_updates=47200, lr=0.000291111, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=43824 epoch 028: 1668 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=558442, ups=1.13, wpb=492731, bsz=16673.5, num_updates=47200, lr=0.000291111, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=43824 epoch 028: 1668 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=558442, ups=1.13, wpb=492731, bsz=16673.5, num_updates=47200, lr=0.000291111, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=43824 epoch 028: 1668 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=558442, ups=1.13, wpb=492731, bsz=16673.5, num_updates=47200, lr=0.000291111, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=43824 epoch 028: 1668 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=558442, ups=1.13, wpb=492731, bsz=16673.5, num_updates=47200, lr=0.000291111, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=43824 epoch 028: 1668 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=558442, ups=1.13, wpb=492731, bsz=16673.5, num_updates=47200, lr=0.000291111, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=43824 epoch 028: 1668 / 1689 loss=3.69, nll_loss=2.155, ppl=4.45, wps=558442, ups=1.13, wpb=492731, bsz=16673.5, num_updates=47200, lr=0.000291111, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.1, wall=43824 end of epoch 28 (average epoch stats below) epoch 028 | loss 3.681 | nll_loss 2.144 | ppl 4.42 | wps 532182 | ups 1.07 | wpb 495108 | bsz 16499.4 | num_updates 47221 | lr 0.000291047 | gnorm 0.168 | clip 0 | loss_scale 4 | train_wall 1502 | gb_free 22.4 | wall 43842 epoch 028 | loss 3.681 | nll_loss 2.144 | ppl 4.42 | wps 532182 | ups 1.07 | wpb 495108 | bsz 16499.4 | num_updates 47221 | lr 0.000291047 | gnorm 0.168 | clip 0 | loss_scale 4 | train_wall 1502 | gb_free 22.4 | wall 43842 epoch 028 | loss 3.681 | nll_loss 2.144 | ppl 4.42 | wps 532182 | ups 1.07 | wpb 495108 | bsz 16499.4 | num_updates 47221 | lr 0.000291047 | gnorm 0.168 | clip 0 | loss_scale 4 | train_wall 1502 | gb_free 22.4 | wall 43842 epoch 028 | loss 3.681 | nll_loss 2.144 | ppl 4.42 | wps 532182 | ups 1.07 | wpb 495108 | bsz 16499.4 | num_updates 47221 | lr 0.000291047 | gnorm 0.168 | clip 0 | loss_scale 4 | train_wall 1502 | gb_free 22.4 | wall 43842 epoch 028 | loss 3.681 | nll_loss 2.144 | ppl 4.42 | wps 532182 | ups 1.07 | wpb 495108 | bsz 16499.4 | num_updates 47221 | lr 0.000291047 | gnorm 0.168 | clip 0 | loss_scale 4 | train_wall 1502 | gb_free 22.4 | wall 43842 epoch 028 | loss 3.681 | nll_loss 2.144 | ppl 4.42 | wps 532182 | ups 1.07 | wpb 495108 | bsz 16499.4 | num_updates 47221 | lr 0.000291047 | gnorm 0.168 | clip 0 | loss_scale 4 | train_wall 1502 | gb_free 22.4 | wall 43842 epoch 028 | loss 3.681 | nll_loss 2.144 | ppl 4.42 | wps 532182 | ups 1.07 | wpb 495108 | bsz 16499.4 | num_updates 47221 | lr 0.000291047 | gnorm 0.168 | clip 0 | loss_scale 4 | train_wall 1502 | gb_free 22.4 | wall 43842 epoch 028 | loss 3.681 | nll_loss 2.144 | ppl 4.42 | wps 532182 | ups 1.07 | wpb 495108 | bsz 16499.4 | num_updates 47221 | lr 0.000291047 | gnorm 0.168 | clip 0 | loss_scale 4 | train_wall 1502 | gb_free 22.4 | wall 43842 epoch 028 | loss 3.681 | nll_loss 2.144 | ppl 4.42 | wps 532182 | ups 1.07 | wpb 495108 | bsz 16499.4 | num_updates 47221 | lr 0.000291047 | gnorm 0.168 | clip 0 | loss_scale 4 | train_wall 1502 | gb_free 22.4 | wall 43842 epoch 028 | loss 3.681 | nll_loss 2.144 | ppl 4.42 | wps 532182 | ups 1.07 | wpb 495108 | bsz 16499.4 | num_updates 47221 | lr 0.000291047 | gnorm 0.168 | clip 0 | loss_scale 4 | train_wall 1502 | gb_free 22.4 | wall 43842 epoch 028 | loss 3.681 | nll_loss 2.144 | ppl 4.42 | wps 532182 | ups 1.07 | wpb 495108 | bsz 16499.4 | num_updates 47221 | lr 0.000291047 | gnorm 0.168 | clip 0 | loss_scale 4 | train_wall 1502 | gb_free 22.4 | wall 43842 epoch 028 | loss 3.681 | nll_loss 2.144 | ppl 4.42 | wps 532182 | ups 1.07 | wpb 495108 | bsz 16499.4 | num_updates 47221 | lr 0.000291047 | gnorm 0.168 | clip 0 | loss_scale 4 | train_wall 1502 | gb_free 22.4 | wall 43842 epoch 028 | loss 3.681 | nll_loss 2.144 | ppl 4.42 | wps 532182 | ups 1.07 | wpb 495108 | bsz 16499.4 | num_updates 47221 | lr 0.000291047 | gnorm 0.168 | clip 0 | loss_scale 4 | train_wall 1502 | gb_free 22.4 | wall 43842 epoch 028 | loss 3.681 | nll_loss 2.144 | ppl 4.42 | wps 532182 | ups 1.07 | wpb 495108 | bsz 16499.4 | num_updates 47221 | lr 0.000291047 | gnorm 0.168 | clip 0 | loss_scale 4 | train_wall 1502 | gb_free 22.4 | wall 43842 epoch 028 | loss 3.681 | nll_loss 2.144 | ppl 4.42 | wps 532182 | ups 1.07 | wpb 495108 | bsz 16499.4 | num_updates 47221 | lr 0.000291047 | gnorm 0.168 | clip 0 | loss_scale 4 | train_wall 1502 | gb_free 22.4 | wall 43842 epoch 028 | loss 3.681 | nll_loss 2.144 | ppl 4.42 | wps 532182 | ups 1.07 | wpb 495108 | bsz 16499.4 | num_updates 47221 | lr 0.000291047 | gnorm 0.168 | clip 0 | loss_scale 4 | train_wall 1502 | gb_free 22.4 | wall 43842 epoch 028 | loss 3.681 | nll_loss 2.144 | ppl 4.42 | wps 532182 | ups 1.07 | wpb 495108 | bsz 16499.4 | num_updates 47221 | lr 0.000291047 | gnorm 0.168 | clip 0 | loss_scale 4 | train_wall 1502 | gb_free 22.4 | wall 43842 epoch 028 | loss 3.681 | nll_loss 2.144 | ppl 4.42 | wps 532182 | ups 1.07 | wpb 495108 | bsz 16499.4 | num_updates 47221 | lr 0.000291047 | gnorm 0.168 | clip 0 | loss_scale 4 | train_wall 1502 | gb_free 22.4 | wall 43842 epoch 028 | loss 3.681 | nll_loss 2.144 | ppl 4.42 | wps 532182 | ups 1.07 | wpb 495108 | bsz 16499.4 | num_updates 47221 | lr 0.000291047 | gnorm 0.168 | clip 0 | loss_scale 4 | train_wall 1502 | gb_free 22.4 | wall 43842 epoch 028 | loss 3.681 | nll_loss 2.144 | ppl 4.42 | wps 532182 | ups 1.07 | wpb 495108 | bsz 16499.4 | num_updates 47221 | lr 0.000291047 | gnorm 0.168 | clip 0 | loss_scale 4 | train_wall 1502 | gb_free 22.4 | wall 43842 epoch 028 | loss 3.681 | nll_loss 2.144 | ppl 4.42 | wps 532182 | ups 1.07 | wpb 495108 | bsz 16499.4 | num_updates 47221 | lr 0.000291047 | gnorm 0.168 | clip 0 | loss_scale 4 | train_wall 1502 | gb_free 22.4 | wall 43842 epoch 028 | loss 3.681 | nll_loss 2.144 | ppl 4.42 | wps 532182 | ups 1.07 | wpb 495108 | bsz 16499.4 | num_updates 47221 | lr 0.000291047 | gnorm 0.168 | clip 0 | loss_scale 4 | train_wall 1502 | gb_free 22.4 | wall 43842 epoch 028 | loss 3.681 | nll_loss 2.144 | ppl 4.42 | wps 532182 | ups 1.07 | wpb 495108 | bsz 16499.4 | num_updates 47221 | lr 0.000291047 | gnorm 0.168 | clip 0 | loss_scale 4 | train_wall 1502 | gb_free 22.4 | wall 43842 epoch 028 | loss 3.681 | nll_loss 2.144 | ppl 4.42 | wps 532182 | ups 1.07 | wpb 495108 | bsz 16499.4 | num_updates 47221 | lr 0.000291047 | gnorm 0.168 | clip 0 | loss_scale 4 | train_wall 1502 | gb_free 22.4 | wall 43842 epoch 028 | loss 3.681 | nll_loss 2.144 | ppl 4.42 | wps 532182 | ups 1.07 | wpb 495108 | bsz 16499.4 | num_updates 47221 | lr 0.000291047 | gnorm 0.168 | clip 0 | loss_scale 4 | train_wall 1502 | gb_free 22.4 | wall 43842 epoch 028 | loss 3.681 | nll_loss 2.144 | ppl 4.42 | wps 532182 | ups 1.07 | wpb 495108 | bsz 16499.4 | num_updates 47221 | lr 0.000291047 | gnorm 0.168 | clip 0 | loss_scale 4 | train_wall 1502 | gb_free 22.4 | wall 43842 epoch 028 | loss 3.681 | nll_loss 2.144 | ppl 4.42 | wps 532182 | ups 1.07 | wpb 495108 | bsz 16499.4 | num_updates 47221 | lr 0.000291047 | gnorm 0.168 | clip 0 | loss_scale 4 | train_wall 1502 | gb_free 22.4 | wall 43842 epoch 028 | loss 3.681 | nll_loss 2.144 | ppl 4.42 | wps 532182 | ups 1.07 | wpb 495108 | bsz 16499.4 | num_updates 47221 | lr 0.000291047 | gnorm 0.168 | clip 0 | loss_scale 4 | train_wall 1502 | gb_free 22.4 | wall 43842 Start iterating over samples epoch 029: 79 / 1689 loss=3.675, nll_loss=2.138, ppl=4.4, wps=550543, ups=1.12, wpb=490833, bsz=16485.7, num_updates=47300, lr=0.000290803, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=43913 epoch 029: 79 / 1689 loss=3.675, nll_loss=2.138, ppl=4.4, wps=550543, ups=1.12, wpb=490833, bsz=16485.7, num_updates=47300, lr=0.000290803, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=43913 epoch 029: 79 / 1689 loss=3.675, nll_loss=2.138, ppl=4.4, wps=550543, ups=1.12, wpb=490833, bsz=16485.7, num_updates=47300, lr=0.000290803, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=43913 epoch 029: 79 / 1689 loss=3.675, nll_loss=2.138, ppl=4.4, wps=550543, ups=1.12, wpb=490833, bsz=16485.7, num_updates=47300, lr=0.000290803, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=43913 epoch 029: 79 / 1689 loss=3.675, nll_loss=2.138, ppl=4.4, wps=550543, ups=1.12, wpb=490833, bsz=16485.7, num_updates=47300, lr=0.000290803, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=43913 epoch 029: 79 / 1689 loss=3.675, nll_loss=2.138, ppl=4.4, wps=550543, ups=1.12, wpb=490833, bsz=16485.7, num_updates=47300, lr=0.000290803, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=43913 epoch 029: 79 / 1689 loss=3.675, nll_loss=2.138, ppl=4.4, wps=550543, ups=1.12, wpb=490833, bsz=16485.7, num_updates=47300, lr=0.000290803, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=43913 epoch 029: 79 / 1689 loss=3.675, nll_loss=2.138, ppl=4.4, wps=550543, ups=1.12, wpb=490833, bsz=16485.7, num_updates=47300, lr=0.000290803, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=43913 epoch 029: 79 / 1689 loss=3.675, nll_loss=2.138, ppl=4.4, wps=550543, ups=1.12, wpb=490833, bsz=16485.7, num_updates=47300, lr=0.000290803, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=43913 epoch 029: 79 / 1689 loss=3.675, nll_loss=2.138, ppl=4.4, wps=550543, ups=1.12, wpb=490833, bsz=16485.7, num_updates=47300, lr=0.000290803, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=43913 epoch 029: 79 / 1689 loss=3.675, nll_loss=2.138, ppl=4.4, wps=550543, ups=1.12, wpb=490833, bsz=16485.7, num_updates=47300, lr=0.000290803, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=43913 epoch 029: 79 / 1689 loss=3.675, nll_loss=2.138, ppl=4.4, wps=550543, ups=1.12, wpb=490833, bsz=16485.7, num_updates=47300, lr=0.000290803, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=43913 epoch 029: 79 / 1689 loss=3.675, nll_loss=2.138, ppl=4.4, wps=550543, ups=1.12, wpb=490833, bsz=16485.7, num_updates=47300, lr=0.000290803, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=43913 epoch 029: 79 / 1689 loss=3.675, nll_loss=2.138, ppl=4.4, wps=550543, ups=1.12, wpb=490833, bsz=16485.7, num_updates=47300, lr=0.000290803, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=43913 epoch 029: 79 / 1689 loss=3.675, nll_loss=2.138, ppl=4.4, wps=550543, ups=1.12, wpb=490833, bsz=16485.7, num_updates=47300, lr=0.000290803, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=43913 epoch 029: 79 / 1689 loss=3.675, nll_loss=2.138, ppl=4.4, wps=550543, ups=1.12, wpb=490833, bsz=16485.7, num_updates=47300, lr=0.000290803, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=43913 epoch 029: 79 / 1689 loss=3.675, nll_loss=2.138, ppl=4.4, wps=550543, ups=1.12, wpb=490833, bsz=16485.7, num_updates=47300, lr=0.000290803, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=43913 epoch 029: 79 / 1689 loss=3.675, nll_loss=2.138, ppl=4.4, wps=550543, ups=1.12, wpb=490833, bsz=16485.7, num_updates=47300, lr=0.000290803, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=43913 epoch 029: 79 / 1689 loss=3.675, nll_loss=2.138, ppl=4.4, wps=550543, ups=1.12, wpb=490833, bsz=16485.7, num_updates=47300, lr=0.000290803, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=43913 epoch 029: 79 / 1689 loss=3.675, nll_loss=2.138, ppl=4.4, wps=550543, ups=1.12, wpb=490833, bsz=16485.7, num_updates=47300, lr=0.000290803, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=43913 epoch 029: 79 / 1689 loss=3.675, nll_loss=2.138, ppl=4.4, wps=550543, ups=1.12, wpb=490833, bsz=16485.7, num_updates=47300, lr=0.000290803, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=43913 epoch 029: 79 / 1689 loss=3.675, nll_loss=2.138, ppl=4.4, wps=550543, ups=1.12, wpb=490833, bsz=16485.7, num_updates=47300, lr=0.000290803, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=43913 epoch 029: 79 / 1689 loss=3.675, nll_loss=2.138, ppl=4.4, wps=550543, ups=1.12, wpb=490833, bsz=16485.7, num_updates=47300, lr=0.000290803, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=43913 epoch 029: 79 / 1689 loss=3.675, nll_loss=2.138, ppl=4.4, wps=550543, ups=1.12, wpb=490833, bsz=16485.7, num_updates=47300, lr=0.000290803, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=43913 epoch 029: 79 / 1689 loss=3.675, nll_loss=2.138, ppl=4.4, wps=550543, ups=1.12, wpb=490833, bsz=16485.7, num_updates=47300, lr=0.000290803, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=43913 epoch 029: 79 / 1689 loss=3.675, nll_loss=2.138, ppl=4.4, wps=550543, ups=1.12, wpb=490833, bsz=16485.7, num_updates=47300, lr=0.000290803, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=43913 epoch 029: 79 / 1689 loss=3.675, nll_loss=2.138, ppl=4.4, wps=550543, ups=1.12, wpb=490833, bsz=16485.7, num_updates=47300, lr=0.000290803, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=43913 epoch 029: 79 / 1689 loss=3.675, nll_loss=2.138, ppl=4.4, wps=550543, ups=1.12, wpb=490833, bsz=16485.7, num_updates=47300, lr=0.000290803, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=43913 epoch 029: 79 / 1689 loss=3.675, nll_loss=2.138, ppl=4.4, wps=550543, ups=1.12, wpb=490833, bsz=16485.7, num_updates=47300, lr=0.000290803, gnorm=0.171, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=43913 epoch 029: 179 / 1689 loss=3.671, nll_loss=2.133, ppl=4.39, wps=554471, ups=1.12, wpb=495348, bsz=16988.9, num_updates=47400, lr=0.000290496, gnorm=0.162, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=44002 epoch 029: 179 / 1689 loss=3.671, nll_loss=2.133, ppl=4.39, wps=554471, ups=1.12, wpb=495348, bsz=16988.9, num_updates=47400, lr=0.000290496, gnorm=0.162, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=44002 epoch 029: 179 / 1689 loss=3.671, nll_loss=2.133, ppl=4.39, wps=554471, ups=1.12, wpb=495348, bsz=16988.9, num_updates=47400, lr=0.000290496, gnorm=0.162, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=44002 epoch 029: 179 / 1689 loss=3.671, nll_loss=2.133, ppl=4.39, wps=554471, ups=1.12, wpb=495348, bsz=16988.9, num_updates=47400, lr=0.000290496, gnorm=0.162, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=44002 epoch 029: 179 / 1689 loss=3.671, nll_loss=2.133, ppl=4.39, wps=554471, ups=1.12, wpb=495348, bsz=16988.9, num_updates=47400, lr=0.000290496, gnorm=0.162, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=44002 epoch 029: 179 / 1689 loss=3.671, nll_loss=2.133, ppl=4.39, wps=554471, ups=1.12, wpb=495348, bsz=16988.9, num_updates=47400, lr=0.000290496, gnorm=0.162, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=44002 epoch 029: 179 / 1689 loss=3.671, nll_loss=2.133, ppl=4.39, wps=554471, ups=1.12, wpb=495348, bsz=16988.9, num_updates=47400, lr=0.000290496, gnorm=0.162, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=44002 epoch 029: 179 / 1689 loss=3.671, nll_loss=2.133, ppl=4.39, wps=554471, ups=1.12, wpb=495348, bsz=16988.9, num_updates=47400, lr=0.000290496, gnorm=0.162, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=44002 epoch 029: 179 / 1689 loss=3.671, nll_loss=2.133, ppl=4.39, wps=554471, ups=1.12, wpb=495348, bsz=16988.9, num_updates=47400, lr=0.000290496, gnorm=0.162, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=44002 epoch 029: 179 / 1689 loss=3.671, nll_loss=2.133, ppl=4.39, wps=554471, ups=1.12, wpb=495348, bsz=16988.9, num_updates=47400, lr=0.000290496, gnorm=0.162, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=44002 epoch 029: 179 / 1689 loss=3.671, nll_loss=2.133, ppl=4.39, wps=554471, ups=1.12, wpb=495348, bsz=16988.9, num_updates=47400, lr=0.000290496, gnorm=0.162, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=44002 epoch 029: 179 / 1689 loss=3.671, nll_loss=2.133, ppl=4.39, wps=554471, ups=1.12, wpb=495348, bsz=16988.9, num_updates=47400, lr=0.000290496, gnorm=0.162, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=44002 epoch 029: 179 / 1689 loss=3.671, nll_loss=2.133, ppl=4.39, wps=554471, ups=1.12, wpb=495348, bsz=16988.9, num_updates=47400, lr=0.000290496, gnorm=0.162, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=44002 epoch 029: 179 / 1689 loss=3.671, nll_loss=2.133, ppl=4.39, wps=554471, ups=1.12, wpb=495348, bsz=16988.9, num_updates=47400, lr=0.000290496, gnorm=0.162, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=44002 epoch 029: 179 / 1689 loss=3.671, nll_loss=2.133, ppl=4.39, wps=554471, ups=1.12, wpb=495348, bsz=16988.9, num_updates=47400, lr=0.000290496, gnorm=0.162, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=44002 epoch 029: 179 / 1689 loss=3.671, nll_loss=2.133, ppl=4.39, wps=554471, ups=1.12, wpb=495348, bsz=16988.9, num_updates=47400, lr=0.000290496, gnorm=0.162, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=44002 epoch 029: 179 / 1689 loss=3.671, nll_loss=2.133, ppl=4.39, wps=554471, ups=1.12, wpb=495348, bsz=16988.9, num_updates=47400, lr=0.000290496, gnorm=0.162, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=44002 epoch 029: 179 / 1689 loss=3.671, nll_loss=2.133, ppl=4.39, wps=554471, ups=1.12, wpb=495348, bsz=16988.9, num_updates=47400, lr=0.000290496, gnorm=0.162, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=44002 epoch 029: 179 / 1689 loss=3.671, nll_loss=2.133, ppl=4.39, wps=554471, ups=1.12, wpb=495348, bsz=16988.9, num_updates=47400, lr=0.000290496, gnorm=0.162, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=44002 epoch 029: 179 / 1689 loss=3.671, nll_loss=2.133, ppl=4.39, wps=554471, ups=1.12, wpb=495348, bsz=16988.9, num_updates=47400, lr=0.000290496, gnorm=0.162, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=44002 epoch 029: 179 / 1689 loss=3.671, nll_loss=2.133, ppl=4.39, wps=554471, ups=1.12, wpb=495348, bsz=16988.9, num_updates=47400, lr=0.000290496, gnorm=0.162, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=44002 epoch 029: 179 / 1689 loss=3.671, nll_loss=2.133, ppl=4.39, wps=554471, ups=1.12, wpb=495348, bsz=16988.9, num_updates=47400, lr=0.000290496, gnorm=0.162, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=44002 epoch 029: 179 / 1689 loss=3.671, nll_loss=2.133, ppl=4.39, wps=554471, ups=1.12, wpb=495348, bsz=16988.9, num_updates=47400, lr=0.000290496, gnorm=0.162, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=44002 epoch 029: 179 / 1689 loss=3.671, nll_loss=2.133, ppl=4.39, wps=554471, ups=1.12, wpb=495348, bsz=16988.9, num_updates=47400, lr=0.000290496, gnorm=0.162, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=44002 epoch 029: 179 / 1689 loss=3.671, nll_loss=2.133, ppl=4.39, wps=554471, ups=1.12, wpb=495348, bsz=16988.9, num_updates=47400, lr=0.000290496, gnorm=0.162, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=44002 epoch 029: 179 / 1689 loss=3.671, nll_loss=2.133, ppl=4.39, wps=554471, ups=1.12, wpb=495348, bsz=16988.9, num_updates=47400, lr=0.000290496, gnorm=0.162, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=44002 epoch 029: 179 / 1689 loss=3.671, nll_loss=2.133, ppl=4.39, wps=554471, ups=1.12, wpb=495348, bsz=16988.9, num_updates=47400, lr=0.000290496, gnorm=0.162, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=44002 epoch 029: 179 / 1689 loss=3.671, nll_loss=2.133, ppl=4.39, wps=554471, ups=1.12, wpb=495348, bsz=16988.9, num_updates=47400, lr=0.000290496, gnorm=0.162, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=44002 epoch 029: 179 / 1689 loss=3.671, nll_loss=2.133, ppl=4.39, wps=554471, ups=1.12, wpb=495348, bsz=16988.9, num_updates=47400, lr=0.000290496, gnorm=0.162, clip=0, loss_scale=4, train_wall=89, gb_free=22, wall=44002 epoch 029: 279 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=552084, ups=1.12, wpb=495084, bsz=16318.2, num_updates=47500, lr=0.000290191, gnorm=0.161, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=44092 epoch 029: 279 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=552084, ups=1.12, wpb=495084, bsz=16318.2, num_updates=47500, lr=0.000290191, gnorm=0.161, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=44092 epoch 029: 279 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=552084, ups=1.12, wpb=495084, bsz=16318.2, num_updates=47500, lr=0.000290191, gnorm=0.161, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=44092 epoch 029: 279 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=552084, ups=1.12, wpb=495084, bsz=16318.2, num_updates=47500, lr=0.000290191, gnorm=0.161, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=44092 epoch 029: 279 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=552084, ups=1.12, wpb=495084, bsz=16318.2, num_updates=47500, lr=0.000290191, gnorm=0.161, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=44092 epoch 029: 279 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=552084, ups=1.12, wpb=495084, bsz=16318.2, num_updates=47500, lr=0.000290191, gnorm=0.161, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=44092 epoch 029: 279 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=552084, ups=1.12, wpb=495084, bsz=16318.2, num_updates=47500, lr=0.000290191, gnorm=0.161, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=44092 epoch 029: 279 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=552084, ups=1.12, wpb=495084, bsz=16318.2, num_updates=47500, lr=0.000290191, gnorm=0.161, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=44092 epoch 029: 279 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=552084, ups=1.12, wpb=495084, bsz=16318.2, num_updates=47500, lr=0.000290191, gnorm=0.161, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=44092 epoch 029: 279 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=552084, ups=1.12, wpb=495084, bsz=16318.2, num_updates=47500, lr=0.000290191, gnorm=0.161, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=44092 epoch 029: 279 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=552084, ups=1.12, wpb=495084, bsz=16318.2, num_updates=47500, lr=0.000290191, gnorm=0.161, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=44092 epoch 029: 279 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=552084, ups=1.12, wpb=495084, bsz=16318.2, num_updates=47500, lr=0.000290191, gnorm=0.161, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=44092 epoch 029: 279 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=552084, ups=1.12, wpb=495084, bsz=16318.2, num_updates=47500, lr=0.000290191, gnorm=0.161, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=44092 epoch 029: 279 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=552084, ups=1.12, wpb=495084, bsz=16318.2, num_updates=47500, lr=0.000290191, gnorm=0.161, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=44092 epoch 029: 279 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=552084, ups=1.12, wpb=495084, bsz=16318.2, num_updates=47500, lr=0.000290191, gnorm=0.161, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=44092 epoch 029: 279 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=552084, ups=1.12, wpb=495084, bsz=16318.2, num_updates=47500, lr=0.000290191, gnorm=0.161, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=44092 epoch 029: 279 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=552084, ups=1.12, wpb=495084, bsz=16318.2, num_updates=47500, lr=0.000290191, gnorm=0.161, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=44092 epoch 029: 279 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=552084, ups=1.12, wpb=495084, bsz=16318.2, num_updates=47500, lr=0.000290191, gnorm=0.161, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=44092 epoch 029: 279 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=552084, ups=1.12, wpb=495084, bsz=16318.2, num_updates=47500, lr=0.000290191, gnorm=0.161, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=44092 epoch 029: 279 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=552084, ups=1.12, wpb=495084, bsz=16318.2, num_updates=47500, lr=0.000290191, gnorm=0.161, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=44092 epoch 029: 279 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=552084, ups=1.12, wpb=495084, bsz=16318.2, num_updates=47500, lr=0.000290191, gnorm=0.161, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=44092 epoch 029: 279 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=552084, ups=1.12, wpb=495084, bsz=16318.2, num_updates=47500, lr=0.000290191, gnorm=0.161, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=44092 epoch 029: 279 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=552084, ups=1.12, wpb=495084, bsz=16318.2, num_updates=47500, lr=0.000290191, gnorm=0.161, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=44092 epoch 029: 279 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=552084, ups=1.12, wpb=495084, bsz=16318.2, num_updates=47500, lr=0.000290191, gnorm=0.161, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=44092 epoch 029: 279 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=552084, ups=1.12, wpb=495084, bsz=16318.2, num_updates=47500, lr=0.000290191, gnorm=0.161, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=44092 epoch 029: 279 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=552084, ups=1.12, wpb=495084, bsz=16318.2, num_updates=47500, lr=0.000290191, gnorm=0.161, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=44092 epoch 029: 279 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=552084, ups=1.12, wpb=495084, bsz=16318.2, num_updates=47500, lr=0.000290191, gnorm=0.161, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=44092 epoch 029: 279 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=552084, ups=1.12, wpb=495084, bsz=16318.2, num_updates=47500, lr=0.000290191, gnorm=0.161, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=44092 epoch 029: 279 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=552084, ups=1.12, wpb=495084, bsz=16318.2, num_updates=47500, lr=0.000290191, gnorm=0.161, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=44092 epoch 029: 379 / 1689 loss=3.676, nll_loss=2.139, ppl=4.4, wps=553374, ups=1.12, wpb=494518, bsz=15985.2, num_updates=47600, lr=0.000289886, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=44182 epoch 029: 379 / 1689 loss=3.676, nll_loss=2.139, ppl=4.4, wps=553374, ups=1.12, wpb=494518, bsz=15985.2, num_updates=47600, lr=0.000289886, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=44182 epoch 029: 379 / 1689 loss=3.676, nll_loss=2.139, ppl=4.4, wps=553374, ups=1.12, wpb=494518, bsz=15985.2, num_updates=47600, lr=0.000289886, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=44182 epoch 029: 379 / 1689 loss=3.676, nll_loss=2.139, ppl=4.4, wps=553374, ups=1.12, wpb=494518, bsz=15985.2, num_updates=47600, lr=0.000289886, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=44182 epoch 029: 379 / 1689 loss=3.676, nll_loss=2.139, ppl=4.4, wps=553374, ups=1.12, wpb=494518, bsz=15985.2, num_updates=47600, lr=0.000289886, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=44182 epoch 029: 379 / 1689 loss=3.676, nll_loss=2.139, ppl=4.4, wps=553374, ups=1.12, wpb=494518, bsz=15985.2, num_updates=47600, lr=0.000289886, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=44182 epoch 029: 379 / 1689 loss=3.676, nll_loss=2.139, ppl=4.4, wps=553374, ups=1.12, wpb=494518, bsz=15985.2, num_updates=47600, lr=0.000289886, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=44182 epoch 029: 379 / 1689 loss=3.676, nll_loss=2.139, ppl=4.4, wps=553374, ups=1.12, wpb=494518, bsz=15985.2, num_updates=47600, lr=0.000289886, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=44182 epoch 029: 379 / 1689 loss=3.676, nll_loss=2.139, ppl=4.4, wps=553374, ups=1.12, wpb=494518, bsz=15985.2, num_updates=47600, lr=0.000289886, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=44182 epoch 029: 379 / 1689 loss=3.676, nll_loss=2.139, ppl=4.4, wps=553374, ups=1.12, wpb=494518, bsz=15985.2, num_updates=47600, lr=0.000289886, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=44182 epoch 029: 379 / 1689 loss=3.676, nll_loss=2.139, ppl=4.4, wps=553374, ups=1.12, wpb=494518, bsz=15985.2, num_updates=47600, lr=0.000289886, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=44182 epoch 029: 379 / 1689 loss=3.676, nll_loss=2.139, ppl=4.4, wps=553374, ups=1.12, wpb=494518, bsz=15985.2, num_updates=47600, lr=0.000289886, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=44182 epoch 029: 379 / 1689 loss=3.676, nll_loss=2.139, ppl=4.4, wps=553374, ups=1.12, wpb=494518, bsz=15985.2, num_updates=47600, lr=0.000289886, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=44182 epoch 029: 379 / 1689 loss=3.676, nll_loss=2.139, ppl=4.4, wps=553374, ups=1.12, wpb=494518, bsz=15985.2, num_updates=47600, lr=0.000289886, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=44182 epoch 029: 379 / 1689 loss=3.676, nll_loss=2.139, ppl=4.4, wps=553374, ups=1.12, wpb=494518, bsz=15985.2, num_updates=47600, lr=0.000289886, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=44182 epoch 029: 379 / 1689 loss=3.676, nll_loss=2.139, ppl=4.4, wps=553374, ups=1.12, wpb=494518, bsz=15985.2, num_updates=47600, lr=0.000289886, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=44182 epoch 029: 379 / 1689 loss=3.676, nll_loss=2.139, ppl=4.4, wps=553374, ups=1.12, wpb=494518, bsz=15985.2, num_updates=47600, lr=0.000289886, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=44182 epoch 029: 379 / 1689 loss=3.676, nll_loss=2.139, ppl=4.4, wps=553374, ups=1.12, wpb=494518, bsz=15985.2, num_updates=47600, lr=0.000289886, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=44182 epoch 029: 379 / 1689 loss=3.676, nll_loss=2.139, ppl=4.4, wps=553374, ups=1.12, wpb=494518, bsz=15985.2, num_updates=47600, lr=0.000289886, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=44182 epoch 029: 379 / 1689 loss=3.676, nll_loss=2.139, ppl=4.4, wps=553374, ups=1.12, wpb=494518, bsz=15985.2, num_updates=47600, lr=0.000289886, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=44182 epoch 029: 379 / 1689 loss=3.676, nll_loss=2.139, ppl=4.4, wps=553374, ups=1.12, wpb=494518, bsz=15985.2, num_updates=47600, lr=0.000289886, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=44182 epoch 029: 379 / 1689 loss=3.676, nll_loss=2.139, ppl=4.4, wps=553374, ups=1.12, wpb=494518, bsz=15985.2, num_updates=47600, lr=0.000289886, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=44182 epoch 029: 379 / 1689 loss=3.676, nll_loss=2.139, ppl=4.4, wps=553374, ups=1.12, wpb=494518, bsz=15985.2, num_updates=47600, lr=0.000289886, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=44182 epoch 029: 379 / 1689 loss=3.676, nll_loss=2.139, ppl=4.4, wps=553374, ups=1.12, wpb=494518, bsz=15985.2, num_updates=47600, lr=0.000289886, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=44182 epoch 029: 379 / 1689 loss=3.676, nll_loss=2.139, ppl=4.4, wps=553374, ups=1.12, wpb=494518, bsz=15985.2, num_updates=47600, lr=0.000289886, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=44182 epoch 029: 379 / 1689 loss=3.676, nll_loss=2.139, ppl=4.4, wps=553374, ups=1.12, wpb=494518, bsz=15985.2, num_updates=47600, lr=0.000289886, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=44182 epoch 029: 379 / 1689 loss=3.676, nll_loss=2.139, ppl=4.4, wps=553374, ups=1.12, wpb=494518, bsz=15985.2, num_updates=47600, lr=0.000289886, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=44182 epoch 029: 379 / 1689 loss=3.676, nll_loss=2.139, ppl=4.4, wps=553374, ups=1.12, wpb=494518, bsz=15985.2, num_updates=47600, lr=0.000289886, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=44182 epoch 029: 379 / 1689 loss=3.676, nll_loss=2.139, ppl=4.4, wps=553374, ups=1.12, wpb=494518, bsz=15985.2, num_updates=47600, lr=0.000289886, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=44182 epoch 029: 479 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=550407, ups=1.11, wpb=495724, bsz=16683.8, num_updates=47700, lr=0.000289581, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=44272 epoch 029: 479 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=550407, ups=1.11, wpb=495724, bsz=16683.8, num_updates=47700, lr=0.000289581, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=44272 epoch 029: 479 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=550407, ups=1.11, wpb=495724, bsz=16683.8, num_updates=47700, lr=0.000289581, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=44272 epoch 029: 479 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=550407, ups=1.11, wpb=495724, bsz=16683.8, num_updates=47700, lr=0.000289581, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=44272 epoch 029: 479 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=550407, ups=1.11, wpb=495724, bsz=16683.8, num_updates=47700, lr=0.000289581, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=44272 epoch 029: 479 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=550407, ups=1.11, wpb=495724, bsz=16683.8, num_updates=47700, lr=0.000289581, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=44272 epoch 029: 479 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=550407, ups=1.11, wpb=495724, bsz=16683.8, num_updates=47700, lr=0.000289581, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=44272 epoch 029: 479 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=550407, ups=1.11, wpb=495724, bsz=16683.8, num_updates=47700, lr=0.000289581, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=44272 epoch 029: 479 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=550407, ups=1.11, wpb=495724, bsz=16683.8, num_updates=47700, lr=0.000289581, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=44272 epoch 029: 479 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=550407, ups=1.11, wpb=495724, bsz=16683.8, num_updates=47700, lr=0.000289581, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=44272 epoch 029: 479 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=550407, ups=1.11, wpb=495724, bsz=16683.8, num_updates=47700, lr=0.000289581, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=44272 epoch 029: 479 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=550407, ups=1.11, wpb=495724, bsz=16683.8, num_updates=47700, lr=0.000289581, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=44272 epoch 029: 479 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=550407, ups=1.11, wpb=495724, bsz=16683.8, num_updates=47700, lr=0.000289581, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=44272 epoch 029: 479 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=550407, ups=1.11, wpb=495724, bsz=16683.8, num_updates=47700, lr=0.000289581, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=44272 epoch 029: 479 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=550407, ups=1.11, wpb=495724, bsz=16683.8, num_updates=47700, lr=0.000289581, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=44272 epoch 029: 479 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=550407, ups=1.11, wpb=495724, bsz=16683.8, num_updates=47700, lr=0.000289581, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=44272 epoch 029: 479 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=550407, ups=1.11, wpb=495724, bsz=16683.8, num_updates=47700, lr=0.000289581, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=44272 epoch 029: 479 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=550407, ups=1.11, wpb=495724, bsz=16683.8, num_updates=47700, lr=0.000289581, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=44272 epoch 029: 479 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=550407, ups=1.11, wpb=495724, bsz=16683.8, num_updates=47700, lr=0.000289581, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=44272 epoch 029: 479 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=550407, ups=1.11, wpb=495724, bsz=16683.8, num_updates=47700, lr=0.000289581, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=44272 epoch 029: 479 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=550407, ups=1.11, wpb=495724, bsz=16683.8, num_updates=47700, lr=0.000289581, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=44272 epoch 029: 479 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=550407, ups=1.11, wpb=495724, bsz=16683.8, num_updates=47700, lr=0.000289581, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=44272 epoch 029: 479 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=550407, ups=1.11, wpb=495724, bsz=16683.8, num_updates=47700, lr=0.000289581, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=44272 epoch 029: 479 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=550407, ups=1.11, wpb=495724, bsz=16683.8, num_updates=47700, lr=0.000289581, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=44272 epoch 029: 479 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=550407, ups=1.11, wpb=495724, bsz=16683.8, num_updates=47700, lr=0.000289581, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=44272 epoch 029: 479 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=550407, ups=1.11, wpb=495724, bsz=16683.8, num_updates=47700, lr=0.000289581, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=44272 epoch 029: 479 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=550407, ups=1.11, wpb=495724, bsz=16683.8, num_updates=47700, lr=0.000289581, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=44272 epoch 029: 479 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=550407, ups=1.11, wpb=495724, bsz=16683.8, num_updates=47700, lr=0.000289581, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=44272 epoch 029: 479 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=550407, ups=1.11, wpb=495724, bsz=16683.8, num_updates=47700, lr=0.000289581, gnorm=0.168, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=44272 epoch 029: 580 / 1689 loss=3.672, nll_loss=2.134, ppl=4.39, wps=541247, ups=1.09, wpb=494558, bsz=16891.4, num_updates=47800, lr=0.000289278, gnorm=0.165, clip=0, loss_scale=4, train_wall=90, gb_free=21.7, wall=44363 epoch 029: 580 / 1689 loss=3.672, nll_loss=2.134, ppl=4.39, wps=541247, ups=1.09, wpb=494558, bsz=16891.4, num_updates=47800, lr=0.000289278, gnorm=0.165, clip=0, loss_scale=4, train_wall=90, gb_free=21.7, wall=44363 epoch 029: 580 / 1689 loss=3.672, nll_loss=2.134, ppl=4.39, wps=541247, ups=1.09, wpb=494558, bsz=16891.4, num_updates=47800, lr=0.000289278, gnorm=0.165, clip=0, loss_scale=4, train_wall=90, gb_free=21.7, wall=44363 epoch 029: 580 / 1689 loss=3.672, nll_loss=2.134, ppl=4.39, wps=541247, ups=1.09, wpb=494558, bsz=16891.4, num_updates=47800, lr=0.000289278, gnorm=0.165, clip=0, loss_scale=4, train_wall=90, gb_free=21.7, wall=44363 epoch 029: 580 / 1689 loss=3.672, nll_loss=2.134, ppl=4.39, wps=541247, ups=1.09, wpb=494558, bsz=16891.4, num_updates=47800, lr=0.000289278, gnorm=0.165, clip=0, loss_scale=4, train_wall=90, gb_free=21.7, wall=44363 epoch 029: 580 / 1689 loss=3.672, nll_loss=2.134, ppl=4.39, wps=541247, ups=1.09, wpb=494558, bsz=16891.4, num_updates=47800, lr=0.000289278, gnorm=0.165, clip=0, loss_scale=4, train_wall=90, gb_free=21.7, wall=44363 epoch 029: 580 / 1689 loss=3.672, nll_loss=2.134, ppl=4.39, wps=541247, ups=1.09, wpb=494558, bsz=16891.4, num_updates=47800, lr=0.000289278, gnorm=0.165, clip=0, loss_scale=4, train_wall=90, gb_free=21.7, wall=44363 epoch 029: 580 / 1689 loss=3.672, nll_loss=2.134, ppl=4.39, wps=541247, ups=1.09, wpb=494558, bsz=16891.4, num_updates=47800, lr=0.000289278, gnorm=0.165, clip=0, loss_scale=4, train_wall=90, gb_free=21.7, wall=44363 epoch 029: 580 / 1689 loss=3.672, nll_loss=2.134, ppl=4.39, wps=541247, ups=1.09, wpb=494558, bsz=16891.4, num_updates=47800, lr=0.000289278, gnorm=0.165, clip=0, loss_scale=4, train_wall=90, gb_free=21.7, wall=44363 epoch 029: 580 / 1689 loss=3.672, nll_loss=2.134, ppl=4.39, wps=541247, ups=1.09, wpb=494558, bsz=16891.4, num_updates=47800, lr=0.000289278, gnorm=0.165, clip=0, loss_scale=4, train_wall=90, gb_free=21.7, wall=44363 epoch 029: 580 / 1689 loss=3.672, nll_loss=2.134, ppl=4.39, wps=541247, ups=1.09, wpb=494558, bsz=16891.4, num_updates=47800, lr=0.000289278, gnorm=0.165, clip=0, loss_scale=4, train_wall=90, gb_free=21.7, wall=44363 epoch 029: 580 / 1689 loss=3.672, nll_loss=2.134, ppl=4.39, wps=541247, ups=1.09, wpb=494558, bsz=16891.4, num_updates=47800, lr=0.000289278, gnorm=0.165, clip=0, loss_scale=4, train_wall=90, gb_free=21.7, wall=44363 epoch 029: 580 / 1689 loss=3.672, nll_loss=2.134, ppl=4.39, wps=541247, ups=1.09, wpb=494558, bsz=16891.4, num_updates=47800, lr=0.000289278, gnorm=0.165, clip=0, loss_scale=4, train_wall=90, gb_free=21.7, wall=44363 epoch 029: 580 / 1689 loss=3.672, nll_loss=2.134, ppl=4.39, wps=541247, ups=1.09, wpb=494558, bsz=16891.4, num_updates=47800, lr=0.000289278, gnorm=0.165, clip=0, loss_scale=4, train_wall=90, gb_free=21.7, wall=44363 epoch 029: 580 / 1689 loss=3.672, nll_loss=2.134, ppl=4.39, wps=541247, ups=1.09, wpb=494558, bsz=16891.4, num_updates=47800, lr=0.000289278, gnorm=0.165, clip=0, loss_scale=4, train_wall=90, gb_free=21.7, wall=44363 epoch 029: 580 / 1689 loss=3.672, nll_loss=2.134, ppl=4.39, wps=541247, ups=1.09, wpb=494558, bsz=16891.4, num_updates=47800, lr=0.000289278, gnorm=0.165, clip=0, loss_scale=4, train_wall=90, gb_free=21.7, wall=44363 epoch 029: 580 / 1689 loss=3.672, nll_loss=2.134, ppl=4.39, wps=541247, ups=1.09, wpb=494558, bsz=16891.4, num_updates=47800, lr=0.000289278, gnorm=0.165, clip=0, loss_scale=4, train_wall=90, gb_free=21.7, wall=44363 epoch 029: 580 / 1689 loss=3.672, nll_loss=2.134, ppl=4.39, wps=541247, ups=1.09, wpb=494558, bsz=16891.4, num_updates=47800, lr=0.000289278, gnorm=0.165, clip=0, loss_scale=4, train_wall=90, gb_free=21.7, wall=44363 epoch 029: 580 / 1689 loss=3.672, nll_loss=2.134, ppl=4.39, wps=541247, ups=1.09, wpb=494558, bsz=16891.4, num_updates=47800, lr=0.000289278, gnorm=0.165, clip=0, loss_scale=4, train_wall=90, gb_free=21.7, wall=44363 epoch 029: 580 / 1689 loss=3.672, nll_loss=2.134, ppl=4.39, wps=541247, ups=1.09, wpb=494558, bsz=16891.4, num_updates=47800, lr=0.000289278, gnorm=0.165, clip=0, loss_scale=4, train_wall=90, gb_free=21.7, wall=44363 epoch 029: 580 / 1689 loss=3.672, nll_loss=2.134, ppl=4.39, wps=541247, ups=1.09, wpb=494558, bsz=16891.4, num_updates=47800, lr=0.000289278, gnorm=0.165, clip=0, loss_scale=4, train_wall=90, gb_free=21.7, wall=44363 epoch 029: 580 / 1689 loss=3.672, nll_loss=2.134, ppl=4.39, wps=541247, ups=1.09, wpb=494558, bsz=16891.4, num_updates=47800, lr=0.000289278, gnorm=0.165, clip=0, loss_scale=4, train_wall=90, gb_free=21.7, wall=44363 epoch 029: 580 / 1689 loss=3.672, nll_loss=2.134, ppl=4.39, wps=541247, ups=1.09, wpb=494558, bsz=16891.4, num_updates=47800, lr=0.000289278, gnorm=0.165, clip=0, loss_scale=4, train_wall=90, gb_free=21.7, wall=44363 epoch 029: 580 / 1689 loss=3.672, nll_loss=2.134, ppl=4.39, wps=541247, ups=1.09, wpb=494558, bsz=16891.4, num_updates=47800, lr=0.000289278, gnorm=0.165, clip=0, loss_scale=4, train_wall=90, gb_free=21.7, wall=44363 epoch 029: 580 / 1689 loss=3.672, nll_loss=2.134, ppl=4.39, wps=541247, ups=1.09, wpb=494558, bsz=16891.4, num_updates=47800, lr=0.000289278, gnorm=0.165, clip=0, loss_scale=4, train_wall=90, gb_free=21.7, wall=44363 epoch 029: 580 / 1689 loss=3.672, nll_loss=2.134, ppl=4.39, wps=541247, ups=1.09, wpb=494558, bsz=16891.4, num_updates=47800, lr=0.000289278, gnorm=0.165, clip=0, loss_scale=4, train_wall=90, gb_free=21.7, wall=44363 epoch 029: 580 / 1689 loss=3.672, nll_loss=2.134, ppl=4.39, wps=541247, ups=1.09, wpb=494558, bsz=16891.4, num_updates=47800, lr=0.000289278, gnorm=0.165, clip=0, loss_scale=4, train_wall=90, gb_free=21.7, wall=44363 epoch 029: 580 / 1689 loss=3.672, nll_loss=2.134, ppl=4.39, wps=541247, ups=1.09, wpb=494558, bsz=16891.4, num_updates=47800, lr=0.000289278, gnorm=0.165, clip=0, loss_scale=4, train_wall=90, gb_free=21.7, wall=44363 epoch 029: 580 / 1689 loss=3.672, nll_loss=2.134, ppl=4.39, wps=541247, ups=1.09, wpb=494558, bsz=16891.4, num_updates=47800, lr=0.000289278, gnorm=0.165, clip=0, loss_scale=4, train_wall=90, gb_free=21.7, wall=44363 epoch 029: 680 / 1689 loss=3.676, nll_loss=2.139, ppl=4.41, wps=554545, ups=1.12, wpb=496975, bsz=16584.4, num_updates=47900, lr=0.000288976, gnorm=0.178, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=44453 epoch 029: 680 / 1689 loss=3.676, nll_loss=2.139, ppl=4.41, wps=554545, ups=1.12, wpb=496975, bsz=16584.4, num_updates=47900, lr=0.000288976, gnorm=0.178, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=44453 epoch 029: 680 / 1689 loss=3.676, nll_loss=2.139, ppl=4.41, wps=554545, ups=1.12, wpb=496975, bsz=16584.4, num_updates=47900, lr=0.000288976, gnorm=0.178, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=44453 epoch 029: 680 / 1689 loss=3.676, nll_loss=2.139, ppl=4.41, wps=554545, ups=1.12, wpb=496975, bsz=16584.4, num_updates=47900, lr=0.000288976, gnorm=0.178, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=44453 epoch 029: 680 / 1689 loss=3.676, nll_loss=2.139, ppl=4.41, wps=554545, ups=1.12, wpb=496975, bsz=16584.4, num_updates=47900, lr=0.000288976, gnorm=0.178, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=44453 epoch 029: 680 / 1689 loss=3.676, nll_loss=2.139, ppl=4.41, wps=554545, ups=1.12, wpb=496975, bsz=16584.4, num_updates=47900, lr=0.000288976, gnorm=0.178, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=44453 epoch 029: 680 / 1689 loss=3.676, nll_loss=2.139, ppl=4.41, wps=554545, ups=1.12, wpb=496975, bsz=16584.4, num_updates=47900, lr=0.000288976, gnorm=0.178, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=44453 epoch 029: 680 / 1689 loss=3.676, nll_loss=2.139, ppl=4.41, wps=554545, ups=1.12, wpb=496975, bsz=16584.4, num_updates=47900, lr=0.000288976, gnorm=0.178, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=44453 epoch 029: 680 / 1689 loss=3.676, nll_loss=2.139, ppl=4.41, wps=554545, ups=1.12, wpb=496975, bsz=16584.4, num_updates=47900, lr=0.000288976, gnorm=0.178, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=44453 epoch 029: 680 / 1689 loss=3.676, nll_loss=2.139, ppl=4.41, wps=554545, ups=1.12, wpb=496975, bsz=16584.4, num_updates=47900, lr=0.000288976, gnorm=0.178, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=44453 epoch 029: 680 / 1689 loss=3.676, nll_loss=2.139, ppl=4.41, wps=554545, ups=1.12, wpb=496975, bsz=16584.4, num_updates=47900, lr=0.000288976, gnorm=0.178, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=44453 epoch 029: 680 / 1689 loss=3.676, nll_loss=2.139, ppl=4.41, wps=554545, ups=1.12, wpb=496975, bsz=16584.4, num_updates=47900, lr=0.000288976, gnorm=0.178, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=44453 epoch 029: 680 / 1689 loss=3.676, nll_loss=2.139, ppl=4.41, wps=554545, ups=1.12, wpb=496975, bsz=16584.4, num_updates=47900, lr=0.000288976, gnorm=0.178, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=44453 epoch 029: 680 / 1689 loss=3.676, nll_loss=2.139, ppl=4.41, wps=554545, ups=1.12, wpb=496975, bsz=16584.4, num_updates=47900, lr=0.000288976, gnorm=0.178, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=44453 epoch 029: 680 / 1689 loss=3.676, nll_loss=2.139, ppl=4.41, wps=554545, ups=1.12, wpb=496975, bsz=16584.4, num_updates=47900, lr=0.000288976, gnorm=0.178, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=44453 epoch 029: 680 / 1689 loss=3.676, nll_loss=2.139, ppl=4.41, wps=554545, ups=1.12, wpb=496975, bsz=16584.4, num_updates=47900, lr=0.000288976, gnorm=0.178, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=44453 epoch 029: 680 / 1689 loss=3.676, nll_loss=2.139, ppl=4.41, wps=554545, ups=1.12, wpb=496975, bsz=16584.4, num_updates=47900, lr=0.000288976, gnorm=0.178, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=44453 epoch 029: 680 / 1689 loss=3.676, nll_loss=2.139, ppl=4.41, wps=554545, ups=1.12, wpb=496975, bsz=16584.4, num_updates=47900, lr=0.000288976, gnorm=0.178, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=44453 epoch 029: 680 / 1689 loss=3.676, nll_loss=2.139, ppl=4.41, wps=554545, ups=1.12, wpb=496975, bsz=16584.4, num_updates=47900, lr=0.000288976, gnorm=0.178, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=44453 epoch 029: 680 / 1689 loss=3.676, nll_loss=2.139, ppl=4.41, wps=554545, ups=1.12, wpb=496975, bsz=16584.4, num_updates=47900, lr=0.000288976, gnorm=0.178, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=44453 epoch 029: 680 / 1689 loss=3.676, nll_loss=2.139, ppl=4.41, wps=554545, ups=1.12, wpb=496975, bsz=16584.4, num_updates=47900, lr=0.000288976, gnorm=0.178, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=44453 epoch 029: 680 / 1689 loss=3.676, nll_loss=2.139, ppl=4.41, wps=554545, ups=1.12, wpb=496975, bsz=16584.4, num_updates=47900, lr=0.000288976, gnorm=0.178, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=44453 epoch 029: 680 / 1689 loss=3.676, nll_loss=2.139, ppl=4.41, wps=554545, ups=1.12, wpb=496975, bsz=16584.4, num_updates=47900, lr=0.000288976, gnorm=0.178, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=44453 epoch 029: 680 / 1689 loss=3.676, nll_loss=2.139, ppl=4.41, wps=554545, ups=1.12, wpb=496975, bsz=16584.4, num_updates=47900, lr=0.000288976, gnorm=0.178, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=44453 epoch 029: 680 / 1689 loss=3.676, nll_loss=2.139, ppl=4.41, wps=554545, ups=1.12, wpb=496975, bsz=16584.4, num_updates=47900, lr=0.000288976, gnorm=0.178, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=44453 epoch 029: 680 / 1689 loss=3.676, nll_loss=2.139, ppl=4.41, wps=554545, ups=1.12, wpb=496975, bsz=16584.4, num_updates=47900, lr=0.000288976, gnorm=0.178, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=44453 epoch 029: 680 / 1689 loss=3.676, nll_loss=2.139, ppl=4.41, wps=554545, ups=1.12, wpb=496975, bsz=16584.4, num_updates=47900, lr=0.000288976, gnorm=0.178, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=44453 epoch 029: 680 / 1689 loss=3.676, nll_loss=2.139, ppl=4.41, wps=554545, ups=1.12, wpb=496975, bsz=16584.4, num_updates=47900, lr=0.000288976, gnorm=0.178, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=44453 epoch 029: 680 / 1689 loss=3.676, nll_loss=2.139, ppl=4.41, wps=554545, ups=1.12, wpb=496975, bsz=16584.4, num_updates=47900, lr=0.000288976, gnorm=0.178, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=44453 epoch 029: 781 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=541630, ups=1.09, wpb=496549, bsz=16368, num_updates=48000, lr=0.000288675, gnorm=0.167, clip=0, loss_scale=2, train_wall=90, gb_free=17.9, wall=44544 epoch 029: 781 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=541630, ups=1.09, wpb=496549, bsz=16368, num_updates=48000, lr=0.000288675, gnorm=0.167, clip=0, loss_scale=2, train_wall=90, gb_free=17.9, wall=44544 epoch 029: 781 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=541630, ups=1.09, wpb=496549, bsz=16368, num_updates=48000, lr=0.000288675, gnorm=0.167, clip=0, loss_scale=2, train_wall=90, gb_free=17.9, wall=44544 epoch 029: 781 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=541630, ups=1.09, wpb=496549, bsz=16368, num_updates=48000, lr=0.000288675, gnorm=0.167, clip=0, loss_scale=2, train_wall=90, gb_free=17.9, wall=44544 epoch 029: 781 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=541630, ups=1.09, wpb=496549, bsz=16368, num_updates=48000, lr=0.000288675, gnorm=0.167, clip=0, loss_scale=2, train_wall=90, gb_free=17.9, wall=44544 epoch 029: 781 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=541630, ups=1.09, wpb=496549, bsz=16368, num_updates=48000, lr=0.000288675, gnorm=0.167, clip=0, loss_scale=2, train_wall=90, gb_free=17.9, wall=44544 epoch 029: 781 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=541630, ups=1.09, wpb=496549, bsz=16368, num_updates=48000, lr=0.000288675, gnorm=0.167, clip=0, loss_scale=2, train_wall=90, gb_free=17.9, wall=44544 epoch 029: 781 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=541630, ups=1.09, wpb=496549, bsz=16368, num_updates=48000, lr=0.000288675, gnorm=0.167, clip=0, loss_scale=2, train_wall=90, gb_free=17.9, wall=44544 epoch 029: 781 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=541630, ups=1.09, wpb=496549, bsz=16368, num_updates=48000, lr=0.000288675, gnorm=0.167, clip=0, loss_scale=2, train_wall=90, gb_free=17.9, wall=44544 epoch 029: 781 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=541630, ups=1.09, wpb=496549, bsz=16368, num_updates=48000, lr=0.000288675, gnorm=0.167, clip=0, loss_scale=2, train_wall=90, gb_free=17.9, wall=44544 epoch 029: 781 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=541630, ups=1.09, wpb=496549, bsz=16368, num_updates=48000, lr=0.000288675, gnorm=0.167, clip=0, loss_scale=2, train_wall=90, gb_free=17.9, wall=44544 epoch 029: 781 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=541630, ups=1.09, wpb=496549, bsz=16368, num_updates=48000, lr=0.000288675, gnorm=0.167, clip=0, loss_scale=2, train_wall=90, gb_free=17.9, wall=44544 epoch 029: 781 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=541630, ups=1.09, wpb=496549, bsz=16368, num_updates=48000, lr=0.000288675, gnorm=0.167, clip=0, loss_scale=2, train_wall=90, gb_free=17.9, wall=44544 epoch 029: 781 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=541630, ups=1.09, wpb=496549, bsz=16368, num_updates=48000, lr=0.000288675, gnorm=0.167, clip=0, loss_scale=2, train_wall=90, gb_free=17.9, wall=44544 epoch 029: 781 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=541630, ups=1.09, wpb=496549, bsz=16368, num_updates=48000, lr=0.000288675, gnorm=0.167, clip=0, loss_scale=2, train_wall=90, gb_free=17.9, wall=44544 epoch 029: 781 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=541630, ups=1.09, wpb=496549, bsz=16368, num_updates=48000, lr=0.000288675, gnorm=0.167, clip=0, loss_scale=2, train_wall=90, gb_free=17.9, wall=44544 epoch 029: 781 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=541630, ups=1.09, wpb=496549, bsz=16368, num_updates=48000, lr=0.000288675, gnorm=0.167, clip=0, loss_scale=2, train_wall=90, gb_free=17.9, wall=44544 epoch 029: 781 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=541630, ups=1.09, wpb=496549, bsz=16368, num_updates=48000, lr=0.000288675, gnorm=0.167, clip=0, loss_scale=2, train_wall=90, gb_free=17.9, wall=44544 epoch 029: 781 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=541630, ups=1.09, wpb=496549, bsz=16368, num_updates=48000, lr=0.000288675, gnorm=0.167, clip=0, loss_scale=2, train_wall=90, gb_free=17.9, wall=44544 epoch 029: 781 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=541630, ups=1.09, wpb=496549, bsz=16368, num_updates=48000, lr=0.000288675, gnorm=0.167, clip=0, loss_scale=2, train_wall=90, gb_free=17.9, wall=44544 epoch 029: 781 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=541630, ups=1.09, wpb=496549, bsz=16368, num_updates=48000, lr=0.000288675, gnorm=0.167, clip=0, loss_scale=2, train_wall=90, gb_free=17.9, wall=44544 epoch 029: 781 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=541630, ups=1.09, wpb=496549, bsz=16368, num_updates=48000, lr=0.000288675, gnorm=0.167, clip=0, loss_scale=2, train_wall=90, gb_free=17.9, wall=44544 epoch 029: 781 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=541630, ups=1.09, wpb=496549, bsz=16368, num_updates=48000, lr=0.000288675, gnorm=0.167, clip=0, loss_scale=2, train_wall=90, gb_free=17.9, wall=44544 epoch 029: 781 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=541630, ups=1.09, wpb=496549, bsz=16368, num_updates=48000, lr=0.000288675, gnorm=0.167, clip=0, loss_scale=2, train_wall=90, gb_free=17.9, wall=44544 epoch 029: 781 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=541630, ups=1.09, wpb=496549, bsz=16368, num_updates=48000, lr=0.000288675, gnorm=0.167, clip=0, loss_scale=2, train_wall=90, gb_free=17.9, wall=44544 epoch 029: 781 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=541630, ups=1.09, wpb=496549, bsz=16368, num_updates=48000, lr=0.000288675, gnorm=0.167, clip=0, loss_scale=2, train_wall=90, gb_free=17.9, wall=44544 epoch 029: 781 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=541630, ups=1.09, wpb=496549, bsz=16368, num_updates=48000, lr=0.000288675, gnorm=0.167, clip=0, loss_scale=2, train_wall=90, gb_free=17.9, wall=44544 epoch 029: 781 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=541630, ups=1.09, wpb=496549, bsz=16368, num_updates=48000, lr=0.000288675, gnorm=0.167, clip=0, loss_scale=2, train_wall=90, gb_free=17.9, wall=44544 epoch 029: 781 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=541630, ups=1.09, wpb=496549, bsz=16368, num_updates=48000, lr=0.000288675, gnorm=0.167, clip=0, loss_scale=2, train_wall=90, gb_free=17.9, wall=44544 begin validation on "valid" subset epoch 029 | valid on 'valid' subset | loss 3.702 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.702 epoch 029 | valid on 'valid' subset | loss 3.702 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.702 epoch 029 | valid on 'valid' subset | loss 3.702 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.702 epoch 029 | valid on 'valid' subset | loss 3.702 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.702 epoch 029 | valid on 'valid' subset | loss 3.702 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.702 epoch 029 | valid on 'valid' subset | loss 3.702 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.702 epoch 029 | valid on 'valid' subset | loss 3.702 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.702 epoch 029 | valid on 'valid' subset | loss 3.702 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.702 epoch 029 | valid on 'valid' subset | loss 3.702 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.702 epoch 029 | valid on 'valid' subset | loss 3.702 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.702 epoch 029 | valid on 'valid' subset | loss 3.702 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.702 epoch 029 | valid on 'valid' subset | loss 3.702 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.702 epoch 029 | valid on 'valid' subset | loss 3.702 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.702 epoch 029 | valid on 'valid' subset | loss 3.702 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.702 epoch 029 | valid on 'valid' subset | loss 3.702 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.702 epoch 029 | valid on 'valid' subset | loss 3.702 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.702 epoch 029 | valid on 'valid' subset | loss 3.702 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.702 epoch 029 | valid on 'valid' subset | loss 3.702 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.702 epoch 029 | valid on 'valid' subset | loss 3.702 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.702 epoch 029 | valid on 'valid' subset | loss 3.702 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.702 epoch 029 | valid on 'valid' subset | loss 3.702 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.702 epoch 029 | valid on 'valid' subset | loss 3.702 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.702 epoch 029 | valid on 'valid' subset | loss 3.702 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.702 epoch 029 | valid on 'valid' subset | loss 3.702 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.702 epoch 029 | valid on 'valid' subset | loss 3.702 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.702 epoch 029 | valid on 'valid' subset | loss 3.702 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.702 epoch 029 | valid on 'valid' subset | loss 3.702 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.702 epoch 029 | valid on 'valid' subset | loss 3.702 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.702 epoch 029 | valid on 'valid' subset | loss 3.702 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 48000 | best_loss 3.702 epoch 029: 881 / 1689 loss=3.684, nll_loss=2.148, ppl=4.43, wps=443788, ups=0.9, wpb=492493, bsz=16671.3, num_updates=48100, lr=0.000288375, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=44655 epoch 029: 881 / 1689 loss=3.684, nll_loss=2.148, ppl=4.43, wps=443788, ups=0.9, wpb=492493, bsz=16671.3, num_updates=48100, lr=0.000288375, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=44655 epoch 029: 881 / 1689 loss=3.684, nll_loss=2.148, ppl=4.43, wps=443788, ups=0.9, wpb=492493, bsz=16671.3, num_updates=48100, lr=0.000288375, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=44655 epoch 029: 881 / 1689 loss=3.684, nll_loss=2.148, ppl=4.43, wps=443788, ups=0.9, wpb=492493, bsz=16671.3, num_updates=48100, lr=0.000288375, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=44655 epoch 029: 881 / 1689 loss=3.684, nll_loss=2.148, ppl=4.43, wps=443788, ups=0.9, wpb=492493, bsz=16671.3, num_updates=48100, lr=0.000288375, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=44655 epoch 029: 881 / 1689 loss=3.684, nll_loss=2.148, ppl=4.43, wps=443788, ups=0.9, wpb=492493, bsz=16671.3, num_updates=48100, lr=0.000288375, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=44655 epoch 029: 881 / 1689 loss=3.684, nll_loss=2.148, ppl=4.43, wps=443788, ups=0.9, wpb=492493, bsz=16671.3, num_updates=48100, lr=0.000288375, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=44655 epoch 029: 881 / 1689 loss=3.684, nll_loss=2.148, ppl=4.43, wps=443788, ups=0.9, wpb=492493, bsz=16671.3, num_updates=48100, lr=0.000288375, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=44655 epoch 029: 881 / 1689 loss=3.684, nll_loss=2.148, ppl=4.43, wps=443788, ups=0.9, wpb=492493, bsz=16671.3, num_updates=48100, lr=0.000288375, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=44655 epoch 029: 881 / 1689 loss=3.684, nll_loss=2.148, ppl=4.43, wps=443788, ups=0.9, wpb=492493, bsz=16671.3, num_updates=48100, lr=0.000288375, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=44655 epoch 029: 881 / 1689 loss=3.684, nll_loss=2.148, ppl=4.43, wps=443788, ups=0.9, wpb=492493, bsz=16671.3, num_updates=48100, lr=0.000288375, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=44655 epoch 029: 881 / 1689 loss=3.684, nll_loss=2.148, ppl=4.43, wps=443788, ups=0.9, wpb=492493, bsz=16671.3, num_updates=48100, lr=0.000288375, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=44655 epoch 029: 881 / 1689 loss=3.684, nll_loss=2.148, ppl=4.43, wps=443788, ups=0.9, wpb=492493, bsz=16671.3, num_updates=48100, lr=0.000288375, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=44655 epoch 029: 881 / 1689 loss=3.684, nll_loss=2.148, ppl=4.43, wps=443788, ups=0.9, wpb=492493, bsz=16671.3, num_updates=48100, lr=0.000288375, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=44655 epoch 029: 881 / 1689 loss=3.684, nll_loss=2.148, ppl=4.43, wps=443788, ups=0.9, wpb=492493, bsz=16671.3, num_updates=48100, lr=0.000288375, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=44655 epoch 029: 881 / 1689 loss=3.684, nll_loss=2.148, ppl=4.43, wps=443788, ups=0.9, wpb=492493, bsz=16671.3, num_updates=48100, lr=0.000288375, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=44655 epoch 029: 881 / 1689 loss=3.684, nll_loss=2.148, ppl=4.43, wps=443788, ups=0.9, wpb=492493, bsz=16671.3, num_updates=48100, lr=0.000288375, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=44655 epoch 029: 881 / 1689 loss=3.684, nll_loss=2.148, ppl=4.43, wps=443788, ups=0.9, wpb=492493, bsz=16671.3, num_updates=48100, lr=0.000288375, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=44655 epoch 029: 881 / 1689 loss=3.684, nll_loss=2.148, ppl=4.43, wps=443788, ups=0.9, wpb=492493, bsz=16671.3, num_updates=48100, lr=0.000288375, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=44655 epoch 029: 881 / 1689 loss=3.684, nll_loss=2.148, ppl=4.43, wps=443788, ups=0.9, wpb=492493, bsz=16671.3, num_updates=48100, lr=0.000288375, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=44655 epoch 029: 881 / 1689 loss=3.684, nll_loss=2.148, ppl=4.43, wps=443788, ups=0.9, wpb=492493, bsz=16671.3, num_updates=48100, lr=0.000288375, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=44655 epoch 029: 881 / 1689 loss=3.684, nll_loss=2.148, ppl=4.43, wps=443788, ups=0.9, wpb=492493, bsz=16671.3, num_updates=48100, lr=0.000288375, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=44655 epoch 029: 881 / 1689 loss=3.684, nll_loss=2.148, ppl=4.43, wps=443788, ups=0.9, wpb=492493, bsz=16671.3, num_updates=48100, lr=0.000288375, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=44655 epoch 029: 881 / 1689 loss=3.684, nll_loss=2.148, ppl=4.43, wps=443788, ups=0.9, wpb=492493, bsz=16671.3, num_updates=48100, lr=0.000288375, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=44655 epoch 029: 881 / 1689 loss=3.684, nll_loss=2.148, ppl=4.43, wps=443788, ups=0.9, wpb=492493, bsz=16671.3, num_updates=48100, lr=0.000288375, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=44655 epoch 029: 881 / 1689 loss=3.684, nll_loss=2.148, ppl=4.43, wps=443788, ups=0.9, wpb=492493, bsz=16671.3, num_updates=48100, lr=0.000288375, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=44655 epoch 029: 881 / 1689 loss=3.684, nll_loss=2.148, ppl=4.43, wps=443788, ups=0.9, wpb=492493, bsz=16671.3, num_updates=48100, lr=0.000288375, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=44655 epoch 029: 881 / 1689 loss=3.684, nll_loss=2.148, ppl=4.43, wps=443788, ups=0.9, wpb=492493, bsz=16671.3, num_updates=48100, lr=0.000288375, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=44655 epoch 029: 881 / 1689 loss=3.684, nll_loss=2.148, ppl=4.43, wps=443788, ups=0.9, wpb=492493, bsz=16671.3, num_updates=48100, lr=0.000288375, gnorm=0.176, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=44655 epoch 029: 981 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=545985, ups=1.11, wpb=493165, bsz=16486.2, num_updates=48200, lr=0.000288076, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=44746 epoch 029: 981 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=545985, ups=1.11, wpb=493165, bsz=16486.2, num_updates=48200, lr=0.000288076, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=44746 epoch 029: 981 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=545985, ups=1.11, wpb=493165, bsz=16486.2, num_updates=48200, lr=0.000288076, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=44746 epoch 029: 981 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=545985, ups=1.11, wpb=493165, bsz=16486.2, num_updates=48200, lr=0.000288076, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=44746 epoch 029: 981 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=545985, ups=1.11, wpb=493165, bsz=16486.2, num_updates=48200, lr=0.000288076, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=44746 epoch 029: 981 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=545985, ups=1.11, wpb=493165, bsz=16486.2, num_updates=48200, lr=0.000288076, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=44746 epoch 029: 981 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=545985, ups=1.11, wpb=493165, bsz=16486.2, num_updates=48200, lr=0.000288076, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=44746 epoch 029: 981 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=545985, ups=1.11, wpb=493165, bsz=16486.2, num_updates=48200, lr=0.000288076, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=44746 epoch 029: 981 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=545985, ups=1.11, wpb=493165, bsz=16486.2, num_updates=48200, lr=0.000288076, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=44746 epoch 029: 981 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=545985, ups=1.11, wpb=493165, bsz=16486.2, num_updates=48200, lr=0.000288076, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=44746 epoch 029: 981 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=545985, ups=1.11, wpb=493165, bsz=16486.2, num_updates=48200, lr=0.000288076, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=44746 epoch 029: 981 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=545985, ups=1.11, wpb=493165, bsz=16486.2, num_updates=48200, lr=0.000288076, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=44746 epoch 029: 981 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=545985, ups=1.11, wpb=493165, bsz=16486.2, num_updates=48200, lr=0.000288076, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=44746 epoch 029: 981 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=545985, ups=1.11, wpb=493165, bsz=16486.2, num_updates=48200, lr=0.000288076, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=44746 epoch 029: 981 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=545985, ups=1.11, wpb=493165, bsz=16486.2, num_updates=48200, lr=0.000288076, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=44746 epoch 029: 981 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=545985, ups=1.11, wpb=493165, bsz=16486.2, num_updates=48200, lr=0.000288076, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=44746 epoch 029: 981 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=545985, ups=1.11, wpb=493165, bsz=16486.2, num_updates=48200, lr=0.000288076, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=44746 epoch 029: 981 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=545985, ups=1.11, wpb=493165, bsz=16486.2, num_updates=48200, lr=0.000288076, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=44746 epoch 029: 981 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=545985, ups=1.11, wpb=493165, bsz=16486.2, num_updates=48200, lr=0.000288076, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=44746 epoch 029: 981 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=545985, ups=1.11, wpb=493165, bsz=16486.2, num_updates=48200, lr=0.000288076, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=44746 epoch 029: 981 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=545985, ups=1.11, wpb=493165, bsz=16486.2, num_updates=48200, lr=0.000288076, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=44746 epoch 029: 981 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=545985, ups=1.11, wpb=493165, bsz=16486.2, num_updates=48200, lr=0.000288076, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=44746 epoch 029: 981 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=545985, ups=1.11, wpb=493165, bsz=16486.2, num_updates=48200, lr=0.000288076, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=44746 epoch 029: 981 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=545985, ups=1.11, wpb=493165, bsz=16486.2, num_updates=48200, lr=0.000288076, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=44746 epoch 029: 981 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=545985, ups=1.11, wpb=493165, bsz=16486.2, num_updates=48200, lr=0.000288076, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=44746 epoch 029: 981 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=545985, ups=1.11, wpb=493165, bsz=16486.2, num_updates=48200, lr=0.000288076, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=44746 epoch 029: 981 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=545985, ups=1.11, wpb=493165, bsz=16486.2, num_updates=48200, lr=0.000288076, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=44746 epoch 029: 981 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=545985, ups=1.11, wpb=493165, bsz=16486.2, num_updates=48200, lr=0.000288076, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=44746 epoch 029: 981 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=545985, ups=1.11, wpb=493165, bsz=16486.2, num_updates=48200, lr=0.000288076, gnorm=0.179, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=44746 epoch 029: 1081 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=548214, ups=1.1, wpb=496194, bsz=16350.9, num_updates=48300, lr=0.000287777, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=44836 epoch 029: 1081 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=548214, ups=1.1, wpb=496194, bsz=16350.9, num_updates=48300, lr=0.000287777, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=44836 epoch 029: 1081 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=548214, ups=1.1, wpb=496194, bsz=16350.9, num_updates=48300, lr=0.000287777, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=44836 epoch 029: 1081 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=548214, ups=1.1, wpb=496194, bsz=16350.9, num_updates=48300, lr=0.000287777, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=44836 epoch 029: 1081 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=548214, ups=1.1, wpb=496194, bsz=16350.9, num_updates=48300, lr=0.000287777, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=44836 epoch 029: 1081 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=548214, ups=1.1, wpb=496194, bsz=16350.9, num_updates=48300, lr=0.000287777, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=44836 epoch 029: 1081 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=548214, ups=1.1, wpb=496194, bsz=16350.9, num_updates=48300, lr=0.000287777, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=44836 epoch 029: 1081 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=548214, ups=1.1, wpb=496194, bsz=16350.9, num_updates=48300, lr=0.000287777, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=44836 epoch 029: 1081 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=548214, ups=1.1, wpb=496194, bsz=16350.9, num_updates=48300, lr=0.000287777, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=44836 epoch 029: 1081 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=548214, ups=1.1, wpb=496194, bsz=16350.9, num_updates=48300, lr=0.000287777, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=44836 epoch 029: 1081 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=548214, ups=1.1, wpb=496194, bsz=16350.9, num_updates=48300, lr=0.000287777, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=44836 epoch 029: 1081 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=548214, ups=1.1, wpb=496194, bsz=16350.9, num_updates=48300, lr=0.000287777, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=44836 epoch 029: 1081 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=548214, ups=1.1, wpb=496194, bsz=16350.9, num_updates=48300, lr=0.000287777, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=44836 epoch 029: 1081 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=548214, ups=1.1, wpb=496194, bsz=16350.9, num_updates=48300, lr=0.000287777, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=44836 epoch 029: 1081 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=548214, ups=1.1, wpb=496194, bsz=16350.9, num_updates=48300, lr=0.000287777, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=44836 epoch 029: 1081 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=548214, ups=1.1, wpb=496194, bsz=16350.9, num_updates=48300, lr=0.000287777, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=44836 epoch 029: 1081 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=548214, ups=1.1, wpb=496194, bsz=16350.9, num_updates=48300, lr=0.000287777, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=44836 epoch 029: 1081 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=548214, ups=1.1, wpb=496194, bsz=16350.9, num_updates=48300, lr=0.000287777, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=44836 epoch 029: 1081 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=548214, ups=1.1, wpb=496194, bsz=16350.9, num_updates=48300, lr=0.000287777, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=44836 epoch 029: 1081 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=548214, ups=1.1, wpb=496194, bsz=16350.9, num_updates=48300, lr=0.000287777, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=44836 epoch 029: 1081 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=548214, ups=1.1, wpb=496194, bsz=16350.9, num_updates=48300, lr=0.000287777, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=44836 epoch 029: 1081 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=548214, ups=1.1, wpb=496194, bsz=16350.9, num_updates=48300, lr=0.000287777, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=44836 epoch 029: 1081 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=548214, ups=1.1, wpb=496194, bsz=16350.9, num_updates=48300, lr=0.000287777, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=44836 epoch 029: 1081 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=548214, ups=1.1, wpb=496194, bsz=16350.9, num_updates=48300, lr=0.000287777, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=44836 epoch 029: 1081 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=548214, ups=1.1, wpb=496194, bsz=16350.9, num_updates=48300, lr=0.000287777, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=44836 epoch 029: 1081 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=548214, ups=1.1, wpb=496194, bsz=16350.9, num_updates=48300, lr=0.000287777, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=44836 epoch 029: 1081 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=548214, ups=1.1, wpb=496194, bsz=16350.9, num_updates=48300, lr=0.000287777, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=44836 epoch 029: 1081 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=548214, ups=1.1, wpb=496194, bsz=16350.9, num_updates=48300, lr=0.000287777, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=44836 epoch 029: 1081 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=548214, ups=1.1, wpb=496194, bsz=16350.9, num_updates=48300, lr=0.000287777, gnorm=0.163, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=44836 epoch 029: 1181 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=544417, ups=1.1, wpb=496552, bsz=16761.9, num_updates=48400, lr=0.00028748, gnorm=0.16, clip=0, loss_scale=2, train_wall=90, gb_free=21.3, wall=44927 epoch 029: 1181 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=544417, ups=1.1, wpb=496552, bsz=16761.9, num_updates=48400, lr=0.00028748, gnorm=0.16, clip=0, loss_scale=2, train_wall=90, gb_free=21.3, wall=44927 epoch 029: 1181 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=544417, ups=1.1, wpb=496552, bsz=16761.9, num_updates=48400, lr=0.00028748, gnorm=0.16, clip=0, loss_scale=2, train_wall=90, gb_free=21.3, wall=44927 epoch 029: 1181 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=544417, ups=1.1, wpb=496552, bsz=16761.9, num_updates=48400, lr=0.00028748, gnorm=0.16, clip=0, loss_scale=2, train_wall=90, gb_free=21.3, wall=44927 epoch 029: 1181 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=544417, ups=1.1, wpb=496552, bsz=16761.9, num_updates=48400, lr=0.00028748, gnorm=0.16, clip=0, loss_scale=2, train_wall=90, gb_free=21.3, wall=44927 epoch 029: 1181 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=544417, ups=1.1, wpb=496552, bsz=16761.9, num_updates=48400, lr=0.00028748, gnorm=0.16, clip=0, loss_scale=2, train_wall=90, gb_free=21.3, wall=44927 epoch 029: 1181 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=544417, ups=1.1, wpb=496552, bsz=16761.9, num_updates=48400, lr=0.00028748, gnorm=0.16, clip=0, loss_scale=2, train_wall=90, gb_free=21.3, wall=44927 epoch 029: 1181 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=544417, ups=1.1, wpb=496552, bsz=16761.9, num_updates=48400, lr=0.00028748, gnorm=0.16, clip=0, loss_scale=2, train_wall=90, gb_free=21.3, wall=44927 epoch 029: 1181 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=544417, ups=1.1, wpb=496552, bsz=16761.9, num_updates=48400, lr=0.00028748, gnorm=0.16, clip=0, loss_scale=2, train_wall=90, gb_free=21.3, wall=44927 epoch 029: 1181 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=544417, ups=1.1, wpb=496552, bsz=16761.9, num_updates=48400, lr=0.00028748, gnorm=0.16, clip=0, loss_scale=2, train_wall=90, gb_free=21.3, wall=44927 epoch 029: 1181 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=544417, ups=1.1, wpb=496552, bsz=16761.9, num_updates=48400, lr=0.00028748, gnorm=0.16, clip=0, loss_scale=2, train_wall=90, gb_free=21.3, wall=44927 epoch 029: 1181 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=544417, ups=1.1, wpb=496552, bsz=16761.9, num_updates=48400, lr=0.00028748, gnorm=0.16, clip=0, loss_scale=2, train_wall=90, gb_free=21.3, wall=44927 epoch 029: 1181 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=544417, ups=1.1, wpb=496552, bsz=16761.9, num_updates=48400, lr=0.00028748, gnorm=0.16, clip=0, loss_scale=2, train_wall=90, gb_free=21.3, wall=44927 epoch 029: 1181 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=544417, ups=1.1, wpb=496552, bsz=16761.9, num_updates=48400, lr=0.00028748, gnorm=0.16, clip=0, loss_scale=2, train_wall=90, gb_free=21.3, wall=44927 epoch 029: 1181 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=544417, ups=1.1, wpb=496552, bsz=16761.9, num_updates=48400, lr=0.00028748, gnorm=0.16, clip=0, loss_scale=2, train_wall=90, gb_free=21.3, wall=44927 epoch 029: 1181 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=544417, ups=1.1, wpb=496552, bsz=16761.9, num_updates=48400, lr=0.00028748, gnorm=0.16, clip=0, loss_scale=2, train_wall=90, gb_free=21.3, wall=44927 epoch 029: 1181 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=544417, ups=1.1, wpb=496552, bsz=16761.9, num_updates=48400, lr=0.00028748, gnorm=0.16, clip=0, loss_scale=2, train_wall=90, gb_free=21.3, wall=44927 epoch 029: 1181 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=544417, ups=1.1, wpb=496552, bsz=16761.9, num_updates=48400, lr=0.00028748, gnorm=0.16, clip=0, loss_scale=2, train_wall=90, gb_free=21.3, wall=44927 epoch 029: 1181 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=544417, ups=1.1, wpb=496552, bsz=16761.9, num_updates=48400, lr=0.00028748, gnorm=0.16, clip=0, loss_scale=2, train_wall=90, gb_free=21.3, wall=44927 epoch 029: 1181 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=544417, ups=1.1, wpb=496552, bsz=16761.9, num_updates=48400, lr=0.00028748, gnorm=0.16, clip=0, loss_scale=2, train_wall=90, gb_free=21.3, wall=44927 epoch 029: 1181 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=544417, ups=1.1, wpb=496552, bsz=16761.9, num_updates=48400, lr=0.00028748, gnorm=0.16, clip=0, loss_scale=2, train_wall=90, gb_free=21.3, wall=44927 epoch 029: 1181 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=544417, ups=1.1, wpb=496552, bsz=16761.9, num_updates=48400, lr=0.00028748, gnorm=0.16, clip=0, loss_scale=2, train_wall=90, gb_free=21.3, wall=44927 epoch 029: 1181 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=544417, ups=1.1, wpb=496552, bsz=16761.9, num_updates=48400, lr=0.00028748, gnorm=0.16, clip=0, loss_scale=2, train_wall=90, gb_free=21.3, wall=44927 epoch 029: 1181 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=544417, ups=1.1, wpb=496552, bsz=16761.9, num_updates=48400, lr=0.00028748, gnorm=0.16, clip=0, loss_scale=2, train_wall=90, gb_free=21.3, wall=44927 epoch 029: 1181 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=544417, ups=1.1, wpb=496552, bsz=16761.9, num_updates=48400, lr=0.00028748, gnorm=0.16, clip=0, loss_scale=2, train_wall=90, gb_free=21.3, wall=44927 epoch 029: 1181 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=544417, ups=1.1, wpb=496552, bsz=16761.9, num_updates=48400, lr=0.00028748, gnorm=0.16, clip=0, loss_scale=2, train_wall=90, gb_free=21.3, wall=44927 epoch 029: 1181 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=544417, ups=1.1, wpb=496552, bsz=16761.9, num_updates=48400, lr=0.00028748, gnorm=0.16, clip=0, loss_scale=2, train_wall=90, gb_free=21.3, wall=44927 epoch 029: 1181 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=544417, ups=1.1, wpb=496552, bsz=16761.9, num_updates=48400, lr=0.00028748, gnorm=0.16, clip=0, loss_scale=2, train_wall=90, gb_free=21.3, wall=44927 epoch 029: 1181 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=544417, ups=1.1, wpb=496552, bsz=16761.9, num_updates=48400, lr=0.00028748, gnorm=0.16, clip=0, loss_scale=2, train_wall=90, gb_free=21.3, wall=44927 epoch 029: 1281 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=552060, ups=1.11, wpb=496044, bsz=16484.4, num_updates=48500, lr=0.000287183, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=45017 epoch 029: 1281 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=552060, ups=1.11, wpb=496044, bsz=16484.4, num_updates=48500, lr=0.000287183, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=45017 epoch 029: 1281 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=552060, ups=1.11, wpb=496044, bsz=16484.4, num_updates=48500, lr=0.000287183, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=45017 epoch 029: 1281 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=552060, ups=1.11, wpb=496044, bsz=16484.4, num_updates=48500, lr=0.000287183, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=45017 epoch 029: 1281 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=552060, ups=1.11, wpb=496044, bsz=16484.4, num_updates=48500, lr=0.000287183, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=45017 epoch 029: 1281 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=552060, ups=1.11, wpb=496044, bsz=16484.4, num_updates=48500, lr=0.000287183, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=45017 epoch 029: 1281 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=552060, ups=1.11, wpb=496044, bsz=16484.4, num_updates=48500, lr=0.000287183, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=45017 epoch 029: 1281 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=552060, ups=1.11, wpb=496044, bsz=16484.4, num_updates=48500, lr=0.000287183, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=45017 epoch 029: 1281 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=552060, ups=1.11, wpb=496044, bsz=16484.4, num_updates=48500, lr=0.000287183, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=45017 epoch 029: 1281 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=552060, ups=1.11, wpb=496044, bsz=16484.4, num_updates=48500, lr=0.000287183, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=45017 epoch 029: 1281 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=552060, ups=1.11, wpb=496044, bsz=16484.4, num_updates=48500, lr=0.000287183, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=45017 epoch 029: 1281 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=552060, ups=1.11, wpb=496044, bsz=16484.4, num_updates=48500, lr=0.000287183, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=45017 epoch 029: 1281 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=552060, ups=1.11, wpb=496044, bsz=16484.4, num_updates=48500, lr=0.000287183, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=45017 epoch 029: 1281 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=552060, ups=1.11, wpb=496044, bsz=16484.4, num_updates=48500, lr=0.000287183, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=45017 epoch 029: 1281 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=552060, ups=1.11, wpb=496044, bsz=16484.4, num_updates=48500, lr=0.000287183, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=45017 epoch 029: 1281 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=552060, ups=1.11, wpb=496044, bsz=16484.4, num_updates=48500, lr=0.000287183, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=45017 epoch 029: 1281 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=552060, ups=1.11, wpb=496044, bsz=16484.4, num_updates=48500, lr=0.000287183, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=45017 epoch 029: 1281 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=552060, ups=1.11, wpb=496044, bsz=16484.4, num_updates=48500, lr=0.000287183, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=45017 epoch 029: 1281 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=552060, ups=1.11, wpb=496044, bsz=16484.4, num_updates=48500, lr=0.000287183, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=45017 epoch 029: 1281 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=552060, ups=1.11, wpb=496044, bsz=16484.4, num_updates=48500, lr=0.000287183, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=45017 epoch 029: 1281 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=552060, ups=1.11, wpb=496044, bsz=16484.4, num_updates=48500, lr=0.000287183, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=45017 epoch 029: 1281 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=552060, ups=1.11, wpb=496044, bsz=16484.4, num_updates=48500, lr=0.000287183, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=45017 epoch 029: 1281 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=552060, ups=1.11, wpb=496044, bsz=16484.4, num_updates=48500, lr=0.000287183, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=45017 epoch 029: 1281 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=552060, ups=1.11, wpb=496044, bsz=16484.4, num_updates=48500, lr=0.000287183, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=45017 epoch 029: 1281 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=552060, ups=1.11, wpb=496044, bsz=16484.4, num_updates=48500, lr=0.000287183, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=45017 epoch 029: 1281 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=552060, ups=1.11, wpb=496044, bsz=16484.4, num_updates=48500, lr=0.000287183, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=45017 epoch 029: 1281 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=552060, ups=1.11, wpb=496044, bsz=16484.4, num_updates=48500, lr=0.000287183, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=45017 epoch 029: 1281 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=552060, ups=1.11, wpb=496044, bsz=16484.4, num_updates=48500, lr=0.000287183, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=45017 epoch 029: 1281 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=552060, ups=1.11, wpb=496044, bsz=16484.4, num_updates=48500, lr=0.000287183, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=45017 epoch 029: 1381 / 1689 loss=3.688, nll_loss=2.154, ppl=4.45, wps=553803, ups=1.11, wpb=496717, bsz=16559.7, num_updates=48600, lr=0.000286888, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=22.3, wall=45107 epoch 029: 1381 / 1689 loss=3.688, nll_loss=2.154, ppl=4.45, wps=553803, ups=1.11, wpb=496717, bsz=16559.7, num_updates=48600, lr=0.000286888, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=22.3, wall=45107 epoch 029: 1381 / 1689 loss=3.688, nll_loss=2.154, ppl=4.45, wps=553803, ups=1.11, wpb=496717, bsz=16559.7, num_updates=48600, lr=0.000286888, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=22.3, wall=45107 epoch 029: 1381 / 1689 loss=3.688, nll_loss=2.154, ppl=4.45, wps=553803, ups=1.11, wpb=496717, bsz=16559.7, num_updates=48600, lr=0.000286888, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=22.3, wall=45107 epoch 029: 1381 / 1689 loss=3.688, nll_loss=2.154, ppl=4.45, wps=553803, ups=1.11, wpb=496717, bsz=16559.7, num_updates=48600, lr=0.000286888, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=22.3, wall=45107 epoch 029: 1381 / 1689 loss=3.688, nll_loss=2.154, ppl=4.45, wps=553803, ups=1.11, wpb=496717, bsz=16559.7, num_updates=48600, lr=0.000286888, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=22.3, wall=45107 epoch 029: 1381 / 1689 loss=3.688, nll_loss=2.154, ppl=4.45, wps=553803, ups=1.11, wpb=496717, bsz=16559.7, num_updates=48600, lr=0.000286888, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=22.3, wall=45107 epoch 029: 1381 / 1689 loss=3.688, nll_loss=2.154, ppl=4.45, wps=553803, ups=1.11, wpb=496717, bsz=16559.7, num_updates=48600, lr=0.000286888, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=22.3, wall=45107 epoch 029: 1381 / 1689 loss=3.688, nll_loss=2.154, ppl=4.45, wps=553803, ups=1.11, wpb=496717, bsz=16559.7, num_updates=48600, lr=0.000286888, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=22.3, wall=45107 epoch 029: 1381 / 1689 loss=3.688, nll_loss=2.154, ppl=4.45, wps=553803, ups=1.11, wpb=496717, bsz=16559.7, num_updates=48600, lr=0.000286888, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=22.3, wall=45107 epoch 029: 1381 / 1689 loss=3.688, nll_loss=2.154, ppl=4.45, wps=553803, ups=1.11, wpb=496717, bsz=16559.7, num_updates=48600, lr=0.000286888, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=22.3, wall=45107 epoch 029: 1381 / 1689 loss=3.688, nll_loss=2.154, ppl=4.45, wps=553803, ups=1.11, wpb=496717, bsz=16559.7, num_updates=48600, lr=0.000286888, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=22.3, wall=45107 epoch 029: 1381 / 1689 loss=3.688, nll_loss=2.154, ppl=4.45, wps=553803, ups=1.11, wpb=496717, bsz=16559.7, num_updates=48600, lr=0.000286888, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=22.3, wall=45107 epoch 029: 1381 / 1689 loss=3.688, nll_loss=2.154, ppl=4.45, wps=553803, ups=1.11, wpb=496717, bsz=16559.7, num_updates=48600, lr=0.000286888, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=22.3, wall=45107 epoch 029: 1381 / 1689 loss=3.688, nll_loss=2.154, ppl=4.45, wps=553803, ups=1.11, wpb=496717, bsz=16559.7, num_updates=48600, lr=0.000286888, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=22.3, wall=45107 epoch 029: 1381 / 1689 loss=3.688, nll_loss=2.154, ppl=4.45, wps=553803, ups=1.11, wpb=496717, bsz=16559.7, num_updates=48600, lr=0.000286888, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=22.3, wall=45107 epoch 029: 1381 / 1689 loss=3.688, nll_loss=2.154, ppl=4.45, wps=553803, ups=1.11, wpb=496717, bsz=16559.7, num_updates=48600, lr=0.000286888, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=22.3, wall=45107 epoch 029: 1381 / 1689 loss=3.688, nll_loss=2.154, ppl=4.45, wps=553803, ups=1.11, wpb=496717, bsz=16559.7, num_updates=48600, lr=0.000286888, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=22.3, wall=45107 epoch 029: 1381 / 1689 loss=3.688, nll_loss=2.154, ppl=4.45, wps=553803, ups=1.11, wpb=496717, bsz=16559.7, num_updates=48600, lr=0.000286888, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=22.3, wall=45107 epoch 029: 1381 / 1689 loss=3.688, nll_loss=2.154, ppl=4.45, wps=553803, ups=1.11, wpb=496717, bsz=16559.7, num_updates=48600, lr=0.000286888, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=22.3, wall=45107 epoch 029: 1381 / 1689 loss=3.688, nll_loss=2.154, ppl=4.45, wps=553803, ups=1.11, wpb=496717, bsz=16559.7, num_updates=48600, lr=0.000286888, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=22.3, wall=45107 epoch 029: 1381 / 1689 loss=3.688, nll_loss=2.154, ppl=4.45, wps=553803, ups=1.11, wpb=496717, bsz=16559.7, num_updates=48600, lr=0.000286888, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=22.3, wall=45107 epoch 029: 1381 / 1689 loss=3.688, nll_loss=2.154, ppl=4.45, wps=553803, ups=1.11, wpb=496717, bsz=16559.7, num_updates=48600, lr=0.000286888, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=22.3, wall=45107 epoch 029: 1381 / 1689 loss=3.688, nll_loss=2.154, ppl=4.45, wps=553803, ups=1.11, wpb=496717, bsz=16559.7, num_updates=48600, lr=0.000286888, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=22.3, wall=45107 epoch 029: 1381 / 1689 loss=3.688, nll_loss=2.154, ppl=4.45, wps=553803, ups=1.11, wpb=496717, bsz=16559.7, num_updates=48600, lr=0.000286888, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=22.3, wall=45107 epoch 029: 1381 / 1689 loss=3.688, nll_loss=2.154, ppl=4.45, wps=553803, ups=1.11, wpb=496717, bsz=16559.7, num_updates=48600, lr=0.000286888, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=22.3, wall=45107 epoch 029: 1381 / 1689 loss=3.688, nll_loss=2.154, ppl=4.45, wps=553803, ups=1.11, wpb=496717, bsz=16559.7, num_updates=48600, lr=0.000286888, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=22.3, wall=45107 epoch 029: 1381 / 1689 loss=3.688, nll_loss=2.154, ppl=4.45, wps=553803, ups=1.11, wpb=496717, bsz=16559.7, num_updates=48600, lr=0.000286888, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=22.3, wall=45107 epoch 029: 1381 / 1689 loss=3.688, nll_loss=2.154, ppl=4.45, wps=553803, ups=1.11, wpb=496717, bsz=16559.7, num_updates=48600, lr=0.000286888, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=22.3, wall=45107 epoch 029: 1481 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=551193, ups=1.11, wpb=494716, bsz=16443.8, num_updates=48700, lr=0.000286593, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=20, wall=45197 epoch 029: 1481 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=551193, ups=1.11, wpb=494716, bsz=16443.8, num_updates=48700, lr=0.000286593, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=20, wall=45197 epoch 029: 1481 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=551193, ups=1.11, wpb=494716, bsz=16443.8, num_updates=48700, lr=0.000286593, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=20, wall=45197 epoch 029: 1481 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=551193, ups=1.11, wpb=494716, bsz=16443.8, num_updates=48700, lr=0.000286593, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=20, wall=45197 epoch 029: 1481 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=551193, ups=1.11, wpb=494716, bsz=16443.8, num_updates=48700, lr=0.000286593, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=20, wall=45197 epoch 029: 1481 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=551193, ups=1.11, wpb=494716, bsz=16443.8, num_updates=48700, lr=0.000286593, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=20, wall=45197 epoch 029: 1481 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=551193, ups=1.11, wpb=494716, bsz=16443.8, num_updates=48700, lr=0.000286593, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=20, wall=45197 epoch 029: 1481 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=551193, ups=1.11, wpb=494716, bsz=16443.8, num_updates=48700, lr=0.000286593, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=20, wall=45197 epoch 029: 1481 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=551193, ups=1.11, wpb=494716, bsz=16443.8, num_updates=48700, lr=0.000286593, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=20, wall=45197 epoch 029: 1481 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=551193, ups=1.11, wpb=494716, bsz=16443.8, num_updates=48700, lr=0.000286593, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=20, wall=45197 epoch 029: 1481 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=551193, ups=1.11, wpb=494716, bsz=16443.8, num_updates=48700, lr=0.000286593, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=20, wall=45197 epoch 029: 1481 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=551193, ups=1.11, wpb=494716, bsz=16443.8, num_updates=48700, lr=0.000286593, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=20, wall=45197 epoch 029: 1481 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=551193, ups=1.11, wpb=494716, bsz=16443.8, num_updates=48700, lr=0.000286593, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=20, wall=45197 epoch 029: 1481 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=551193, ups=1.11, wpb=494716, bsz=16443.8, num_updates=48700, lr=0.000286593, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=20, wall=45197 epoch 029: 1481 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=551193, ups=1.11, wpb=494716, bsz=16443.8, num_updates=48700, lr=0.000286593, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=20, wall=45197 epoch 029: 1481 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=551193, ups=1.11, wpb=494716, bsz=16443.8, num_updates=48700, lr=0.000286593, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=20, wall=45197 epoch 029: 1481 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=551193, ups=1.11, wpb=494716, bsz=16443.8, num_updates=48700, lr=0.000286593, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=20, wall=45197 epoch 029: 1481 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=551193, ups=1.11, wpb=494716, bsz=16443.8, num_updates=48700, lr=0.000286593, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=20, wall=45197 epoch 029: 1481 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=551193, ups=1.11, wpb=494716, bsz=16443.8, num_updates=48700, lr=0.000286593, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=20, wall=45197 epoch 029: 1481 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=551193, ups=1.11, wpb=494716, bsz=16443.8, num_updates=48700, lr=0.000286593, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=20, wall=45197 epoch 029: 1481 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=551193, ups=1.11, wpb=494716, bsz=16443.8, num_updates=48700, lr=0.000286593, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=20, wall=45197 epoch 029: 1481 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=551193, ups=1.11, wpb=494716, bsz=16443.8, num_updates=48700, lr=0.000286593, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=20, wall=45197 epoch 029: 1481 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=551193, ups=1.11, wpb=494716, bsz=16443.8, num_updates=48700, lr=0.000286593, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=20, wall=45197 epoch 029: 1481 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=551193, ups=1.11, wpb=494716, bsz=16443.8, num_updates=48700, lr=0.000286593, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=20, wall=45197 epoch 029: 1481 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=551193, ups=1.11, wpb=494716, bsz=16443.8, num_updates=48700, lr=0.000286593, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=20, wall=45197 epoch 029: 1481 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=551193, ups=1.11, wpb=494716, bsz=16443.8, num_updates=48700, lr=0.000286593, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=20, wall=45197 epoch 029: 1481 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=551193, ups=1.11, wpb=494716, bsz=16443.8, num_updates=48700, lr=0.000286593, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=20, wall=45197 epoch 029: 1481 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=551193, ups=1.11, wpb=494716, bsz=16443.8, num_updates=48700, lr=0.000286593, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=20, wall=45197 epoch 029: 1481 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=551193, ups=1.11, wpb=494716, bsz=16443.8, num_updates=48700, lr=0.000286593, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=20, wall=45197 epoch 029: 1582 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=541078, ups=1.1, wpb=494042, bsz=16209.3, num_updates=48800, lr=0.000286299, gnorm=0.162, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=45288 epoch 029: 1582 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=541078, ups=1.1, wpb=494042, bsz=16209.3, num_updates=48800, lr=0.000286299, gnorm=0.162, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=45288 epoch 029: 1582 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=541078, ups=1.1, wpb=494042, bsz=16209.3, num_updates=48800, lr=0.000286299, gnorm=0.162, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=45288 epoch 029: 1582 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=541078, ups=1.1, wpb=494042, bsz=16209.3, num_updates=48800, lr=0.000286299, gnorm=0.162, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=45288 epoch 029: 1582 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=541078, ups=1.1, wpb=494042, bsz=16209.3, num_updates=48800, lr=0.000286299, gnorm=0.162, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=45288 epoch 029: 1582 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=541078, ups=1.1, wpb=494042, bsz=16209.3, num_updates=48800, lr=0.000286299, gnorm=0.162, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=45288 epoch 029: 1582 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=541078, ups=1.1, wpb=494042, bsz=16209.3, num_updates=48800, lr=0.000286299, gnorm=0.162, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=45288 epoch 029: 1582 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=541078, ups=1.1, wpb=494042, bsz=16209.3, num_updates=48800, lr=0.000286299, gnorm=0.162, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=45288 epoch 029: 1582 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=541078, ups=1.1, wpb=494042, bsz=16209.3, num_updates=48800, lr=0.000286299, gnorm=0.162, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=45288 epoch 029: 1582 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=541078, ups=1.1, wpb=494042, bsz=16209.3, num_updates=48800, lr=0.000286299, gnorm=0.162, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=45288 epoch 029: 1582 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=541078, ups=1.1, wpb=494042, bsz=16209.3, num_updates=48800, lr=0.000286299, gnorm=0.162, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=45288 epoch 029: 1582 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=541078, ups=1.1, wpb=494042, bsz=16209.3, num_updates=48800, lr=0.000286299, gnorm=0.162, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=45288 epoch 029: 1582 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=541078, ups=1.1, wpb=494042, bsz=16209.3, num_updates=48800, lr=0.000286299, gnorm=0.162, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=45288 epoch 029: 1582 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=541078, ups=1.1, wpb=494042, bsz=16209.3, num_updates=48800, lr=0.000286299, gnorm=0.162, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=45288 epoch 029: 1582 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=541078, ups=1.1, wpb=494042, bsz=16209.3, num_updates=48800, lr=0.000286299, gnorm=0.162, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=45288 epoch 029: 1582 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=541078, ups=1.1, wpb=494042, bsz=16209.3, num_updates=48800, lr=0.000286299, gnorm=0.162, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=45288 epoch 029: 1582 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=541078, ups=1.1, wpb=494042, bsz=16209.3, num_updates=48800, lr=0.000286299, gnorm=0.162, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=45288 epoch 029: 1582 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=541078, ups=1.1, wpb=494042, bsz=16209.3, num_updates=48800, lr=0.000286299, gnorm=0.162, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=45288 epoch 029: 1582 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=541078, ups=1.1, wpb=494042, bsz=16209.3, num_updates=48800, lr=0.000286299, gnorm=0.162, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=45288 epoch 029: 1582 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=541078, ups=1.1, wpb=494042, bsz=16209.3, num_updates=48800, lr=0.000286299, gnorm=0.162, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=45288 epoch 029: 1582 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=541078, ups=1.1, wpb=494042, bsz=16209.3, num_updates=48800, lr=0.000286299, gnorm=0.162, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=45288 epoch 029: 1582 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=541078, ups=1.1, wpb=494042, bsz=16209.3, num_updates=48800, lr=0.000286299, gnorm=0.162, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=45288 epoch 029: 1582 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=541078, ups=1.1, wpb=494042, bsz=16209.3, num_updates=48800, lr=0.000286299, gnorm=0.162, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=45288 epoch 029: 1582 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=541078, ups=1.1, wpb=494042, bsz=16209.3, num_updates=48800, lr=0.000286299, gnorm=0.162, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=45288 epoch 029: 1582 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=541078, ups=1.1, wpb=494042, bsz=16209.3, num_updates=48800, lr=0.000286299, gnorm=0.162, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=45288 epoch 029: 1582 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=541078, ups=1.1, wpb=494042, bsz=16209.3, num_updates=48800, lr=0.000286299, gnorm=0.162, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=45288 epoch 029: 1582 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=541078, ups=1.1, wpb=494042, bsz=16209.3, num_updates=48800, lr=0.000286299, gnorm=0.162, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=45288 epoch 029: 1582 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=541078, ups=1.1, wpb=494042, bsz=16209.3, num_updates=48800, lr=0.000286299, gnorm=0.162, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=45288 epoch 029: 1582 / 1689 loss=3.682, nll_loss=2.146, ppl=4.43, wps=541078, ups=1.1, wpb=494042, bsz=16209.3, num_updates=48800, lr=0.000286299, gnorm=0.162, clip=0, loss_scale=2, train_wall=90, gb_free=22, wall=45288 epoch 029: 1682 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=550649, ups=1.11, wpb=496510, bsz=16245.8, num_updates=48900, lr=0.000286006, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=45378 epoch 029: 1682 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=550649, ups=1.11, wpb=496510, bsz=16245.8, num_updates=48900, lr=0.000286006, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=45378 epoch 029: 1682 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=550649, ups=1.11, wpb=496510, bsz=16245.8, num_updates=48900, lr=0.000286006, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=45378 epoch 029: 1682 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=550649, ups=1.11, wpb=496510, bsz=16245.8, num_updates=48900, lr=0.000286006, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=45378 epoch 029: 1682 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=550649, ups=1.11, wpb=496510, bsz=16245.8, num_updates=48900, lr=0.000286006, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=45378 epoch 029: 1682 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=550649, ups=1.11, wpb=496510, bsz=16245.8, num_updates=48900, lr=0.000286006, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=45378 epoch 029: 1682 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=550649, ups=1.11, wpb=496510, bsz=16245.8, num_updates=48900, lr=0.000286006, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=45378 epoch 029: 1682 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=550649, ups=1.11, wpb=496510, bsz=16245.8, num_updates=48900, lr=0.000286006, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=45378 epoch 029: 1682 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=550649, ups=1.11, wpb=496510, bsz=16245.8, num_updates=48900, lr=0.000286006, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=45378 epoch 029: 1682 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=550649, ups=1.11, wpb=496510, bsz=16245.8, num_updates=48900, lr=0.000286006, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=45378 epoch 029: 1682 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=550649, ups=1.11, wpb=496510, bsz=16245.8, num_updates=48900, lr=0.000286006, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=45378 epoch 029: 1682 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=550649, ups=1.11, wpb=496510, bsz=16245.8, num_updates=48900, lr=0.000286006, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=45378 epoch 029: 1682 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=550649, ups=1.11, wpb=496510, bsz=16245.8, num_updates=48900, lr=0.000286006, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=45378 epoch 029: 1682 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=550649, ups=1.11, wpb=496510, bsz=16245.8, num_updates=48900, lr=0.000286006, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=45378 epoch 029: 1682 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=550649, ups=1.11, wpb=496510, bsz=16245.8, num_updates=48900, lr=0.000286006, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=45378 epoch 029: 1682 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=550649, ups=1.11, wpb=496510, bsz=16245.8, num_updates=48900, lr=0.000286006, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=45378 epoch 029: 1682 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=550649, ups=1.11, wpb=496510, bsz=16245.8, num_updates=48900, lr=0.000286006, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=45378 epoch 029: 1682 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=550649, ups=1.11, wpb=496510, bsz=16245.8, num_updates=48900, lr=0.000286006, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=45378 epoch 029: 1682 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=550649, ups=1.11, wpb=496510, bsz=16245.8, num_updates=48900, lr=0.000286006, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=45378 epoch 029: 1682 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=550649, ups=1.11, wpb=496510, bsz=16245.8, num_updates=48900, lr=0.000286006, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=45378 epoch 029: 1682 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=550649, ups=1.11, wpb=496510, bsz=16245.8, num_updates=48900, lr=0.000286006, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=45378 epoch 029: 1682 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=550649, ups=1.11, wpb=496510, bsz=16245.8, num_updates=48900, lr=0.000286006, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=45378 epoch 029: 1682 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=550649, ups=1.11, wpb=496510, bsz=16245.8, num_updates=48900, lr=0.000286006, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=45378 epoch 029: 1682 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=550649, ups=1.11, wpb=496510, bsz=16245.8, num_updates=48900, lr=0.000286006, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=45378 epoch 029: 1682 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=550649, ups=1.11, wpb=496510, bsz=16245.8, num_updates=48900, lr=0.000286006, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=45378 epoch 029: 1682 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=550649, ups=1.11, wpb=496510, bsz=16245.8, num_updates=48900, lr=0.000286006, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=45378 epoch 029: 1682 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=550649, ups=1.11, wpb=496510, bsz=16245.8, num_updates=48900, lr=0.000286006, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=45378 epoch 029: 1682 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=550649, ups=1.11, wpb=496510, bsz=16245.8, num_updates=48900, lr=0.000286006, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=45378 epoch 029: 1682 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=550649, ups=1.11, wpb=496510, bsz=16245.8, num_updates=48900, lr=0.000286006, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.1, wall=45378 end of epoch 29 (average epoch stats below) epoch 029 | loss 3.677 | nll_loss 2.14 | ppl 4.41 | wps 541397 | ups 1.09 | wpb 495104 | bsz 16501.7 | num_updates 48907 | lr 0.000285986 | gnorm 0.168 | clip 0 | loss_scale 2 | train_wall 1497 | gb_free 25.7 | wall 45384 epoch 029 | loss 3.677 | nll_loss 2.14 | ppl 4.41 | wps 541397 | ups 1.09 | wpb 495104 | bsz 16501.7 | num_updates 48907 | lr 0.000285986 | gnorm 0.168 | clip 0 | loss_scale 2 | train_wall 1497 | gb_free 25.7 | wall 45384 epoch 029 | loss 3.677 | nll_loss 2.14 | ppl 4.41 | wps 541397 | ups 1.09 | wpb 495104 | bsz 16501.7 | num_updates 48907 | lr 0.000285986 | gnorm 0.168 | clip 0 | loss_scale 2 | train_wall 1497 | gb_free 25.7 | wall 45384 epoch 029 | loss 3.677 | nll_loss 2.14 | ppl 4.41 | wps 541397 | ups 1.09 | wpb 495104 | bsz 16501.7 | num_updates 48907 | lr 0.000285986 | gnorm 0.168 | clip 0 | loss_scale 2 | train_wall 1497 | gb_free 25.7 | wall 45384 epoch 029 | loss 3.677 | nll_loss 2.14 | ppl 4.41 | wps 541397 | ups 1.09 | wpb 495104 | bsz 16501.7 | num_updates 48907 | lr 0.000285986 | gnorm 0.168 | clip 0 | loss_scale 2 | train_wall 1497 | gb_free 25.7 | wall 45384 epoch 029 | loss 3.677 | nll_loss 2.14 | ppl 4.41 | wps 541397 | ups 1.09 | wpb 495104 | bsz 16501.7 | num_updates 48907 | lr 0.000285986 | gnorm 0.168 | clip 0 | loss_scale 2 | train_wall 1497 | gb_free 25.7 | wall 45384 epoch 029 | loss 3.677 | nll_loss 2.14 | ppl 4.41 | wps 541397 | ups 1.09 | wpb 495104 | bsz 16501.7 | num_updates 48907 | lr 0.000285986 | gnorm 0.168 | clip 0 | loss_scale 2 | train_wall 1497 | gb_free 25.7 | wall 45384 epoch 029 | loss 3.677 | nll_loss 2.14 | ppl 4.41 | wps 541397 | ups 1.09 | wpb 495104 | bsz 16501.7 | num_updates 48907 | lr 0.000285986 | gnorm 0.168 | clip 0 | loss_scale 2 | train_wall 1497 | gb_free 25.7 | wall 45384 epoch 029 | loss 3.677 | nll_loss 2.14 | ppl 4.41 | wps 541397 | ups 1.09 | wpb 495104 | bsz 16501.7 | num_updates 48907 | lr 0.000285986 | gnorm 0.168 | clip 0 | loss_scale 2 | train_wall 1497 | gb_free 25.7 | wall 45384 epoch 029 | loss 3.677 | nll_loss 2.14 | ppl 4.41 | wps 541397 | ups 1.09 | wpb 495104 | bsz 16501.7 | num_updates 48907 | lr 0.000285986 | gnorm 0.168 | clip 0 | loss_scale 2 | train_wall 1497 | gb_free 25.7 | wall 45384 epoch 029 | loss 3.677 | nll_loss 2.14 | ppl 4.41 | wps 541397 | ups 1.09 | wpb 495104 | bsz 16501.7 | num_updates 48907 | lr 0.000285986 | gnorm 0.168 | clip 0 | loss_scale 2 | train_wall 1497 | gb_free 25.7 | wall 45384 epoch 029 | loss 3.677 | nll_loss 2.14 | ppl 4.41 | wps 541397 | ups 1.09 | wpb 495104 | bsz 16501.7 | num_updates 48907 | lr 0.000285986 | gnorm 0.168 | clip 0 | loss_scale 2 | train_wall 1497 | gb_free 25.7 | wall 45384 epoch 029 | loss 3.677 | nll_loss 2.14 | ppl 4.41 | wps 541397 | ups 1.09 | wpb 495104 | bsz 16501.7 | num_updates 48907 | lr 0.000285986 | gnorm 0.168 | clip 0 | loss_scale 2 | train_wall 1497 | gb_free 25.7 | wall 45384 epoch 029 | loss 3.677 | nll_loss 2.14 | ppl 4.41 | wps 541397 | ups 1.09 | wpb 495104 | bsz 16501.7 | num_updates 48907 | lr 0.000285986 | gnorm 0.168 | clip 0 | loss_scale 2 | train_wall 1497 | gb_free 25.7 | wall 45384 epoch 029 | loss 3.677 | nll_loss 2.14 | ppl 4.41 | wps 541397 | ups 1.09 | wpb 495104 | bsz 16501.7 | num_updates 48907 | lr 0.000285986 | gnorm 0.168 | clip 0 | loss_scale 2 | train_wall 1497 | gb_free 25.7 | wall 45384 epoch 029 | loss 3.677 | nll_loss 2.14 | ppl 4.41 | wps 541397 | ups 1.09 | wpb 495104 | bsz 16501.7 | num_updates 48907 | lr 0.000285986 | gnorm 0.168 | clip 0 | loss_scale 2 | train_wall 1497 | gb_free 25.7 | wall 45384 epoch 029 | loss 3.677 | nll_loss 2.14 | ppl 4.41 | wps 541397 | ups 1.09 | wpb 495104 | bsz 16501.7 | num_updates 48907 | lr 0.000285986 | gnorm 0.168 | clip 0 | loss_scale 2 | train_wall 1497 | gb_free 25.7 | wall 45384 epoch 029 | loss 3.677 | nll_loss 2.14 | ppl 4.41 | wps 541397 | ups 1.09 | wpb 495104 | bsz 16501.7 | num_updates 48907 | lr 0.000285986 | gnorm 0.168 | clip 0 | loss_scale 2 | train_wall 1497 | gb_free 25.7 | wall 45384 epoch 029 | loss 3.677 | nll_loss 2.14 | ppl 4.41 | wps 541397 | ups 1.09 | wpb 495104 | bsz 16501.7 | num_updates 48907 | lr 0.000285986 | gnorm 0.168 | clip 0 | loss_scale 2 | train_wall 1497 | gb_free 25.7 | wall 45384 epoch 029 | loss 3.677 | nll_loss 2.14 | ppl 4.41 | wps 541397 | ups 1.09 | wpb 495104 | bsz 16501.7 | num_updates 48907 | lr 0.000285986 | gnorm 0.168 | clip 0 | loss_scale 2 | train_wall 1497 | gb_free 25.7 | wall 45384 epoch 029 | loss 3.677 | nll_loss 2.14 | ppl 4.41 | wps 541397 | ups 1.09 | wpb 495104 | bsz 16501.7 | num_updates 48907 | lr 0.000285986 | gnorm 0.168 | clip 0 | loss_scale 2 | train_wall 1497 | gb_free 25.7 | wall 45384 epoch 029 | loss 3.677 | nll_loss 2.14 | ppl 4.41 | wps 541397 | ups 1.09 | wpb 495104 | bsz 16501.7 | num_updates 48907 | lr 0.000285986 | gnorm 0.168 | clip 0 | loss_scale 2 | train_wall 1497 | gb_free 25.7 | wall 45384 epoch 029 | loss 3.677 | nll_loss 2.14 | ppl 4.41 | wps 541397 | ups 1.09 | wpb 495104 | bsz 16501.7 | num_updates 48907 | lr 0.000285986 | gnorm 0.168 | clip 0 | loss_scale 2 | train_wall 1497 | gb_free 25.7 | wall 45384 epoch 029 | loss 3.677 | nll_loss 2.14 | ppl 4.41 | wps 541397 | ups 1.09 | wpb 495104 | bsz 16501.7 | num_updates 48907 | lr 0.000285986 | gnorm 0.168 | clip 0 | loss_scale 2 | train_wall 1497 | gb_free 25.7 | wall 45384 epoch 029 | loss 3.677 | nll_loss 2.14 | ppl 4.41 | wps 541397 | ups 1.09 | wpb 495104 | bsz 16501.7 | num_updates 48907 | lr 0.000285986 | gnorm 0.168 | clip 0 | loss_scale 2 | train_wall 1497 | gb_free 25.7 | wall 45384 epoch 029 | loss 3.677 | nll_loss 2.14 | ppl 4.41 | wps 541397 | ups 1.09 | wpb 495104 | bsz 16501.7 | num_updates 48907 | lr 0.000285986 | gnorm 0.168 | clip 0 | loss_scale 2 | train_wall 1497 | gb_free 25.7 | wall 45384 epoch 029 | loss 3.677 | nll_loss 2.14 | ppl 4.41 | wps 541397 | ups 1.09 | wpb 495104 | bsz 16501.7 | num_updates 48907 | lr 0.000285986 | gnorm 0.168 | clip 0 | loss_scale 2 | train_wall 1497 | gb_free 25.7 | wall 45384 epoch 029 | loss 3.677 | nll_loss 2.14 | ppl 4.41 | wps 541397 | ups 1.09 | wpb 495104 | bsz 16501.7 | num_updates 48907 | lr 0.000285986 | gnorm 0.168 | clip 0 | loss_scale 2 | train_wall 1497 | gb_free 25.7 | wall 45384 epoch 029 | loss 3.677 | nll_loss 2.14 | ppl 4.41 | wps 541397 | ups 1.09 | wpb 495104 | bsz 16501.7 | num_updates 48907 | lr 0.000285986 | gnorm 0.168 | clip 0 | loss_scale 2 | train_wall 1497 | gb_free 25.7 | wall 45384 Start iterating over samples epoch 030: 93 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=534730, ups=1.09, wpb=491302, bsz=16485.7, num_updates=49000, lr=0.000285714, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=45470 epoch 030: 93 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=534730, ups=1.09, wpb=491302, bsz=16485.7, num_updates=49000, lr=0.000285714, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=45470 epoch 030: 93 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=534730, ups=1.09, wpb=491302, bsz=16485.7, num_updates=49000, lr=0.000285714, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=45470 epoch 030: 93 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=534730, ups=1.09, wpb=491302, bsz=16485.7, num_updates=49000, lr=0.000285714, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=45470 epoch 030: 93 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=534730, ups=1.09, wpb=491302, bsz=16485.7, num_updates=49000, lr=0.000285714, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=45470 epoch 030: 93 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=534730, ups=1.09, wpb=491302, bsz=16485.7, num_updates=49000, lr=0.000285714, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=45470 epoch 030: 93 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=534730, ups=1.09, wpb=491302, bsz=16485.7, num_updates=49000, lr=0.000285714, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=45470 epoch 030: 93 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=534730, ups=1.09, wpb=491302, bsz=16485.7, num_updates=49000, lr=0.000285714, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=45470 epoch 030: 93 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=534730, ups=1.09, wpb=491302, bsz=16485.7, num_updates=49000, lr=0.000285714, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=45470 epoch 030: 93 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=534730, ups=1.09, wpb=491302, bsz=16485.7, num_updates=49000, lr=0.000285714, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=45470 epoch 030: 93 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=534730, ups=1.09, wpb=491302, bsz=16485.7, num_updates=49000, lr=0.000285714, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=45470 epoch 030: 93 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=534730, ups=1.09, wpb=491302, bsz=16485.7, num_updates=49000, lr=0.000285714, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=45470 epoch 030: 93 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=534730, ups=1.09, wpb=491302, bsz=16485.7, num_updates=49000, lr=0.000285714, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=45470 epoch 030: 93 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=534730, ups=1.09, wpb=491302, bsz=16485.7, num_updates=49000, lr=0.000285714, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=45470 epoch 030: 93 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=534730, ups=1.09, wpb=491302, bsz=16485.7, num_updates=49000, lr=0.000285714, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=45470 epoch 030: 93 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=534730, ups=1.09, wpb=491302, bsz=16485.7, num_updates=49000, lr=0.000285714, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=45470 epoch 030: 93 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=534730, ups=1.09, wpb=491302, bsz=16485.7, num_updates=49000, lr=0.000285714, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=45470 epoch 030: 93 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=534730, ups=1.09, wpb=491302, bsz=16485.7, num_updates=49000, lr=0.000285714, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=45470 epoch 030: 93 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=534730, ups=1.09, wpb=491302, bsz=16485.7, num_updates=49000, lr=0.000285714, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=45470 epoch 030: 93 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=534730, ups=1.09, wpb=491302, bsz=16485.7, num_updates=49000, lr=0.000285714, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=45470 epoch 030: 93 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=534730, ups=1.09, wpb=491302, bsz=16485.7, num_updates=49000, lr=0.000285714, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=45470 epoch 030: 93 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=534730, ups=1.09, wpb=491302, bsz=16485.7, num_updates=49000, lr=0.000285714, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=45470 epoch 030: 93 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=534730, ups=1.09, wpb=491302, bsz=16485.7, num_updates=49000, lr=0.000285714, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=45470 epoch 030: 93 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=534730, ups=1.09, wpb=491302, bsz=16485.7, num_updates=49000, lr=0.000285714, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=45470 epoch 030: 93 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=534730, ups=1.09, wpb=491302, bsz=16485.7, num_updates=49000, lr=0.000285714, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=45470 epoch 030: 93 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=534730, ups=1.09, wpb=491302, bsz=16485.7, num_updates=49000, lr=0.000285714, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=45470 epoch 030: 93 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=534730, ups=1.09, wpb=491302, bsz=16485.7, num_updates=49000, lr=0.000285714, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=45470 epoch 030: 93 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=534730, ups=1.09, wpb=491302, bsz=16485.7, num_updates=49000, lr=0.000285714, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=45470 epoch 030: 93 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=534730, ups=1.09, wpb=491302, bsz=16485.7, num_updates=49000, lr=0.000285714, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=45470 epoch 030: 93 / 1689 loss=3.666, nll_loss=2.128, ppl=4.37, wps=534730, ups=1.09, wpb=491302, bsz=16485.7, num_updates=49000, lr=0.000285714, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=21.3, wall=45470 begin validation on "valid" subset epoch 030 | valid on 'valid' subset | loss 3.711 | nll_loss 2.149 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.702 epoch 030 | valid on 'valid' subset | loss 3.711 | nll_loss 2.149 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.702 epoch 030 | valid on 'valid' subset | loss 3.711 | nll_loss 2.149 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.702 epoch 030 | valid on 'valid' subset | loss 3.711 | nll_loss 2.149 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.702 epoch 030 | valid on 'valid' subset | loss 3.711 | nll_loss 2.149 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.702 epoch 030 | valid on 'valid' subset | loss 3.711 | nll_loss 2.149 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.702 epoch 030 | valid on 'valid' subset | loss 3.711 | nll_loss 2.149 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.702 epoch 030 | valid on 'valid' subset | loss 3.711 | nll_loss 2.149 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.702 epoch 030 | valid on 'valid' subset | loss 3.711 | nll_loss 2.149 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.702 epoch 030 | valid on 'valid' subset | loss 3.711 | nll_loss 2.149 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.702 epoch 030 | valid on 'valid' subset | loss 3.711 | nll_loss 2.149 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.702 epoch 030 | valid on 'valid' subset | loss 3.711 | nll_loss 2.149 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.702 epoch 030 | valid on 'valid' subset | loss 3.711 | nll_loss 2.149 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.702 epoch 030 | valid on 'valid' subset | loss 3.711 | nll_loss 2.149 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.702 epoch 030 | valid on 'valid' subset | loss 3.711 | nll_loss 2.149 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.702 epoch 030 | valid on 'valid' subset | loss 3.711 | nll_loss 2.149 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.702 epoch 030 | valid on 'valid' subset | loss 3.711 | nll_loss 2.149 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.702 epoch 030 | valid on 'valid' subset | loss 3.711 | nll_loss 2.149 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.702 epoch 030 | valid on 'valid' subset | loss 3.711 | nll_loss 2.149 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.702 epoch 030 | valid on 'valid' subset | loss 3.711 | nll_loss 2.149 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.702 epoch 030 | valid on 'valid' subset | loss 3.711 | nll_loss 2.149 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.702 epoch 030 | valid on 'valid' subset | loss 3.711 | nll_loss 2.149 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.702 epoch 030 | valid on 'valid' subset | loss 3.711 | nll_loss 2.149 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.702 epoch 030 | valid on 'valid' subset | loss 3.711 | nll_loss 2.149 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.702 epoch 030 | valid on 'valid' subset | loss 3.711 | nll_loss 2.149 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.702 epoch 030 | valid on 'valid' subset | loss 3.711 | nll_loss 2.149 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.702 epoch 030 | valid on 'valid' subset | loss 3.711 | nll_loss 2.149 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.702 epoch 030 | valid on 'valid' subset | loss 3.711 | nll_loss 2.149 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.702 epoch 030 | valid on 'valid' subset | loss 3.711 | nll_loss 2.149 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.702 epoch 030 | valid on 'valid' subset | loss 3.711 | nll_loss 2.149 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 49000 | best_loss 3.702 epoch 030: 193 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=427004, ups=0.86, wpb=494984, bsz=16227.8, num_updates=49100, lr=0.000285423, gnorm=0.169, clip=0, loss_scale=2, train_wall=97, gb_free=21.1, wall=45586 epoch 030: 193 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=427004, ups=0.86, wpb=494984, bsz=16227.8, num_updates=49100, lr=0.000285423, gnorm=0.169, clip=0, loss_scale=2, train_wall=97, gb_free=21.1, wall=45586 epoch 030: 193 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=427004, ups=0.86, wpb=494984, bsz=16227.8, num_updates=49100, lr=0.000285423, gnorm=0.169, clip=0, loss_scale=2, train_wall=97, gb_free=21.1, wall=45586 epoch 030: 193 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=427004, ups=0.86, wpb=494984, bsz=16227.8, num_updates=49100, lr=0.000285423, gnorm=0.169, clip=0, loss_scale=2, train_wall=97, gb_free=21.1, wall=45586 epoch 030: 193 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=427004, ups=0.86, wpb=494984, bsz=16227.8, num_updates=49100, lr=0.000285423, gnorm=0.169, clip=0, loss_scale=2, train_wall=97, gb_free=21.1, wall=45586 epoch 030: 193 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=427004, ups=0.86, wpb=494984, bsz=16227.8, num_updates=49100, lr=0.000285423, gnorm=0.169, clip=0, loss_scale=2, train_wall=97, gb_free=21.1, wall=45586 epoch 030: 193 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=427004, ups=0.86, wpb=494984, bsz=16227.8, num_updates=49100, lr=0.000285423, gnorm=0.169, clip=0, loss_scale=2, train_wall=97, gb_free=21.1, wall=45586 epoch 030: 193 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=427004, ups=0.86, wpb=494984, bsz=16227.8, num_updates=49100, lr=0.000285423, gnorm=0.169, clip=0, loss_scale=2, train_wall=97, gb_free=21.1, wall=45586 epoch 030: 193 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=427004, ups=0.86, wpb=494984, bsz=16227.8, num_updates=49100, lr=0.000285423, gnorm=0.169, clip=0, loss_scale=2, train_wall=97, gb_free=21.1, wall=45586 epoch 030: 193 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=427004, ups=0.86, wpb=494984, bsz=16227.8, num_updates=49100, lr=0.000285423, gnorm=0.169, clip=0, loss_scale=2, train_wall=97, gb_free=21.1, wall=45586 epoch 030: 193 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=427004, ups=0.86, wpb=494984, bsz=16227.8, num_updates=49100, lr=0.000285423, gnorm=0.169, clip=0, loss_scale=2, train_wall=97, gb_free=21.1, wall=45586 epoch 030: 193 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=427004, ups=0.86, wpb=494984, bsz=16227.8, num_updates=49100, lr=0.000285423, gnorm=0.169, clip=0, loss_scale=2, train_wall=97, gb_free=21.1, wall=45586 epoch 030: 193 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=427004, ups=0.86, wpb=494984, bsz=16227.8, num_updates=49100, lr=0.000285423, gnorm=0.169, clip=0, loss_scale=2, train_wall=97, gb_free=21.1, wall=45586 epoch 030: 193 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=427004, ups=0.86, wpb=494984, bsz=16227.8, num_updates=49100, lr=0.000285423, gnorm=0.169, clip=0, loss_scale=2, train_wall=97, gb_free=21.1, wall=45586 epoch 030: 193 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=427004, ups=0.86, wpb=494984, bsz=16227.8, num_updates=49100, lr=0.000285423, gnorm=0.169, clip=0, loss_scale=2, train_wall=97, gb_free=21.1, wall=45586 epoch 030: 193 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=427004, ups=0.86, wpb=494984, bsz=16227.8, num_updates=49100, lr=0.000285423, gnorm=0.169, clip=0, loss_scale=2, train_wall=97, gb_free=21.1, wall=45586 epoch 030: 193 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=427004, ups=0.86, wpb=494984, bsz=16227.8, num_updates=49100, lr=0.000285423, gnorm=0.169, clip=0, loss_scale=2, train_wall=97, gb_free=21.1, wall=45586 epoch 030: 193 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=427004, ups=0.86, wpb=494984, bsz=16227.8, num_updates=49100, lr=0.000285423, gnorm=0.169, clip=0, loss_scale=2, train_wall=97, gb_free=21.1, wall=45586 epoch 030: 193 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=427004, ups=0.86, wpb=494984, bsz=16227.8, num_updates=49100, lr=0.000285423, gnorm=0.169, clip=0, loss_scale=2, train_wall=97, gb_free=21.1, wall=45586 epoch 030: 193 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=427004, ups=0.86, wpb=494984, bsz=16227.8, num_updates=49100, lr=0.000285423, gnorm=0.169, clip=0, loss_scale=2, train_wall=97, gb_free=21.1, wall=45586 epoch 030: 193 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=427004, ups=0.86, wpb=494984, bsz=16227.8, num_updates=49100, lr=0.000285423, gnorm=0.169, clip=0, loss_scale=2, train_wall=97, gb_free=21.1, wall=45586 epoch 030: 193 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=427004, ups=0.86, wpb=494984, bsz=16227.8, num_updates=49100, lr=0.000285423, gnorm=0.169, clip=0, loss_scale=2, train_wall=97, gb_free=21.1, wall=45586 epoch 030: 193 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=427004, ups=0.86, wpb=494984, bsz=16227.8, num_updates=49100, lr=0.000285423, gnorm=0.169, clip=0, loss_scale=2, train_wall=97, gb_free=21.1, wall=45586 epoch 030: 193 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=427004, ups=0.86, wpb=494984, bsz=16227.8, num_updates=49100, lr=0.000285423, gnorm=0.169, clip=0, loss_scale=2, train_wall=97, gb_free=21.1, wall=45586 epoch 030: 193 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=427004, ups=0.86, wpb=494984, bsz=16227.8, num_updates=49100, lr=0.000285423, gnorm=0.169, clip=0, loss_scale=2, train_wall=97, gb_free=21.1, wall=45586 epoch 030: 193 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=427004, ups=0.86, wpb=494984, bsz=16227.8, num_updates=49100, lr=0.000285423, gnorm=0.169, clip=0, loss_scale=2, train_wall=97, gb_free=21.1, wall=45586 epoch 030: 193 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=427004, ups=0.86, wpb=494984, bsz=16227.8, num_updates=49100, lr=0.000285423, gnorm=0.169, clip=0, loss_scale=2, train_wall=97, gb_free=21.1, wall=45586 epoch 030: 193 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=427004, ups=0.86, wpb=494984, bsz=16227.8, num_updates=49100, lr=0.000285423, gnorm=0.169, clip=0, loss_scale=2, train_wall=97, gb_free=21.1, wall=45586 epoch 030: 193 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=427004, ups=0.86, wpb=494984, bsz=16227.8, num_updates=49100, lr=0.000285423, gnorm=0.169, clip=0, loss_scale=2, train_wall=97, gb_free=21.1, wall=45586 epoch 030: 193 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=427004, ups=0.86, wpb=494984, bsz=16227.8, num_updates=49100, lr=0.000285423, gnorm=0.169, clip=0, loss_scale=2, train_wall=97, gb_free=21.1, wall=45586 epoch 030: 293 / 1689 loss=3.669, nll_loss=2.131, ppl=4.38, wps=552618, ups=1.11, wpb=496139, bsz=16367.2, num_updates=49200, lr=0.000285133, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.7, wall=45676 epoch 030: 293 / 1689 loss=3.669, nll_loss=2.131, ppl=4.38, wps=552618, ups=1.11, wpb=496139, bsz=16367.2, num_updates=49200, lr=0.000285133, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.7, wall=45676 epoch 030: 293 / 1689 loss=3.669, nll_loss=2.131, ppl=4.38, wps=552618, ups=1.11, wpb=496139, bsz=16367.2, num_updates=49200, lr=0.000285133, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.7, wall=45676 epoch 030: 293 / 1689 loss=3.669, nll_loss=2.131, ppl=4.38, wps=552618, ups=1.11, wpb=496139, bsz=16367.2, num_updates=49200, lr=0.000285133, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.7, wall=45676 epoch 030: 293 / 1689 loss=3.669, nll_loss=2.131, ppl=4.38, wps=552618, ups=1.11, wpb=496139, bsz=16367.2, num_updates=49200, lr=0.000285133, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.7, wall=45676 epoch 030: 293 / 1689 loss=3.669, nll_loss=2.131, ppl=4.38, wps=552618, ups=1.11, wpb=496139, bsz=16367.2, num_updates=49200, lr=0.000285133, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.7, wall=45676 epoch 030: 293 / 1689 loss=3.669, nll_loss=2.131, ppl=4.38, wps=552618, ups=1.11, wpb=496139, bsz=16367.2, num_updates=49200, lr=0.000285133, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.7, wall=45676 epoch 030: 293 / 1689 loss=3.669, nll_loss=2.131, ppl=4.38, wps=552618, ups=1.11, wpb=496139, bsz=16367.2, num_updates=49200, lr=0.000285133, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.7, wall=45676 epoch 030: 293 / 1689 loss=3.669, nll_loss=2.131, ppl=4.38, wps=552618, ups=1.11, wpb=496139, bsz=16367.2, num_updates=49200, lr=0.000285133, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.7, wall=45676 epoch 030: 293 / 1689 loss=3.669, nll_loss=2.131, ppl=4.38, wps=552618, ups=1.11, wpb=496139, bsz=16367.2, num_updates=49200, lr=0.000285133, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.7, wall=45676 epoch 030: 293 / 1689 loss=3.669, nll_loss=2.131, ppl=4.38, wps=552618, ups=1.11, wpb=496139, bsz=16367.2, num_updates=49200, lr=0.000285133, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.7, wall=45676 epoch 030: 293 / 1689 loss=3.669, nll_loss=2.131, ppl=4.38, wps=552618, ups=1.11, wpb=496139, bsz=16367.2, num_updates=49200, lr=0.000285133, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.7, wall=45676 epoch 030: 293 / 1689 loss=3.669, nll_loss=2.131, ppl=4.38, wps=552618, ups=1.11, wpb=496139, bsz=16367.2, num_updates=49200, lr=0.000285133, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.7, wall=45676 epoch 030: 293 / 1689 loss=3.669, nll_loss=2.131, ppl=4.38, wps=552618, ups=1.11, wpb=496139, bsz=16367.2, num_updates=49200, lr=0.000285133, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.7, wall=45676 epoch 030: 293 / 1689 loss=3.669, nll_loss=2.131, ppl=4.38, wps=552618, ups=1.11, wpb=496139, bsz=16367.2, num_updates=49200, lr=0.000285133, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.7, wall=45676 epoch 030: 293 / 1689 loss=3.669, nll_loss=2.131, ppl=4.38, wps=552618, ups=1.11, wpb=496139, bsz=16367.2, num_updates=49200, lr=0.000285133, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.7, wall=45676 epoch 030: 293 / 1689 loss=3.669, nll_loss=2.131, ppl=4.38, wps=552618, ups=1.11, wpb=496139, bsz=16367.2, num_updates=49200, lr=0.000285133, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.7, wall=45676 epoch 030: 293 / 1689 loss=3.669, nll_loss=2.131, ppl=4.38, wps=552618, ups=1.11, wpb=496139, bsz=16367.2, num_updates=49200, lr=0.000285133, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.7, wall=45676 epoch 030: 293 / 1689 loss=3.669, nll_loss=2.131, ppl=4.38, wps=552618, ups=1.11, wpb=496139, bsz=16367.2, num_updates=49200, lr=0.000285133, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.7, wall=45676 epoch 030: 293 / 1689 loss=3.669, nll_loss=2.131, ppl=4.38, wps=552618, ups=1.11, wpb=496139, bsz=16367.2, num_updates=49200, lr=0.000285133, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.7, wall=45676 epoch 030: 293 / 1689 loss=3.669, nll_loss=2.131, ppl=4.38, wps=552618, ups=1.11, wpb=496139, bsz=16367.2, num_updates=49200, lr=0.000285133, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.7, wall=45676 epoch 030: 293 / 1689 loss=3.669, nll_loss=2.131, ppl=4.38, wps=552618, ups=1.11, wpb=496139, bsz=16367.2, num_updates=49200, lr=0.000285133, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.7, wall=45676 epoch 030: 293 / 1689 loss=3.669, nll_loss=2.131, ppl=4.38, wps=552618, ups=1.11, wpb=496139, bsz=16367.2, num_updates=49200, lr=0.000285133, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.7, wall=45676 epoch 030: 293 / 1689 loss=3.669, nll_loss=2.131, ppl=4.38, wps=552618, ups=1.11, wpb=496139, bsz=16367.2, num_updates=49200, lr=0.000285133, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.7, wall=45676 epoch 030: 293 / 1689 loss=3.669, nll_loss=2.131, ppl=4.38, wps=552618, ups=1.11, wpb=496139, bsz=16367.2, num_updates=49200, lr=0.000285133, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.7, wall=45676 epoch 030: 293 / 1689 loss=3.669, nll_loss=2.131, ppl=4.38, wps=552618, ups=1.11, wpb=496139, bsz=16367.2, num_updates=49200, lr=0.000285133, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.7, wall=45676 epoch 030: 293 / 1689 loss=3.669, nll_loss=2.131, ppl=4.38, wps=552618, ups=1.11, wpb=496139, bsz=16367.2, num_updates=49200, lr=0.000285133, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.7, wall=45676 epoch 030: 293 / 1689 loss=3.669, nll_loss=2.131, ppl=4.38, wps=552618, ups=1.11, wpb=496139, bsz=16367.2, num_updates=49200, lr=0.000285133, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.7, wall=45676 epoch 030: 293 / 1689 loss=3.669, nll_loss=2.131, ppl=4.38, wps=552618, ups=1.11, wpb=496139, bsz=16367.2, num_updates=49200, lr=0.000285133, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.7, wall=45676 epoch 030: 293 / 1689 loss=3.669, nll_loss=2.131, ppl=4.38, wps=552618, ups=1.11, wpb=496139, bsz=16367.2, num_updates=49200, lr=0.000285133, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=22.7, wall=45676 epoch 030: 393 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=558202, ups=1.13, wpb=494948, bsz=16342.9, num_updates=49300, lr=0.000284844, gnorm=0.166, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=45764 epoch 030: 393 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=558202, ups=1.13, wpb=494948, bsz=16342.9, num_updates=49300, lr=0.000284844, gnorm=0.166, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=45764 epoch 030: 393 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=558202, ups=1.13, wpb=494948, bsz=16342.9, num_updates=49300, lr=0.000284844, gnorm=0.166, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=45764 epoch 030: 393 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=558202, ups=1.13, wpb=494948, bsz=16342.9, num_updates=49300, lr=0.000284844, gnorm=0.166, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=45764 epoch 030: 393 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=558202, ups=1.13, wpb=494948, bsz=16342.9, num_updates=49300, lr=0.000284844, gnorm=0.166, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=45764 epoch 030: 393 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=558202, ups=1.13, wpb=494948, bsz=16342.9, num_updates=49300, lr=0.000284844, gnorm=0.166, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=45764 epoch 030: 393 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=558202, ups=1.13, wpb=494948, bsz=16342.9, num_updates=49300, lr=0.000284844, gnorm=0.166, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=45764 epoch 030: 393 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=558202, ups=1.13, wpb=494948, bsz=16342.9, num_updates=49300, lr=0.000284844, gnorm=0.166, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=45764 epoch 030: 393 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=558202, ups=1.13, wpb=494948, bsz=16342.9, num_updates=49300, lr=0.000284844, gnorm=0.166, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=45764 epoch 030: 393 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=558202, ups=1.13, wpb=494948, bsz=16342.9, num_updates=49300, lr=0.000284844, gnorm=0.166, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=45764 epoch 030: 393 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=558202, ups=1.13, wpb=494948, bsz=16342.9, num_updates=49300, lr=0.000284844, gnorm=0.166, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=45764 epoch 030: 393 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=558202, ups=1.13, wpb=494948, bsz=16342.9, num_updates=49300, lr=0.000284844, gnorm=0.166, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=45764 epoch 030: 393 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=558202, ups=1.13, wpb=494948, bsz=16342.9, num_updates=49300, lr=0.000284844, gnorm=0.166, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=45764 epoch 030: 393 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=558202, ups=1.13, wpb=494948, bsz=16342.9, num_updates=49300, lr=0.000284844, gnorm=0.166, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=45764 epoch 030: 393 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=558202, ups=1.13, wpb=494948, bsz=16342.9, num_updates=49300, lr=0.000284844, gnorm=0.166, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=45764 epoch 030: 393 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=558202, ups=1.13, wpb=494948, bsz=16342.9, num_updates=49300, lr=0.000284844, gnorm=0.166, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=45764 epoch 030: 393 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=558202, ups=1.13, wpb=494948, bsz=16342.9, num_updates=49300, lr=0.000284844, gnorm=0.166, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=45764 epoch 030: 393 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=558202, ups=1.13, wpb=494948, bsz=16342.9, num_updates=49300, lr=0.000284844, gnorm=0.166, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=45764 epoch 030: 393 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=558202, ups=1.13, wpb=494948, bsz=16342.9, num_updates=49300, lr=0.000284844, gnorm=0.166, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=45764 epoch 030: 393 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=558202, ups=1.13, wpb=494948, bsz=16342.9, num_updates=49300, lr=0.000284844, gnorm=0.166, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=45764 epoch 030: 393 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=558202, ups=1.13, wpb=494948, bsz=16342.9, num_updates=49300, lr=0.000284844, gnorm=0.166, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=45764 epoch 030: 393 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=558202, ups=1.13, wpb=494948, bsz=16342.9, num_updates=49300, lr=0.000284844, gnorm=0.166, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=45764 epoch 030: 393 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=558202, ups=1.13, wpb=494948, bsz=16342.9, num_updates=49300, lr=0.000284844, gnorm=0.166, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=45764 epoch 030: 393 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=558202, ups=1.13, wpb=494948, bsz=16342.9, num_updates=49300, lr=0.000284844, gnorm=0.166, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=45764 epoch 030: 393 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=558202, ups=1.13, wpb=494948, bsz=16342.9, num_updates=49300, lr=0.000284844, gnorm=0.166, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=45764 epoch 030: 393 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=558202, ups=1.13, wpb=494948, bsz=16342.9, num_updates=49300, lr=0.000284844, gnorm=0.166, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=45764 epoch 030: 393 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=558202, ups=1.13, wpb=494948, bsz=16342.9, num_updates=49300, lr=0.000284844, gnorm=0.166, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=45764 epoch 030: 393 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=558202, ups=1.13, wpb=494948, bsz=16342.9, num_updates=49300, lr=0.000284844, gnorm=0.166, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=45764 epoch 030: 393 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=558202, ups=1.13, wpb=494948, bsz=16342.9, num_updates=49300, lr=0.000284844, gnorm=0.166, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=45764 epoch 030: 393 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=558202, ups=1.13, wpb=494948, bsz=16342.9, num_updates=49300, lr=0.000284844, gnorm=0.166, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=45764 epoch 030: 493 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=561615, ups=1.13, wpb=496319, bsz=16759, num_updates=49400, lr=0.000284555, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=45853 epoch 030: 493 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=561615, ups=1.13, wpb=496319, bsz=16759, num_updates=49400, lr=0.000284555, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=45853 epoch 030: 493 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=561615, ups=1.13, wpb=496319, bsz=16759, num_updates=49400, lr=0.000284555, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=45853 epoch 030: 493 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=561615, ups=1.13, wpb=496319, bsz=16759, num_updates=49400, lr=0.000284555, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=45853 epoch 030: 493 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=561615, ups=1.13, wpb=496319, bsz=16759, num_updates=49400, lr=0.000284555, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=45853 epoch 030: 493 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=561615, ups=1.13, wpb=496319, bsz=16759, num_updates=49400, lr=0.000284555, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=45853 epoch 030: 493 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=561615, ups=1.13, wpb=496319, bsz=16759, num_updates=49400, lr=0.000284555, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=45853 epoch 030: 493 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=561615, ups=1.13, wpb=496319, bsz=16759, num_updates=49400, lr=0.000284555, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=45853 epoch 030: 493 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=561615, ups=1.13, wpb=496319, bsz=16759, num_updates=49400, lr=0.000284555, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=45853 epoch 030: 493 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=561615, ups=1.13, wpb=496319, bsz=16759, num_updates=49400, lr=0.000284555, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=45853 epoch 030: 493 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=561615, ups=1.13, wpb=496319, bsz=16759, num_updates=49400, lr=0.000284555, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=45853 epoch 030: 493 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=561615, ups=1.13, wpb=496319, bsz=16759, num_updates=49400, lr=0.000284555, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=45853 epoch 030: 493 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=561615, ups=1.13, wpb=496319, bsz=16759, num_updates=49400, lr=0.000284555, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=45853 epoch 030: 493 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=561615, ups=1.13, wpb=496319, bsz=16759, num_updates=49400, lr=0.000284555, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=45853 epoch 030: 493 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=561615, ups=1.13, wpb=496319, bsz=16759, num_updates=49400, lr=0.000284555, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=45853 epoch 030: 493 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=561615, ups=1.13, wpb=496319, bsz=16759, num_updates=49400, lr=0.000284555, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=45853 epoch 030: 493 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=561615, ups=1.13, wpb=496319, bsz=16759, num_updates=49400, lr=0.000284555, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=45853 epoch 030: 493 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=561615, ups=1.13, wpb=496319, bsz=16759, num_updates=49400, lr=0.000284555, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=45853 epoch 030: 493 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=561615, ups=1.13, wpb=496319, bsz=16759, num_updates=49400, lr=0.000284555, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=45853 epoch 030: 493 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=561615, ups=1.13, wpb=496319, bsz=16759, num_updates=49400, lr=0.000284555, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=45853 epoch 030: 493 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=561615, ups=1.13, wpb=496319, bsz=16759, num_updates=49400, lr=0.000284555, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=45853 epoch 030: 493 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=561615, ups=1.13, wpb=496319, bsz=16759, num_updates=49400, lr=0.000284555, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=45853 epoch 030: 493 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=561615, ups=1.13, wpb=496319, bsz=16759, num_updates=49400, lr=0.000284555, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=45853 epoch 030: 493 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=561615, ups=1.13, wpb=496319, bsz=16759, num_updates=49400, lr=0.000284555, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=45853 epoch 030: 493 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=561615, ups=1.13, wpb=496319, bsz=16759, num_updates=49400, lr=0.000284555, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=45853 epoch 030: 493 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=561615, ups=1.13, wpb=496319, bsz=16759, num_updates=49400, lr=0.000284555, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=45853 epoch 030: 493 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=561615, ups=1.13, wpb=496319, bsz=16759, num_updates=49400, lr=0.000284555, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=45853 epoch 030: 493 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=561615, ups=1.13, wpb=496319, bsz=16759, num_updates=49400, lr=0.000284555, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=45853 epoch 030: 493 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=561615, ups=1.13, wpb=496319, bsz=16759, num_updates=49400, lr=0.000284555, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=45853 epoch 030: 493 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=561615, ups=1.13, wpb=496319, bsz=16759, num_updates=49400, lr=0.000284555, gnorm=0.173, clip=0, loss_scale=4, train_wall=88, gb_free=21.3, wall=45853 epoch 030: 593 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=551052, ups=1.11, wpb=495912, bsz=16545.9, num_updates=49500, lr=0.000284268, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=45943 epoch 030: 593 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=551052, ups=1.11, wpb=495912, bsz=16545.9, num_updates=49500, lr=0.000284268, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=45943 epoch 030: 593 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=551052, ups=1.11, wpb=495912, bsz=16545.9, num_updates=49500, lr=0.000284268, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=45943 epoch 030: 593 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=551052, ups=1.11, wpb=495912, bsz=16545.9, num_updates=49500, lr=0.000284268, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=45943 epoch 030: 593 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=551052, ups=1.11, wpb=495912, bsz=16545.9, num_updates=49500, lr=0.000284268, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=45943 epoch 030: 593 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=551052, ups=1.11, wpb=495912, bsz=16545.9, num_updates=49500, lr=0.000284268, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=45943 epoch 030: 593 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=551052, ups=1.11, wpb=495912, bsz=16545.9, num_updates=49500, lr=0.000284268, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=45943 epoch 030: 593 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=551052, ups=1.11, wpb=495912, bsz=16545.9, num_updates=49500, lr=0.000284268, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=45943 epoch 030: 593 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=551052, ups=1.11, wpb=495912, bsz=16545.9, num_updates=49500, lr=0.000284268, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=45943 epoch 030: 593 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=551052, ups=1.11, wpb=495912, bsz=16545.9, num_updates=49500, lr=0.000284268, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=45943 epoch 030: 593 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=551052, ups=1.11, wpb=495912, bsz=16545.9, num_updates=49500, lr=0.000284268, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=45943 epoch 030: 593 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=551052, ups=1.11, wpb=495912, bsz=16545.9, num_updates=49500, lr=0.000284268, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=45943 epoch 030: 593 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=551052, ups=1.11, wpb=495912, bsz=16545.9, num_updates=49500, lr=0.000284268, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=45943 epoch 030: 593 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=551052, ups=1.11, wpb=495912, bsz=16545.9, num_updates=49500, lr=0.000284268, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=45943 epoch 030: 593 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=551052, ups=1.11, wpb=495912, bsz=16545.9, num_updates=49500, lr=0.000284268, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=45943 epoch 030: 593 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=551052, ups=1.11, wpb=495912, bsz=16545.9, num_updates=49500, lr=0.000284268, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=45943 epoch 030: 593 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=551052, ups=1.11, wpb=495912, bsz=16545.9, num_updates=49500, lr=0.000284268, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=45943 epoch 030: 593 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=551052, ups=1.11, wpb=495912, bsz=16545.9, num_updates=49500, lr=0.000284268, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=45943 epoch 030: 593 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=551052, ups=1.11, wpb=495912, bsz=16545.9, num_updates=49500, lr=0.000284268, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=45943 epoch 030: 593 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=551052, ups=1.11, wpb=495912, bsz=16545.9, num_updates=49500, lr=0.000284268, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=45943 epoch 030: 593 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=551052, ups=1.11, wpb=495912, bsz=16545.9, num_updates=49500, lr=0.000284268, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=45943 epoch 030: 593 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=551052, ups=1.11, wpb=495912, bsz=16545.9, num_updates=49500, lr=0.000284268, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=45943 epoch 030: 593 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=551052, ups=1.11, wpb=495912, bsz=16545.9, num_updates=49500, lr=0.000284268, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=45943 epoch 030: 593 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=551052, ups=1.11, wpb=495912, bsz=16545.9, num_updates=49500, lr=0.000284268, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=45943 epoch 030: 593 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=551052, ups=1.11, wpb=495912, bsz=16545.9, num_updates=49500, lr=0.000284268, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=45943 epoch 030: 593 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=551052, ups=1.11, wpb=495912, bsz=16545.9, num_updates=49500, lr=0.000284268, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=45943 epoch 030: 593 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=551052, ups=1.11, wpb=495912, bsz=16545.9, num_updates=49500, lr=0.000284268, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=45943 epoch 030: 593 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=551052, ups=1.11, wpb=495912, bsz=16545.9, num_updates=49500, lr=0.000284268, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=45943 epoch 030: 593 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=551052, ups=1.11, wpb=495912, bsz=16545.9, num_updates=49500, lr=0.000284268, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=45943 epoch 030: 593 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=551052, ups=1.11, wpb=495912, bsz=16545.9, num_updates=49500, lr=0.000284268, gnorm=0.169, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=45943 epoch 030: 693 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=553510, ups=1.12, wpb=495786, bsz=16485.1, num_updates=49600, lr=0.000283981, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=46032 epoch 030: 693 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=553510, ups=1.12, wpb=495786, bsz=16485.1, num_updates=49600, lr=0.000283981, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=46032 epoch 030: 693 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=553510, ups=1.12, wpb=495786, bsz=16485.1, num_updates=49600, lr=0.000283981, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=46032 epoch 030: 693 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=553510, ups=1.12, wpb=495786, bsz=16485.1, num_updates=49600, lr=0.000283981, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=46032 epoch 030: 693 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=553510, ups=1.12, wpb=495786, bsz=16485.1, num_updates=49600, lr=0.000283981, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=46032 epoch 030: 693 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=553510, ups=1.12, wpb=495786, bsz=16485.1, num_updates=49600, lr=0.000283981, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=46032 epoch 030: 693 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=553510, ups=1.12, wpb=495786, bsz=16485.1, num_updates=49600, lr=0.000283981, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=46032 epoch 030: 693 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=553510, ups=1.12, wpb=495786, bsz=16485.1, num_updates=49600, lr=0.000283981, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=46032 epoch 030: 693 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=553510, ups=1.12, wpb=495786, bsz=16485.1, num_updates=49600, lr=0.000283981, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=46032 epoch 030: 693 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=553510, ups=1.12, wpb=495786, bsz=16485.1, num_updates=49600, lr=0.000283981, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=46032 epoch 030: 693 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=553510, ups=1.12, wpb=495786, bsz=16485.1, num_updates=49600, lr=0.000283981, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=46032 epoch 030: 693 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=553510, ups=1.12, wpb=495786, bsz=16485.1, num_updates=49600, lr=0.000283981, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=46032 epoch 030: 693 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=553510, ups=1.12, wpb=495786, bsz=16485.1, num_updates=49600, lr=0.000283981, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=46032 epoch 030: 693 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=553510, ups=1.12, wpb=495786, bsz=16485.1, num_updates=49600, lr=0.000283981, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=46032 epoch 030: 693 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=553510, ups=1.12, wpb=495786, bsz=16485.1, num_updates=49600, lr=0.000283981, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=46032 epoch 030: 693 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=553510, ups=1.12, wpb=495786, bsz=16485.1, num_updates=49600, lr=0.000283981, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=46032 epoch 030: 693 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=553510, ups=1.12, wpb=495786, bsz=16485.1, num_updates=49600, lr=0.000283981, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=46032 epoch 030: 693 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=553510, ups=1.12, wpb=495786, bsz=16485.1, num_updates=49600, lr=0.000283981, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=46032 epoch 030: 693 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=553510, ups=1.12, wpb=495786, bsz=16485.1, num_updates=49600, lr=0.000283981, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=46032 epoch 030: 693 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=553510, ups=1.12, wpb=495786, bsz=16485.1, num_updates=49600, lr=0.000283981, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=46032 epoch 030: 693 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=553510, ups=1.12, wpb=495786, bsz=16485.1, num_updates=49600, lr=0.000283981, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=46032 epoch 030: 693 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=553510, ups=1.12, wpb=495786, bsz=16485.1, num_updates=49600, lr=0.000283981, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=46032 epoch 030: 693 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=553510, ups=1.12, wpb=495786, bsz=16485.1, num_updates=49600, lr=0.000283981, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=46032 epoch 030: 693 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=553510, ups=1.12, wpb=495786, bsz=16485.1, num_updates=49600, lr=0.000283981, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=46032 epoch 030: 693 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=553510, ups=1.12, wpb=495786, bsz=16485.1, num_updates=49600, lr=0.000283981, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=46032 epoch 030: 693 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=553510, ups=1.12, wpb=495786, bsz=16485.1, num_updates=49600, lr=0.000283981, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=46032 epoch 030: 693 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=553510, ups=1.12, wpb=495786, bsz=16485.1, num_updates=49600, lr=0.000283981, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=46032 epoch 030: 693 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=553510, ups=1.12, wpb=495786, bsz=16485.1, num_updates=49600, lr=0.000283981, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=46032 epoch 030: 693 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=553510, ups=1.12, wpb=495786, bsz=16485.1, num_updates=49600, lr=0.000283981, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=46032 epoch 030: 693 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=553510, ups=1.12, wpb=495786, bsz=16485.1, num_updates=49600, lr=0.000283981, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.2, wall=46032 epoch 030: 793 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=552045, ups=1.11, wpb=496115, bsz=16246.7, num_updates=49700, lr=0.000283695, gnorm=0.165, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=46122 epoch 030: 793 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=552045, ups=1.11, wpb=496115, bsz=16246.7, num_updates=49700, lr=0.000283695, gnorm=0.165, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=46122 epoch 030: 793 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=552045, ups=1.11, wpb=496115, bsz=16246.7, num_updates=49700, lr=0.000283695, gnorm=0.165, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=46122 epoch 030: 793 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=552045, ups=1.11, wpb=496115, bsz=16246.7, num_updates=49700, lr=0.000283695, gnorm=0.165, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=46122 epoch 030: 793 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=552045, ups=1.11, wpb=496115, bsz=16246.7, num_updates=49700, lr=0.000283695, gnorm=0.165, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=46122 epoch 030: 793 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=552045, ups=1.11, wpb=496115, bsz=16246.7, num_updates=49700, lr=0.000283695, gnorm=0.165, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=46122 epoch 030: 793 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=552045, ups=1.11, wpb=496115, bsz=16246.7, num_updates=49700, lr=0.000283695, gnorm=0.165, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=46122 epoch 030: 793 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=552045, ups=1.11, wpb=496115, bsz=16246.7, num_updates=49700, lr=0.000283695, gnorm=0.165, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=46122 epoch 030: 793 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=552045, ups=1.11, wpb=496115, bsz=16246.7, num_updates=49700, lr=0.000283695, gnorm=0.165, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=46122 epoch 030: 793 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=552045, ups=1.11, wpb=496115, bsz=16246.7, num_updates=49700, lr=0.000283695, gnorm=0.165, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=46122 epoch 030: 793 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=552045, ups=1.11, wpb=496115, bsz=16246.7, num_updates=49700, lr=0.000283695, gnorm=0.165, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=46122 epoch 030: 793 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=552045, ups=1.11, wpb=496115, bsz=16246.7, num_updates=49700, lr=0.000283695, gnorm=0.165, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=46122 epoch 030: 793 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=552045, ups=1.11, wpb=496115, bsz=16246.7, num_updates=49700, lr=0.000283695, gnorm=0.165, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=46122 epoch 030: 793 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=552045, ups=1.11, wpb=496115, bsz=16246.7, num_updates=49700, lr=0.000283695, gnorm=0.165, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=46122 epoch 030: 793 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=552045, ups=1.11, wpb=496115, bsz=16246.7, num_updates=49700, lr=0.000283695, gnorm=0.165, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=46122 epoch 030: 793 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=552045, ups=1.11, wpb=496115, bsz=16246.7, num_updates=49700, lr=0.000283695, gnorm=0.165, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=46122 epoch 030: 793 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=552045, ups=1.11, wpb=496115, bsz=16246.7, num_updates=49700, lr=0.000283695, gnorm=0.165, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=46122 epoch 030: 793 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=552045, ups=1.11, wpb=496115, bsz=16246.7, num_updates=49700, lr=0.000283695, gnorm=0.165, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=46122 epoch 030: 793 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=552045, ups=1.11, wpb=496115, bsz=16246.7, num_updates=49700, lr=0.000283695, gnorm=0.165, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=46122 epoch 030: 793 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=552045, ups=1.11, wpb=496115, bsz=16246.7, num_updates=49700, lr=0.000283695, gnorm=0.165, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=46122 epoch 030: 793 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=552045, ups=1.11, wpb=496115, bsz=16246.7, num_updates=49700, lr=0.000283695, gnorm=0.165, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=46122 epoch 030: 793 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=552045, ups=1.11, wpb=496115, bsz=16246.7, num_updates=49700, lr=0.000283695, gnorm=0.165, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=46122 epoch 030: 793 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=552045, ups=1.11, wpb=496115, bsz=16246.7, num_updates=49700, lr=0.000283695, gnorm=0.165, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=46122 epoch 030: 793 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=552045, ups=1.11, wpb=496115, bsz=16246.7, num_updates=49700, lr=0.000283695, gnorm=0.165, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=46122 epoch 030: 793 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=552045, ups=1.11, wpb=496115, bsz=16246.7, num_updates=49700, lr=0.000283695, gnorm=0.165, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=46122 epoch 030: 793 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=552045, ups=1.11, wpb=496115, bsz=16246.7, num_updates=49700, lr=0.000283695, gnorm=0.165, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=46122 epoch 030: 793 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=552045, ups=1.11, wpb=496115, bsz=16246.7, num_updates=49700, lr=0.000283695, gnorm=0.165, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=46122 epoch 030: 793 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=552045, ups=1.11, wpb=496115, bsz=16246.7, num_updates=49700, lr=0.000283695, gnorm=0.165, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=46122 epoch 030: 793 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=552045, ups=1.11, wpb=496115, bsz=16246.7, num_updates=49700, lr=0.000283695, gnorm=0.165, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=46122 epoch 030: 793 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=552045, ups=1.11, wpb=496115, bsz=16246.7, num_updates=49700, lr=0.000283695, gnorm=0.165, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=46122 epoch 030: 894 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=545197, ups=1.1, wpb=494881, bsz=16516.2, num_updates=49800, lr=0.00028341, gnorm=0.162, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=46213 epoch 030: 894 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=545197, ups=1.1, wpb=494881, bsz=16516.2, num_updates=49800, lr=0.00028341, gnorm=0.162, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=46213 epoch 030: 894 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=545197, ups=1.1, wpb=494881, bsz=16516.2, num_updates=49800, lr=0.00028341, gnorm=0.162, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=46213 epoch 030: 894 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=545197, ups=1.1, wpb=494881, bsz=16516.2, num_updates=49800, lr=0.00028341, gnorm=0.162, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=46213 epoch 030: 894 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=545197, ups=1.1, wpb=494881, bsz=16516.2, num_updates=49800, lr=0.00028341, gnorm=0.162, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=46213 epoch 030: 894 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=545197, ups=1.1, wpb=494881, bsz=16516.2, num_updates=49800, lr=0.00028341, gnorm=0.162, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=46213 epoch 030: 894 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=545197, ups=1.1, wpb=494881, bsz=16516.2, num_updates=49800, lr=0.00028341, gnorm=0.162, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=46213 epoch 030: 894 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=545197, ups=1.1, wpb=494881, bsz=16516.2, num_updates=49800, lr=0.00028341, gnorm=0.162, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=46213 epoch 030: 894 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=545197, ups=1.1, wpb=494881, bsz=16516.2, num_updates=49800, lr=0.00028341, gnorm=0.162, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=46213 epoch 030: 894 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=545197, ups=1.1, wpb=494881, bsz=16516.2, num_updates=49800, lr=0.00028341, gnorm=0.162, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=46213 epoch 030: 894 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=545197, ups=1.1, wpb=494881, bsz=16516.2, num_updates=49800, lr=0.00028341, gnorm=0.162, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=46213 epoch 030: 894 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=545197, ups=1.1, wpb=494881, bsz=16516.2, num_updates=49800, lr=0.00028341, gnorm=0.162, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=46213 epoch 030: 894 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=545197, ups=1.1, wpb=494881, bsz=16516.2, num_updates=49800, lr=0.00028341, gnorm=0.162, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=46213 epoch 030: 894 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=545197, ups=1.1, wpb=494881, bsz=16516.2, num_updates=49800, lr=0.00028341, gnorm=0.162, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=46213 epoch 030: 894 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=545197, ups=1.1, wpb=494881, bsz=16516.2, num_updates=49800, lr=0.00028341, gnorm=0.162, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=46213 epoch 030: 894 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=545197, ups=1.1, wpb=494881, bsz=16516.2, num_updates=49800, lr=0.00028341, gnorm=0.162, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=46213 epoch 030: 894 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=545197, ups=1.1, wpb=494881, bsz=16516.2, num_updates=49800, lr=0.00028341, gnorm=0.162, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=46213 epoch 030: 894 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=545197, ups=1.1, wpb=494881, bsz=16516.2, num_updates=49800, lr=0.00028341, gnorm=0.162, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=46213 epoch 030: 894 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=545197, ups=1.1, wpb=494881, bsz=16516.2, num_updates=49800, lr=0.00028341, gnorm=0.162, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=46213 epoch 030: 894 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=545197, ups=1.1, wpb=494881, bsz=16516.2, num_updates=49800, lr=0.00028341, gnorm=0.162, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=46213 epoch 030: 894 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=545197, ups=1.1, wpb=494881, bsz=16516.2, num_updates=49800, lr=0.00028341, gnorm=0.162, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=46213 epoch 030: 894 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=545197, ups=1.1, wpb=494881, bsz=16516.2, num_updates=49800, lr=0.00028341, gnorm=0.162, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=46213 epoch 030: 894 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=545197, ups=1.1, wpb=494881, bsz=16516.2, num_updates=49800, lr=0.00028341, gnorm=0.162, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=46213 epoch 030: 894 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=545197, ups=1.1, wpb=494881, bsz=16516.2, num_updates=49800, lr=0.00028341, gnorm=0.162, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=46213 epoch 030: 894 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=545197, ups=1.1, wpb=494881, bsz=16516.2, num_updates=49800, lr=0.00028341, gnorm=0.162, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=46213 epoch 030: 894 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=545197, ups=1.1, wpb=494881, bsz=16516.2, num_updates=49800, lr=0.00028341, gnorm=0.162, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=46213 epoch 030: 894 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=545197, ups=1.1, wpb=494881, bsz=16516.2, num_updates=49800, lr=0.00028341, gnorm=0.162, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=46213 epoch 030: 894 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=545197, ups=1.1, wpb=494881, bsz=16516.2, num_updates=49800, lr=0.00028341, gnorm=0.162, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=46213 epoch 030: 894 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=545197, ups=1.1, wpb=494881, bsz=16516.2, num_updates=49800, lr=0.00028341, gnorm=0.162, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=46213 epoch 030: 894 / 1689 loss=3.674, nll_loss=2.138, ppl=4.4, wps=545197, ups=1.1, wpb=494881, bsz=16516.2, num_updates=49800, lr=0.00028341, gnorm=0.162, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=46213 epoch 030: 994 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=550974, ups=1.11, wpb=494784, bsz=16659.7, num_updates=49900, lr=0.000283126, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=46303 epoch 030: 994 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=550974, ups=1.11, wpb=494784, bsz=16659.7, num_updates=49900, lr=0.000283126, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=46303 epoch 030: 994 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=550974, ups=1.11, wpb=494784, bsz=16659.7, num_updates=49900, lr=0.000283126, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=46303 epoch 030: 994 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=550974, ups=1.11, wpb=494784, bsz=16659.7, num_updates=49900, lr=0.000283126, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=46303 epoch 030: 994 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=550974, ups=1.11, wpb=494784, bsz=16659.7, num_updates=49900, lr=0.000283126, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=46303 epoch 030: 994 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=550974, ups=1.11, wpb=494784, bsz=16659.7, num_updates=49900, lr=0.000283126, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=46303 epoch 030: 994 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=550974, ups=1.11, wpb=494784, bsz=16659.7, num_updates=49900, lr=0.000283126, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=46303 epoch 030: 994 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=550974, ups=1.11, wpb=494784, bsz=16659.7, num_updates=49900, lr=0.000283126, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=46303 epoch 030: 994 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=550974, ups=1.11, wpb=494784, bsz=16659.7, num_updates=49900, lr=0.000283126, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=46303 epoch 030: 994 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=550974, ups=1.11, wpb=494784, bsz=16659.7, num_updates=49900, lr=0.000283126, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=46303 epoch 030: 994 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=550974, ups=1.11, wpb=494784, bsz=16659.7, num_updates=49900, lr=0.000283126, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=46303 epoch 030: 994 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=550974, ups=1.11, wpb=494784, bsz=16659.7, num_updates=49900, lr=0.000283126, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=46303 epoch 030: 994 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=550974, ups=1.11, wpb=494784, bsz=16659.7, num_updates=49900, lr=0.000283126, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=46303 epoch 030: 994 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=550974, ups=1.11, wpb=494784, bsz=16659.7, num_updates=49900, lr=0.000283126, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=46303 epoch 030: 994 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=550974, ups=1.11, wpb=494784, bsz=16659.7, num_updates=49900, lr=0.000283126, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=46303 epoch 030: 994 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=550974, ups=1.11, wpb=494784, bsz=16659.7, num_updates=49900, lr=0.000283126, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=46303 epoch 030: 994 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=550974, ups=1.11, wpb=494784, bsz=16659.7, num_updates=49900, lr=0.000283126, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=46303 epoch 030: 994 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=550974, ups=1.11, wpb=494784, bsz=16659.7, num_updates=49900, lr=0.000283126, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=46303 epoch 030: 994 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=550974, ups=1.11, wpb=494784, bsz=16659.7, num_updates=49900, lr=0.000283126, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=46303 epoch 030: 994 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=550974, ups=1.11, wpb=494784, bsz=16659.7, num_updates=49900, lr=0.000283126, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=46303 epoch 030: 994 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=550974, ups=1.11, wpb=494784, bsz=16659.7, num_updates=49900, lr=0.000283126, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=46303 epoch 030: 994 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=550974, ups=1.11, wpb=494784, bsz=16659.7, num_updates=49900, lr=0.000283126, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=46303 epoch 030: 994 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=550974, ups=1.11, wpb=494784, bsz=16659.7, num_updates=49900, lr=0.000283126, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=46303 epoch 030: 994 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=550974, ups=1.11, wpb=494784, bsz=16659.7, num_updates=49900, lr=0.000283126, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=46303 epoch 030: 994 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=550974, ups=1.11, wpb=494784, bsz=16659.7, num_updates=49900, lr=0.000283126, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=46303 epoch 030: 994 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=550974, ups=1.11, wpb=494784, bsz=16659.7, num_updates=49900, lr=0.000283126, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=46303 epoch 030: 994 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=550974, ups=1.11, wpb=494784, bsz=16659.7, num_updates=49900, lr=0.000283126, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=46303 epoch 030: 994 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=550974, ups=1.11, wpb=494784, bsz=16659.7, num_updates=49900, lr=0.000283126, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=46303 epoch 030: 994 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=550974, ups=1.11, wpb=494784, bsz=16659.7, num_updates=49900, lr=0.000283126, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=46303 epoch 030: 994 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=550974, ups=1.11, wpb=494784, bsz=16659.7, num_updates=49900, lr=0.000283126, gnorm=0.165, clip=0, loss_scale=4, train_wall=88, gb_free=22.1, wall=46303 epoch 030: 1094 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=550209, ups=1.11, wpb=494885, bsz=16518.9, num_updates=50000, lr=0.000282843, gnorm=0.163, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=46393 epoch 030: 1094 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=550209, ups=1.11, wpb=494885, bsz=16518.9, num_updates=50000, lr=0.000282843, gnorm=0.163, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=46393 epoch 030: 1094 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=550209, ups=1.11, wpb=494885, bsz=16518.9, num_updates=50000, lr=0.000282843, gnorm=0.163, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=46393 epoch 030: 1094 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=550209, ups=1.11, wpb=494885, bsz=16518.9, num_updates=50000, lr=0.000282843, gnorm=0.163, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=46393 epoch 030: 1094 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=550209, ups=1.11, wpb=494885, bsz=16518.9, num_updates=50000, lr=0.000282843, gnorm=0.163, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=46393 epoch 030: 1094 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=550209, ups=1.11, wpb=494885, bsz=16518.9, num_updates=50000, lr=0.000282843, gnorm=0.163, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=46393 epoch 030: 1094 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=550209, ups=1.11, wpb=494885, bsz=16518.9, num_updates=50000, lr=0.000282843, gnorm=0.163, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=46393 epoch 030: 1094 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=550209, ups=1.11, wpb=494885, bsz=16518.9, num_updates=50000, lr=0.000282843, gnorm=0.163, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=46393 epoch 030: 1094 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=550209, ups=1.11, wpb=494885, bsz=16518.9, num_updates=50000, lr=0.000282843, gnorm=0.163, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=46393 epoch 030: 1094 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=550209, ups=1.11, wpb=494885, bsz=16518.9, num_updates=50000, lr=0.000282843, gnorm=0.163, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=46393 epoch 030: 1094 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=550209, ups=1.11, wpb=494885, bsz=16518.9, num_updates=50000, lr=0.000282843, gnorm=0.163, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=46393 epoch 030: 1094 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=550209, ups=1.11, wpb=494885, bsz=16518.9, num_updates=50000, lr=0.000282843, gnorm=0.163, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=46393 epoch 030: 1094 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=550209, ups=1.11, wpb=494885, bsz=16518.9, num_updates=50000, lr=0.000282843, gnorm=0.163, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=46393 epoch 030: 1094 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=550209, ups=1.11, wpb=494885, bsz=16518.9, num_updates=50000, lr=0.000282843, gnorm=0.163, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=46393 epoch 030: 1094 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=550209, ups=1.11, wpb=494885, bsz=16518.9, num_updates=50000, lr=0.000282843, gnorm=0.163, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=46393 epoch 030: 1094 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=550209, ups=1.11, wpb=494885, bsz=16518.9, num_updates=50000, lr=0.000282843, gnorm=0.163, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=46393 epoch 030: 1094 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=550209, ups=1.11, wpb=494885, bsz=16518.9, num_updates=50000, lr=0.000282843, gnorm=0.163, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=46393 epoch 030: 1094 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=550209, ups=1.11, wpb=494885, bsz=16518.9, num_updates=50000, lr=0.000282843, gnorm=0.163, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=46393 epoch 030: 1094 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=550209, ups=1.11, wpb=494885, bsz=16518.9, num_updates=50000, lr=0.000282843, gnorm=0.163, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=46393 epoch 030: 1094 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=550209, ups=1.11, wpb=494885, bsz=16518.9, num_updates=50000, lr=0.000282843, gnorm=0.163, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=46393 epoch 030: 1094 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=550209, ups=1.11, wpb=494885, bsz=16518.9, num_updates=50000, lr=0.000282843, gnorm=0.163, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=46393 epoch 030: 1094 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=550209, ups=1.11, wpb=494885, bsz=16518.9, num_updates=50000, lr=0.000282843, gnorm=0.163, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=46393 epoch 030: 1094 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=550209, ups=1.11, wpb=494885, bsz=16518.9, num_updates=50000, lr=0.000282843, gnorm=0.163, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=46393 epoch 030: 1094 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=550209, ups=1.11, wpb=494885, bsz=16518.9, num_updates=50000, lr=0.000282843, gnorm=0.163, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=46393 epoch 030: 1094 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=550209, ups=1.11, wpb=494885, bsz=16518.9, num_updates=50000, lr=0.000282843, gnorm=0.163, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=46393 epoch 030: 1094 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=550209, ups=1.11, wpb=494885, bsz=16518.9, num_updates=50000, lr=0.000282843, gnorm=0.163, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=46393 epoch 030: 1094 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=550209, ups=1.11, wpb=494885, bsz=16518.9, num_updates=50000, lr=0.000282843, gnorm=0.163, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=46393 epoch 030: 1094 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=550209, ups=1.11, wpb=494885, bsz=16518.9, num_updates=50000, lr=0.000282843, gnorm=0.163, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=46393 epoch 030: 1094 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=550209, ups=1.11, wpb=494885, bsz=16518.9, num_updates=50000, lr=0.000282843, gnorm=0.163, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=46393 epoch 030: 1094 / 1689 loss=3.679, nll_loss=2.143, ppl=4.42, wps=550209, ups=1.11, wpb=494885, bsz=16518.9, num_updates=50000, lr=0.000282843, gnorm=0.163, clip=0, loss_scale=4, train_wall=89, gb_free=21.7, wall=46393 begin validation on "valid" subset epoch 030 | valid on 'valid' subset | loss 3.706 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.702 epoch 030 | valid on 'valid' subset | loss 3.706 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.702 epoch 030 | valid on 'valid' subset | loss 3.706 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.702 epoch 030 | valid on 'valid' subset | loss 3.706 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.702 epoch 030 | valid on 'valid' subset | loss 3.706 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.702 epoch 030 | valid on 'valid' subset | loss 3.706 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.702 epoch 030 | valid on 'valid' subset | loss 3.706 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.702 epoch 030 | valid on 'valid' subset | loss 3.706 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.702 epoch 030 | valid on 'valid' subset | loss 3.706 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.702 epoch 030 | valid on 'valid' subset | loss 3.706 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.702 epoch 030 | valid on 'valid' subset | loss 3.706 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.702 epoch 030 | valid on 'valid' subset | loss 3.706 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.702 epoch 030 | valid on 'valid' subset | loss 3.706 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.702 epoch 030 | valid on 'valid' subset | loss 3.706 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.702 epoch 030 | valid on 'valid' subset | loss 3.706 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.702 epoch 030 | valid on 'valid' subset | loss 3.706 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.702 epoch 030 | valid on 'valid' subset | loss 3.706 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.702 epoch 030 | valid on 'valid' subset | loss 3.706 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.702 epoch 030 | valid on 'valid' subset | loss 3.706 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.702 epoch 030 | valid on 'valid' subset | loss 3.706 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.702 epoch 030 | valid on 'valid' subset | loss 3.706 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.702 epoch 030 | valid on 'valid' subset | loss 3.706 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.702 epoch 030 | valid on 'valid' subset | loss 3.706 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.702 epoch 030 | valid on 'valid' subset | loss 3.706 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.702 epoch 030 | valid on 'valid' subset | loss 3.706 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.702 epoch 030 | valid on 'valid' subset | loss 3.706 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.702 epoch 030 | valid on 'valid' subset | loss 3.706 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.702 epoch 030 | valid on 'valid' subset | loss 3.706 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.702 epoch 030 | valid on 'valid' subset | loss 3.706 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.702 epoch 030 | valid on 'valid' subset | loss 3.706 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 50000 | best_loss 3.702 epoch 030: 1194 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=481706, ups=0.97, wpb=494344, bsz=16845.9, num_updates=50100, lr=0.00028256, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=46495 epoch 030: 1194 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=481706, ups=0.97, wpb=494344, bsz=16845.9, num_updates=50100, lr=0.00028256, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=46495 epoch 030: 1194 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=481706, ups=0.97, wpb=494344, bsz=16845.9, num_updates=50100, lr=0.00028256, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=46495 epoch 030: 1194 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=481706, ups=0.97, wpb=494344, bsz=16845.9, num_updates=50100, lr=0.00028256, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=46495 epoch 030: 1194 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=481706, ups=0.97, wpb=494344, bsz=16845.9, num_updates=50100, lr=0.00028256, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=46495 epoch 030: 1194 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=481706, ups=0.97, wpb=494344, bsz=16845.9, num_updates=50100, lr=0.00028256, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=46495 epoch 030: 1194 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=481706, ups=0.97, wpb=494344, bsz=16845.9, num_updates=50100, lr=0.00028256, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=46495 epoch 030: 1194 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=481706, ups=0.97, wpb=494344, bsz=16845.9, num_updates=50100, lr=0.00028256, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=46495 epoch 030: 1194 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=481706, ups=0.97, wpb=494344, bsz=16845.9, num_updates=50100, lr=0.00028256, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=46495 epoch 030: 1194 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=481706, ups=0.97, wpb=494344, bsz=16845.9, num_updates=50100, lr=0.00028256, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=46495 epoch 030: 1194 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=481706, ups=0.97, wpb=494344, bsz=16845.9, num_updates=50100, lr=0.00028256, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=46495 epoch 030: 1194 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=481706, ups=0.97, wpb=494344, bsz=16845.9, num_updates=50100, lr=0.00028256, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=46495 epoch 030: 1194 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=481706, ups=0.97, wpb=494344, bsz=16845.9, num_updates=50100, lr=0.00028256, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=46495 epoch 030: 1194 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=481706, ups=0.97, wpb=494344, bsz=16845.9, num_updates=50100, lr=0.00028256, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=46495 epoch 030: 1194 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=481706, ups=0.97, wpb=494344, bsz=16845.9, num_updates=50100, lr=0.00028256, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=46495 epoch 030: 1194 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=481706, ups=0.97, wpb=494344, bsz=16845.9, num_updates=50100, lr=0.00028256, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=46495 epoch 030: 1194 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=481706, ups=0.97, wpb=494344, bsz=16845.9, num_updates=50100, lr=0.00028256, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=46495 epoch 030: 1194 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=481706, ups=0.97, wpb=494344, bsz=16845.9, num_updates=50100, lr=0.00028256, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=46495 epoch 030: 1194 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=481706, ups=0.97, wpb=494344, bsz=16845.9, num_updates=50100, lr=0.00028256, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=46495 epoch 030: 1194 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=481706, ups=0.97, wpb=494344, bsz=16845.9, num_updates=50100, lr=0.00028256, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=46495 epoch 030: 1194 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=481706, ups=0.97, wpb=494344, bsz=16845.9, num_updates=50100, lr=0.00028256, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=46495 epoch 030: 1194 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=481706, ups=0.97, wpb=494344, bsz=16845.9, num_updates=50100, lr=0.00028256, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=46495 epoch 030: 1194 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=481706, ups=0.97, wpb=494344, bsz=16845.9, num_updates=50100, lr=0.00028256, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=46495 epoch 030: 1194 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=481706, ups=0.97, wpb=494344, bsz=16845.9, num_updates=50100, lr=0.00028256, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=46495 epoch 030: 1194 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=481706, ups=0.97, wpb=494344, bsz=16845.9, num_updates=50100, lr=0.00028256, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=46495 epoch 030: 1194 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=481706, ups=0.97, wpb=494344, bsz=16845.9, num_updates=50100, lr=0.00028256, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=46495 epoch 030: 1194 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=481706, ups=0.97, wpb=494344, bsz=16845.9, num_updates=50100, lr=0.00028256, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=46495 epoch 030: 1194 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=481706, ups=0.97, wpb=494344, bsz=16845.9, num_updates=50100, lr=0.00028256, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=46495 epoch 030: 1194 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=481706, ups=0.97, wpb=494344, bsz=16845.9, num_updates=50100, lr=0.00028256, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=46495 epoch 030: 1194 / 1689 loss=3.683, nll_loss=2.147, ppl=4.43, wps=481706, ups=0.97, wpb=494344, bsz=16845.9, num_updates=50100, lr=0.00028256, gnorm=0.176, clip=0, loss_scale=4, train_wall=89, gb_free=21, wall=46495 epoch 030: 1294 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=548596, ups=1.11, wpb=494238, bsz=16418.3, num_updates=50200, lr=0.000282279, gnorm=0.176, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=46585 epoch 030: 1294 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=548596, ups=1.11, wpb=494238, bsz=16418.3, num_updates=50200, lr=0.000282279, gnorm=0.176, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=46585 epoch 030: 1294 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=548596, ups=1.11, wpb=494238, bsz=16418.3, num_updates=50200, lr=0.000282279, gnorm=0.176, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=46585 epoch 030: 1294 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=548596, ups=1.11, wpb=494238, bsz=16418.3, num_updates=50200, lr=0.000282279, gnorm=0.176, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=46585 epoch 030: 1294 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=548596, ups=1.11, wpb=494238, bsz=16418.3, num_updates=50200, lr=0.000282279, gnorm=0.176, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=46585 epoch 030: 1294 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=548596, ups=1.11, wpb=494238, bsz=16418.3, num_updates=50200, lr=0.000282279, gnorm=0.176, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=46585 epoch 030: 1294 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=548596, ups=1.11, wpb=494238, bsz=16418.3, num_updates=50200, lr=0.000282279, gnorm=0.176, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=46585 epoch 030: 1294 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=548596, ups=1.11, wpb=494238, bsz=16418.3, num_updates=50200, lr=0.000282279, gnorm=0.176, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=46585 epoch 030: 1294 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=548596, ups=1.11, wpb=494238, bsz=16418.3, num_updates=50200, lr=0.000282279, gnorm=0.176, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=46585 epoch 030: 1294 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=548596, ups=1.11, wpb=494238, bsz=16418.3, num_updates=50200, lr=0.000282279, gnorm=0.176, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=46585 epoch 030: 1294 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=548596, ups=1.11, wpb=494238, bsz=16418.3, num_updates=50200, lr=0.000282279, gnorm=0.176, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=46585 epoch 030: 1294 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=548596, ups=1.11, wpb=494238, bsz=16418.3, num_updates=50200, lr=0.000282279, gnorm=0.176, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=46585 epoch 030: 1294 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=548596, ups=1.11, wpb=494238, bsz=16418.3, num_updates=50200, lr=0.000282279, gnorm=0.176, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=46585 epoch 030: 1294 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=548596, ups=1.11, wpb=494238, bsz=16418.3, num_updates=50200, lr=0.000282279, gnorm=0.176, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=46585 epoch 030: 1294 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=548596, ups=1.11, wpb=494238, bsz=16418.3, num_updates=50200, lr=0.000282279, gnorm=0.176, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=46585 epoch 030: 1294 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=548596, ups=1.11, wpb=494238, bsz=16418.3, num_updates=50200, lr=0.000282279, gnorm=0.176, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=46585 epoch 030: 1294 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=548596, ups=1.11, wpb=494238, bsz=16418.3, num_updates=50200, lr=0.000282279, gnorm=0.176, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=46585 epoch 030: 1294 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=548596, ups=1.11, wpb=494238, bsz=16418.3, num_updates=50200, lr=0.000282279, gnorm=0.176, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=46585 epoch 030: 1294 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=548596, ups=1.11, wpb=494238, bsz=16418.3, num_updates=50200, lr=0.000282279, gnorm=0.176, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=46585 epoch 030: 1294 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=548596, ups=1.11, wpb=494238, bsz=16418.3, num_updates=50200, lr=0.000282279, gnorm=0.176, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=46585 epoch 030: 1294 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=548596, ups=1.11, wpb=494238, bsz=16418.3, num_updates=50200, lr=0.000282279, gnorm=0.176, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=46585 epoch 030: 1294 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=548596, ups=1.11, wpb=494238, bsz=16418.3, num_updates=50200, lr=0.000282279, gnorm=0.176, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=46585 epoch 030: 1294 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=548596, ups=1.11, wpb=494238, bsz=16418.3, num_updates=50200, lr=0.000282279, gnorm=0.176, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=46585 epoch 030: 1294 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=548596, ups=1.11, wpb=494238, bsz=16418.3, num_updates=50200, lr=0.000282279, gnorm=0.176, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=46585 epoch 030: 1294 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=548596, ups=1.11, wpb=494238, bsz=16418.3, num_updates=50200, lr=0.000282279, gnorm=0.176, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=46585 epoch 030: 1294 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=548596, ups=1.11, wpb=494238, bsz=16418.3, num_updates=50200, lr=0.000282279, gnorm=0.176, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=46585 epoch 030: 1294 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=548596, ups=1.11, wpb=494238, bsz=16418.3, num_updates=50200, lr=0.000282279, gnorm=0.176, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=46585 epoch 030: 1294 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=548596, ups=1.11, wpb=494238, bsz=16418.3, num_updates=50200, lr=0.000282279, gnorm=0.176, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=46585 epoch 030: 1294 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=548596, ups=1.11, wpb=494238, bsz=16418.3, num_updates=50200, lr=0.000282279, gnorm=0.176, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=46585 epoch 030: 1294 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=548596, ups=1.11, wpb=494238, bsz=16418.3, num_updates=50200, lr=0.000282279, gnorm=0.176, clip=0, loss_scale=4, train_wall=88, gb_free=21.1, wall=46585 epoch 030: 1395 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=540573, ups=1.09, wpb=495292, bsz=16521.8, num_updates=50300, lr=0.000281998, gnorm=0.176, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=46677 epoch 030: 1395 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=540573, ups=1.09, wpb=495292, bsz=16521.8, num_updates=50300, lr=0.000281998, gnorm=0.176, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=46677 epoch 030: 1395 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=540573, ups=1.09, wpb=495292, bsz=16521.8, num_updates=50300, lr=0.000281998, gnorm=0.176, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=46677 epoch 030: 1395 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=540573, ups=1.09, wpb=495292, bsz=16521.8, num_updates=50300, lr=0.000281998, gnorm=0.176, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=46677 epoch 030: 1395 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=540573, ups=1.09, wpb=495292, bsz=16521.8, num_updates=50300, lr=0.000281998, gnorm=0.176, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=46677 epoch 030: 1395 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=540573, ups=1.09, wpb=495292, bsz=16521.8, num_updates=50300, lr=0.000281998, gnorm=0.176, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=46677 epoch 030: 1395 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=540573, ups=1.09, wpb=495292, bsz=16521.8, num_updates=50300, lr=0.000281998, gnorm=0.176, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=46677 epoch 030: 1395 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=540573, ups=1.09, wpb=495292, bsz=16521.8, num_updates=50300, lr=0.000281998, gnorm=0.176, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=46677 epoch 030: 1395 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=540573, ups=1.09, wpb=495292, bsz=16521.8, num_updates=50300, lr=0.000281998, gnorm=0.176, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=46677 epoch 030: 1395 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=540573, ups=1.09, wpb=495292, bsz=16521.8, num_updates=50300, lr=0.000281998, gnorm=0.176, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=46677 epoch 030: 1395 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=540573, ups=1.09, wpb=495292, bsz=16521.8, num_updates=50300, lr=0.000281998, gnorm=0.176, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=46677 epoch 030: 1395 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=540573, ups=1.09, wpb=495292, bsz=16521.8, num_updates=50300, lr=0.000281998, gnorm=0.176, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=46677 epoch 030: 1395 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=540573, ups=1.09, wpb=495292, bsz=16521.8, num_updates=50300, lr=0.000281998, gnorm=0.176, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=46677 epoch 030: 1395 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=540573, ups=1.09, wpb=495292, bsz=16521.8, num_updates=50300, lr=0.000281998, gnorm=0.176, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=46677 epoch 030: 1395 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=540573, ups=1.09, wpb=495292, bsz=16521.8, num_updates=50300, lr=0.000281998, gnorm=0.176, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=46677 epoch 030: 1395 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=540573, ups=1.09, wpb=495292, bsz=16521.8, num_updates=50300, lr=0.000281998, gnorm=0.176, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=46677 epoch 030: 1395 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=540573, ups=1.09, wpb=495292, bsz=16521.8, num_updates=50300, lr=0.000281998, gnorm=0.176, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=46677 epoch 030: 1395 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=540573, ups=1.09, wpb=495292, bsz=16521.8, num_updates=50300, lr=0.000281998, gnorm=0.176, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=46677 epoch 030: 1395 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=540573, ups=1.09, wpb=495292, bsz=16521.8, num_updates=50300, lr=0.000281998, gnorm=0.176, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=46677 epoch 030: 1395 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=540573, ups=1.09, wpb=495292, bsz=16521.8, num_updates=50300, lr=0.000281998, gnorm=0.176, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=46677 epoch 030: 1395 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=540573, ups=1.09, wpb=495292, bsz=16521.8, num_updates=50300, lr=0.000281998, gnorm=0.176, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=46677 epoch 030: 1395 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=540573, ups=1.09, wpb=495292, bsz=16521.8, num_updates=50300, lr=0.000281998, gnorm=0.176, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=46677 epoch 030: 1395 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=540573, ups=1.09, wpb=495292, bsz=16521.8, num_updates=50300, lr=0.000281998, gnorm=0.176, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=46677 epoch 030: 1395 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=540573, ups=1.09, wpb=495292, bsz=16521.8, num_updates=50300, lr=0.000281998, gnorm=0.176, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=46677 epoch 030: 1395 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=540573, ups=1.09, wpb=495292, bsz=16521.8, num_updates=50300, lr=0.000281998, gnorm=0.176, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=46677 epoch 030: 1395 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=540573, ups=1.09, wpb=495292, bsz=16521.8, num_updates=50300, lr=0.000281998, gnorm=0.176, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=46677 epoch 030: 1395 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=540573, ups=1.09, wpb=495292, bsz=16521.8, num_updates=50300, lr=0.000281998, gnorm=0.176, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=46677 epoch 030: 1395 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=540573, ups=1.09, wpb=495292, bsz=16521.8, num_updates=50300, lr=0.000281998, gnorm=0.176, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=46677 epoch 030: 1395 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=540573, ups=1.09, wpb=495292, bsz=16521.8, num_updates=50300, lr=0.000281998, gnorm=0.176, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=46677 epoch 030: 1395 / 1689 loss=3.678, nll_loss=2.142, ppl=4.41, wps=540573, ups=1.09, wpb=495292, bsz=16521.8, num_updates=50300, lr=0.000281998, gnorm=0.176, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=46677 epoch 030: 1495 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=551460, ups=1.11, wpb=495878, bsz=16742.1, num_updates=50400, lr=0.000281718, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=46767 epoch 030: 1495 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=551460, ups=1.11, wpb=495878, bsz=16742.1, num_updates=50400, lr=0.000281718, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=46767 epoch 030: 1495 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=551460, ups=1.11, wpb=495878, bsz=16742.1, num_updates=50400, lr=0.000281718, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=46767 epoch 030: 1495 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=551460, ups=1.11, wpb=495878, bsz=16742.1, num_updates=50400, lr=0.000281718, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=46767 epoch 030: 1495 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=551460, ups=1.11, wpb=495878, bsz=16742.1, num_updates=50400, lr=0.000281718, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=46767 epoch 030: 1495 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=551460, ups=1.11, wpb=495878, bsz=16742.1, num_updates=50400, lr=0.000281718, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=46767 epoch 030: 1495 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=551460, ups=1.11, wpb=495878, bsz=16742.1, num_updates=50400, lr=0.000281718, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=46767 epoch 030: 1495 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=551460, ups=1.11, wpb=495878, bsz=16742.1, num_updates=50400, lr=0.000281718, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=46767 epoch 030: 1495 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=551460, ups=1.11, wpb=495878, bsz=16742.1, num_updates=50400, lr=0.000281718, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=46767 epoch 030: 1495 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=551460, ups=1.11, wpb=495878, bsz=16742.1, num_updates=50400, lr=0.000281718, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=46767 epoch 030: 1495 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=551460, ups=1.11, wpb=495878, bsz=16742.1, num_updates=50400, lr=0.000281718, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=46767 epoch 030: 1495 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=551460, ups=1.11, wpb=495878, bsz=16742.1, num_updates=50400, lr=0.000281718, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=46767 epoch 030: 1495 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=551460, ups=1.11, wpb=495878, bsz=16742.1, num_updates=50400, lr=0.000281718, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=46767 epoch 030: 1495 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=551460, ups=1.11, wpb=495878, bsz=16742.1, num_updates=50400, lr=0.000281718, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=46767 epoch 030: 1495 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=551460, ups=1.11, wpb=495878, bsz=16742.1, num_updates=50400, lr=0.000281718, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=46767 epoch 030: 1495 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=551460, ups=1.11, wpb=495878, bsz=16742.1, num_updates=50400, lr=0.000281718, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=46767 epoch 030: 1495 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=551460, ups=1.11, wpb=495878, bsz=16742.1, num_updates=50400, lr=0.000281718, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=46767 epoch 030: 1495 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=551460, ups=1.11, wpb=495878, bsz=16742.1, num_updates=50400, lr=0.000281718, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=46767 epoch 030: 1495 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=551460, ups=1.11, wpb=495878, bsz=16742.1, num_updates=50400, lr=0.000281718, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=46767 epoch 030: 1495 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=551460, ups=1.11, wpb=495878, bsz=16742.1, num_updates=50400, lr=0.000281718, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=46767 epoch 030: 1495 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=551460, ups=1.11, wpb=495878, bsz=16742.1, num_updates=50400, lr=0.000281718, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=46767 epoch 030: 1495 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=551460, ups=1.11, wpb=495878, bsz=16742.1, num_updates=50400, lr=0.000281718, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=46767 epoch 030: 1495 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=551460, ups=1.11, wpb=495878, bsz=16742.1, num_updates=50400, lr=0.000281718, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=46767 epoch 030: 1495 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=551460, ups=1.11, wpb=495878, bsz=16742.1, num_updates=50400, lr=0.000281718, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=46767 epoch 030: 1495 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=551460, ups=1.11, wpb=495878, bsz=16742.1, num_updates=50400, lr=0.000281718, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=46767 epoch 030: 1495 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=551460, ups=1.11, wpb=495878, bsz=16742.1, num_updates=50400, lr=0.000281718, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=46767 epoch 030: 1495 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=551460, ups=1.11, wpb=495878, bsz=16742.1, num_updates=50400, lr=0.000281718, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=46767 epoch 030: 1495 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=551460, ups=1.11, wpb=495878, bsz=16742.1, num_updates=50400, lr=0.000281718, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=46767 epoch 030: 1495 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=551460, ups=1.11, wpb=495878, bsz=16742.1, num_updates=50400, lr=0.000281718, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=46767 epoch 030: 1495 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=551460, ups=1.11, wpb=495878, bsz=16742.1, num_updates=50400, lr=0.000281718, gnorm=0.159, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=46767 epoch 030: 1595 / 1689 loss=3.672, nll_loss=2.136, ppl=4.4, wps=552527, ups=1.11, wpb=496866, bsz=16518.6, num_updates=50500, lr=0.000281439, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=46857 epoch 030: 1595 / 1689 loss=3.672, nll_loss=2.136, ppl=4.4, wps=552527, ups=1.11, wpb=496866, bsz=16518.6, num_updates=50500, lr=0.000281439, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=46857 epoch 030: 1595 / 1689 loss=3.672, nll_loss=2.136, ppl=4.4, wps=552527, ups=1.11, wpb=496866, bsz=16518.6, num_updates=50500, lr=0.000281439, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=46857 epoch 030: 1595 / 1689 loss=3.672, nll_loss=2.136, ppl=4.4, wps=552527, ups=1.11, wpb=496866, bsz=16518.6, num_updates=50500, lr=0.000281439, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=46857 epoch 030: 1595 / 1689 loss=3.672, nll_loss=2.136, ppl=4.4, wps=552527, ups=1.11, wpb=496866, bsz=16518.6, num_updates=50500, lr=0.000281439, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=46857 epoch 030: 1595 / 1689 loss=3.672, nll_loss=2.136, ppl=4.4, wps=552527, ups=1.11, wpb=496866, bsz=16518.6, num_updates=50500, lr=0.000281439, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=46857 epoch 030: 1595 / 1689 loss=3.672, nll_loss=2.136, ppl=4.4, wps=552527, ups=1.11, wpb=496866, bsz=16518.6, num_updates=50500, lr=0.000281439, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=46857 epoch 030: 1595 / 1689 loss=3.672, nll_loss=2.136, ppl=4.4, wps=552527, ups=1.11, wpb=496866, bsz=16518.6, num_updates=50500, lr=0.000281439, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=46857 epoch 030: 1595 / 1689 loss=3.672, nll_loss=2.136, ppl=4.4, wps=552527, ups=1.11, wpb=496866, bsz=16518.6, num_updates=50500, lr=0.000281439, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=46857 epoch 030: 1595 / 1689 loss=3.672, nll_loss=2.136, ppl=4.4, wps=552527, ups=1.11, wpb=496866, bsz=16518.6, num_updates=50500, lr=0.000281439, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=46857 epoch 030: 1595 / 1689 loss=3.672, nll_loss=2.136, ppl=4.4, wps=552527, ups=1.11, wpb=496866, bsz=16518.6, num_updates=50500, lr=0.000281439, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=46857 epoch 030: 1595 / 1689 loss=3.672, nll_loss=2.136, ppl=4.4, wps=552527, ups=1.11, wpb=496866, bsz=16518.6, num_updates=50500, lr=0.000281439, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=46857 epoch 030: 1595 / 1689 loss=3.672, nll_loss=2.136, ppl=4.4, wps=552527, ups=1.11, wpb=496866, bsz=16518.6, num_updates=50500, lr=0.000281439, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=46857 epoch 030: 1595 / 1689 loss=3.672, nll_loss=2.136, ppl=4.4, wps=552527, ups=1.11, wpb=496866, bsz=16518.6, num_updates=50500, lr=0.000281439, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=46857 epoch 030: 1595 / 1689 loss=3.672, nll_loss=2.136, ppl=4.4, wps=552527, ups=1.11, wpb=496866, bsz=16518.6, num_updates=50500, lr=0.000281439, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=46857 epoch 030: 1595 / 1689 loss=3.672, nll_loss=2.136, ppl=4.4, wps=552527, ups=1.11, wpb=496866, bsz=16518.6, num_updates=50500, lr=0.000281439, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=46857 epoch 030: 1595 / 1689 loss=3.672, nll_loss=2.136, ppl=4.4, wps=552527, ups=1.11, wpb=496866, bsz=16518.6, num_updates=50500, lr=0.000281439, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=46857 epoch 030: 1595 / 1689 loss=3.672, nll_loss=2.136, ppl=4.4, wps=552527, ups=1.11, wpb=496866, bsz=16518.6, num_updates=50500, lr=0.000281439, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=46857 epoch 030: 1595 / 1689 loss=3.672, nll_loss=2.136, ppl=4.4, wps=552527, ups=1.11, wpb=496866, bsz=16518.6, num_updates=50500, lr=0.000281439, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=46857 epoch 030: 1595 / 1689 loss=3.672, nll_loss=2.136, ppl=4.4, wps=552527, ups=1.11, wpb=496866, bsz=16518.6, num_updates=50500, lr=0.000281439, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=46857 epoch 030: 1595 / 1689 loss=3.672, nll_loss=2.136, ppl=4.4, wps=552527, ups=1.11, wpb=496866, bsz=16518.6, num_updates=50500, lr=0.000281439, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=46857 epoch 030: 1595 / 1689 loss=3.672, nll_loss=2.136, ppl=4.4, wps=552527, ups=1.11, wpb=496866, bsz=16518.6, num_updates=50500, lr=0.000281439, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=46857 epoch 030: 1595 / 1689 loss=3.672, nll_loss=2.136, ppl=4.4, wps=552527, ups=1.11, wpb=496866, bsz=16518.6, num_updates=50500, lr=0.000281439, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=46857 epoch 030: 1595 / 1689 loss=3.672, nll_loss=2.136, ppl=4.4, wps=552527, ups=1.11, wpb=496866, bsz=16518.6, num_updates=50500, lr=0.000281439, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=46857 epoch 030: 1595 / 1689 loss=3.672, nll_loss=2.136, ppl=4.4, wps=552527, ups=1.11, wpb=496866, bsz=16518.6, num_updates=50500, lr=0.000281439, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=46857 epoch 030: 1595 / 1689 loss=3.672, nll_loss=2.136, ppl=4.4, wps=552527, ups=1.11, wpb=496866, bsz=16518.6, num_updates=50500, lr=0.000281439, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=46857 epoch 030: 1595 / 1689 loss=3.672, nll_loss=2.136, ppl=4.4, wps=552527, ups=1.11, wpb=496866, bsz=16518.6, num_updates=50500, lr=0.000281439, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=46857 epoch 030: 1595 / 1689 loss=3.672, nll_loss=2.136, ppl=4.4, wps=552527, ups=1.11, wpb=496866, bsz=16518.6, num_updates=50500, lr=0.000281439, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=46857 epoch 030: 1595 / 1689 loss=3.672, nll_loss=2.136, ppl=4.4, wps=552527, ups=1.11, wpb=496866, bsz=16518.6, num_updates=50500, lr=0.000281439, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=46857 epoch 030: 1595 / 1689 loss=3.672, nll_loss=2.136, ppl=4.4, wps=552527, ups=1.11, wpb=496866, bsz=16518.6, num_updates=50500, lr=0.000281439, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=46857 end of epoch 30 (average epoch stats below) epoch 030 | loss 3.673 | nll_loss 2.137 | ppl 4.4 | wps 536603 | ups 1.08 | wpb 495122 | bsz 16505.2 | num_updates 50594 | lr 0.000281177 | gnorm 0.169 | clip 0 | loss_scale 2 | train_wall 1502 | gb_free 23.6 | wall 46940 epoch 030 | loss 3.673 | nll_loss 2.137 | ppl 4.4 | wps 536603 | ups 1.08 | wpb 495122 | bsz 16505.2 | num_updates 50594 | lr 0.000281177 | gnorm 0.169 | clip 0 | loss_scale 2 | train_wall 1502 | gb_free 23.6 | wall 46940 epoch 030 | loss 3.673 | nll_loss 2.137 | ppl 4.4 | wps 536603 | ups 1.08 | wpb 495122 | bsz 16505.2 | num_updates 50594 | lr 0.000281177 | gnorm 0.169 | clip 0 | loss_scale 2 | train_wall 1502 | gb_free 23.6 | wall 46940 epoch 030 | loss 3.673 | nll_loss 2.137 | ppl 4.4 | wps 536603 | ups 1.08 | wpb 495122 | bsz 16505.2 | num_updates 50594 | lr 0.000281177 | gnorm 0.169 | clip 0 | loss_scale 2 | train_wall 1502 | gb_free 23.6 | wall 46940 epoch 030 | loss 3.673 | nll_loss 2.137 | ppl 4.4 | wps 536603 | ups 1.08 | wpb 495122 | bsz 16505.2 | num_updates 50594 | lr 0.000281177 | gnorm 0.169 | clip 0 | loss_scale 2 | train_wall 1502 | gb_free 23.6 | wall 46940 epoch 030 | loss 3.673 | nll_loss 2.137 | ppl 4.4 | wps 536603 | ups 1.08 | wpb 495122 | bsz 16505.2 | num_updates 50594 | lr 0.000281177 | gnorm 0.169 | clip 0 | loss_scale 2 | train_wall 1502 | gb_free 23.6 | wall 46940 epoch 030 | loss 3.673 | nll_loss 2.137 | ppl 4.4 | wps 536603 | ups 1.08 | wpb 495122 | bsz 16505.2 | num_updates 50594 | lr 0.000281177 | gnorm 0.169 | clip 0 | loss_scale 2 | train_wall 1502 | gb_free 23.6 | wall 46940 epoch 030 | loss 3.673 | nll_loss 2.137 | ppl 4.4 | wps 536603 | ups 1.08 | wpb 495122 | bsz 16505.2 | num_updates 50594 | lr 0.000281177 | gnorm 0.169 | clip 0 | loss_scale 2 | train_wall 1502 | gb_free 23.6 | wall 46940 epoch 030 | loss 3.673 | nll_loss 2.137 | ppl 4.4 | wps 536603 | ups 1.08 | wpb 495122 | bsz 16505.2 | num_updates 50594 | lr 0.000281177 | gnorm 0.169 | clip 0 | loss_scale 2 | train_wall 1502 | gb_free 23.6 | wall 46940 epoch 030 | loss 3.673 | nll_loss 2.137 | ppl 4.4 | wps 536603 | ups 1.08 | wpb 495122 | bsz 16505.2 | num_updates 50594 | lr 0.000281177 | gnorm 0.169 | clip 0 | loss_scale 2 | train_wall 1502 | gb_free 23.6 | wall 46940 epoch 030 | loss 3.673 | nll_loss 2.137 | ppl 4.4 | wps 536603 | ups 1.08 | wpb 495122 | bsz 16505.2 | num_updates 50594 | lr 0.000281177 | gnorm 0.169 | clip 0 | loss_scale 2 | train_wall 1502 | gb_free 23.6 | wall 46940 epoch 030 | loss 3.673 | nll_loss 2.137 | ppl 4.4 | wps 536603 | ups 1.08 | wpb 495122 | bsz 16505.2 | num_updates 50594 | lr 0.000281177 | gnorm 0.169 | clip 0 | loss_scale 2 | train_wall 1502 | gb_free 23.6 | wall 46940 epoch 030 | loss 3.673 | nll_loss 2.137 | ppl 4.4 | wps 536603 | ups 1.08 | wpb 495122 | bsz 16505.2 | num_updates 50594 | lr 0.000281177 | gnorm 0.169 | clip 0 | loss_scale 2 | train_wall 1502 | gb_free 23.6 | wall 46940 epoch 030 | loss 3.673 | nll_loss 2.137 | ppl 4.4 | wps 536603 | ups 1.08 | wpb 495122 | bsz 16505.2 | num_updates 50594 | lr 0.000281177 | gnorm 0.169 | clip 0 | loss_scale 2 | train_wall 1502 | gb_free 23.6 | wall 46940 epoch 030 | loss 3.673 | nll_loss 2.137 | ppl 4.4 | wps 536603 | ups 1.08 | wpb 495122 | bsz 16505.2 | num_updates 50594 | lr 0.000281177 | gnorm 0.169 | clip 0 | loss_scale 2 | train_wall 1502 | gb_free 23.6 | wall 46940 epoch 030 | loss 3.673 | nll_loss 2.137 | ppl 4.4 | wps 536603 | ups 1.08 | wpb 495122 | bsz 16505.2 | num_updates 50594 | lr 0.000281177 | gnorm 0.169 | clip 0 | loss_scale 2 | train_wall 1502 | gb_free 23.6 | wall 46940 epoch 030 | loss 3.673 | nll_loss 2.137 | ppl 4.4 | wps 536603 | ups 1.08 | wpb 495122 | bsz 16505.2 | num_updates 50594 | lr 0.000281177 | gnorm 0.169 | clip 0 | loss_scale 2 | train_wall 1502 | gb_free 23.6 | wall 46940 epoch 030 | loss 3.673 | nll_loss 2.137 | ppl 4.4 | wps 536603 | ups 1.08 | wpb 495122 | bsz 16505.2 | num_updates 50594 | lr 0.000281177 | gnorm 0.169 | clip 0 | loss_scale 2 | train_wall 1502 | gb_free 23.6 | wall 46940 epoch 030 | loss 3.673 | nll_loss 2.137 | ppl 4.4 | wps 536603 | ups 1.08 | wpb 495122 | bsz 16505.2 | num_updates 50594 | lr 0.000281177 | gnorm 0.169 | clip 0 | loss_scale 2 | train_wall 1502 | gb_free 23.6 | wall 46940 epoch 030 | loss 3.673 | nll_loss 2.137 | ppl 4.4 | wps 536603 | ups 1.08 | wpb 495122 | bsz 16505.2 | num_updates 50594 | lr 0.000281177 | gnorm 0.169 | clip 0 | loss_scale 2 | train_wall 1502 | gb_free 23.6 | wall 46940 epoch 030 | loss 3.673 | nll_loss 2.137 | ppl 4.4 | wps 536603 | ups 1.08 | wpb 495122 | bsz 16505.2 | num_updates 50594 | lr 0.000281177 | gnorm 0.169 | clip 0 | loss_scale 2 | train_wall 1502 | gb_free 23.6 | wall 46940 epoch 030 | loss 3.673 | nll_loss 2.137 | ppl 4.4 | wps 536603 | ups 1.08 | wpb 495122 | bsz 16505.2 | num_updates 50594 | lr 0.000281177 | gnorm 0.169 | clip 0 | loss_scale 2 | train_wall 1502 | gb_free 23.6 | wall 46940 epoch 030 | loss 3.673 | nll_loss 2.137 | ppl 4.4 | wps 536603 | ups 1.08 | wpb 495122 | bsz 16505.2 | num_updates 50594 | lr 0.000281177 | gnorm 0.169 | clip 0 | loss_scale 2 | train_wall 1502 | gb_free 23.6 | wall 46940 epoch 030 | loss 3.673 | nll_loss 2.137 | ppl 4.4 | wps 536603 | ups 1.08 | wpb 495122 | bsz 16505.2 | num_updates 50594 | lr 0.000281177 | gnorm 0.169 | clip 0 | loss_scale 2 | train_wall 1502 | gb_free 23.6 | wall 46940 epoch 030 | loss 3.673 | nll_loss 2.137 | ppl 4.4 | wps 536603 | ups 1.08 | wpb 495122 | bsz 16505.2 | num_updates 50594 | lr 0.000281177 | gnorm 0.169 | clip 0 | loss_scale 2 | train_wall 1502 | gb_free 23.6 | wall 46940 epoch 030 | loss 3.673 | nll_loss 2.137 | ppl 4.4 | wps 536603 | ups 1.08 | wpb 495122 | bsz 16505.2 | num_updates 50594 | lr 0.000281177 | gnorm 0.169 | clip 0 | loss_scale 2 | train_wall 1502 | gb_free 23.6 | wall 46940 epoch 030 | loss 3.673 | nll_loss 2.137 | ppl 4.4 | wps 536603 | ups 1.08 | wpb 495122 | bsz 16505.2 | num_updates 50594 | lr 0.000281177 | gnorm 0.169 | clip 0 | loss_scale 2 | train_wall 1502 | gb_free 23.6 | wall 46940 epoch 030 | loss 3.673 | nll_loss 2.137 | ppl 4.4 | wps 536603 | ups 1.08 | wpb 495122 | bsz 16505.2 | num_updates 50594 | lr 0.000281177 | gnorm 0.169 | clip 0 | loss_scale 2 | train_wall 1502 | gb_free 23.6 | wall 46940 epoch 030 | loss 3.673 | nll_loss 2.137 | ppl 4.4 | wps 536603 | ups 1.08 | wpb 495122 | bsz 16505.2 | num_updates 50594 | lr 0.000281177 | gnorm 0.169 | clip 0 | loss_scale 2 | train_wall 1502 | gb_free 23.6 | wall 46940 epoch 030 | loss 3.673 | nll_loss 2.137 | ppl 4.4 | wps 536603 | ups 1.08 | wpb 495122 | bsz 16505.2 | num_updates 50594 | lr 0.000281177 | gnorm 0.169 | clip 0 | loss_scale 2 | train_wall 1502 | gb_free 23.6 | wall 46940 Start iterating over samples epoch 031: 6 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=538925, ups=1.1, wpb=490414, bsz=16399.8, num_updates=50600, lr=0.000281161, gnorm=0.179, clip=0, loss_scale=2, train_wall=87, gb_free=21.6, wall=46948 epoch 031: 6 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=538925, ups=1.1, wpb=490414, bsz=16399.8, num_updates=50600, lr=0.000281161, gnorm=0.179, clip=0, loss_scale=2, train_wall=87, gb_free=21.6, wall=46948 epoch 031: 6 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=538925, ups=1.1, wpb=490414, bsz=16399.8, num_updates=50600, lr=0.000281161, gnorm=0.179, clip=0, loss_scale=2, train_wall=87, gb_free=21.6, wall=46948 epoch 031: 6 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=538925, ups=1.1, wpb=490414, bsz=16399.8, num_updates=50600, lr=0.000281161, gnorm=0.179, clip=0, loss_scale=2, train_wall=87, gb_free=21.6, wall=46948 epoch 031: 6 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=538925, ups=1.1, wpb=490414, bsz=16399.8, num_updates=50600, lr=0.000281161, gnorm=0.179, clip=0, loss_scale=2, train_wall=87, gb_free=21.6, wall=46948 epoch 031: 6 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=538925, ups=1.1, wpb=490414, bsz=16399.8, num_updates=50600, lr=0.000281161, gnorm=0.179, clip=0, loss_scale=2, train_wall=87, gb_free=21.6, wall=46948 epoch 031: 6 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=538925, ups=1.1, wpb=490414, bsz=16399.8, num_updates=50600, lr=0.000281161, gnorm=0.179, clip=0, loss_scale=2, train_wall=87, gb_free=21.6, wall=46948 epoch 031: 6 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=538925, ups=1.1, wpb=490414, bsz=16399.8, num_updates=50600, lr=0.000281161, gnorm=0.179, clip=0, loss_scale=2, train_wall=87, gb_free=21.6, wall=46948 epoch 031: 6 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=538925, ups=1.1, wpb=490414, bsz=16399.8, num_updates=50600, lr=0.000281161, gnorm=0.179, clip=0, loss_scale=2, train_wall=87, gb_free=21.6, wall=46948 epoch 031: 6 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=538925, ups=1.1, wpb=490414, bsz=16399.8, num_updates=50600, lr=0.000281161, gnorm=0.179, clip=0, loss_scale=2, train_wall=87, gb_free=21.6, wall=46948 epoch 031: 6 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=538925, ups=1.1, wpb=490414, bsz=16399.8, num_updates=50600, lr=0.000281161, gnorm=0.179, clip=0, loss_scale=2, train_wall=87, gb_free=21.6, wall=46948 epoch 031: 6 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=538925, ups=1.1, wpb=490414, bsz=16399.8, num_updates=50600, lr=0.000281161, gnorm=0.179, clip=0, loss_scale=2, train_wall=87, gb_free=21.6, wall=46948 epoch 031: 6 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=538925, ups=1.1, wpb=490414, bsz=16399.8, num_updates=50600, lr=0.000281161, gnorm=0.179, clip=0, loss_scale=2, train_wall=87, gb_free=21.6, wall=46948 epoch 031: 6 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=538925, ups=1.1, wpb=490414, bsz=16399.8, num_updates=50600, lr=0.000281161, gnorm=0.179, clip=0, loss_scale=2, train_wall=87, gb_free=21.6, wall=46948 epoch 031: 6 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=538925, ups=1.1, wpb=490414, bsz=16399.8, num_updates=50600, lr=0.000281161, gnorm=0.179, clip=0, loss_scale=2, train_wall=87, gb_free=21.6, wall=46948 epoch 031: 6 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=538925, ups=1.1, wpb=490414, bsz=16399.8, num_updates=50600, lr=0.000281161, gnorm=0.179, clip=0, loss_scale=2, train_wall=87, gb_free=21.6, wall=46948 epoch 031: 6 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=538925, ups=1.1, wpb=490414, bsz=16399.8, num_updates=50600, lr=0.000281161, gnorm=0.179, clip=0, loss_scale=2, train_wall=87, gb_free=21.6, wall=46948 epoch 031: 6 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=538925, ups=1.1, wpb=490414, bsz=16399.8, num_updates=50600, lr=0.000281161, gnorm=0.179, clip=0, loss_scale=2, train_wall=87, gb_free=21.6, wall=46948 epoch 031: 6 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=538925, ups=1.1, wpb=490414, bsz=16399.8, num_updates=50600, lr=0.000281161, gnorm=0.179, clip=0, loss_scale=2, train_wall=87, gb_free=21.6, wall=46948 epoch 031: 6 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=538925, ups=1.1, wpb=490414, bsz=16399.8, num_updates=50600, lr=0.000281161, gnorm=0.179, clip=0, loss_scale=2, train_wall=87, gb_free=21.6, wall=46948 epoch 031: 6 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=538925, ups=1.1, wpb=490414, bsz=16399.8, num_updates=50600, lr=0.000281161, gnorm=0.179, clip=0, loss_scale=2, train_wall=87, gb_free=21.6, wall=46948 epoch 031: 6 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=538925, ups=1.1, wpb=490414, bsz=16399.8, num_updates=50600, lr=0.000281161, gnorm=0.179, clip=0, loss_scale=2, train_wall=87, gb_free=21.6, wall=46948 epoch 031: 6 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=538925, ups=1.1, wpb=490414, bsz=16399.8, num_updates=50600, lr=0.000281161, gnorm=0.179, clip=0, loss_scale=2, train_wall=87, gb_free=21.6, wall=46948 epoch 031: 6 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=538925, ups=1.1, wpb=490414, bsz=16399.8, num_updates=50600, lr=0.000281161, gnorm=0.179, clip=0, loss_scale=2, train_wall=87, gb_free=21.6, wall=46948 epoch 031: 6 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=538925, ups=1.1, wpb=490414, bsz=16399.8, num_updates=50600, lr=0.000281161, gnorm=0.179, clip=0, loss_scale=2, train_wall=87, gb_free=21.6, wall=46948 epoch 031: 6 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=538925, ups=1.1, wpb=490414, bsz=16399.8, num_updates=50600, lr=0.000281161, gnorm=0.179, clip=0, loss_scale=2, train_wall=87, gb_free=21.6, wall=46948 epoch 031: 6 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=538925, ups=1.1, wpb=490414, bsz=16399.8, num_updates=50600, lr=0.000281161, gnorm=0.179, clip=0, loss_scale=2, train_wall=87, gb_free=21.6, wall=46948 epoch 031: 6 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=538925, ups=1.1, wpb=490414, bsz=16399.8, num_updates=50600, lr=0.000281161, gnorm=0.179, clip=0, loss_scale=2, train_wall=87, gb_free=21.6, wall=46948 epoch 031: 6 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=538925, ups=1.1, wpb=490414, bsz=16399.8, num_updates=50600, lr=0.000281161, gnorm=0.179, clip=0, loss_scale=2, train_wall=87, gb_free=21.6, wall=46948 epoch 031: 6 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=538925, ups=1.1, wpb=490414, bsz=16399.8, num_updates=50600, lr=0.000281161, gnorm=0.179, clip=0, loss_scale=2, train_wall=87, gb_free=21.6, wall=46948 epoch 031: 6 / 1689 loss=3.681, nll_loss=2.145, ppl=4.42, wps=538925, ups=1.1, wpb=490414, bsz=16399.8, num_updates=50600, lr=0.000281161, gnorm=0.179, clip=0, loss_scale=2, train_wall=87, gb_free=21.6, wall=46948 epoch 031: 106 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=552084, ups=1.12, wpb=494675, bsz=16431.9, num_updates=50700, lr=0.000280883, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=47038 epoch 031: 106 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=552084, ups=1.12, wpb=494675, bsz=16431.9, num_updates=50700, lr=0.000280883, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=47038 epoch 031: 106 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=552084, ups=1.12, wpb=494675, bsz=16431.9, num_updates=50700, lr=0.000280883, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=47038 epoch 031: 106 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=552084, ups=1.12, wpb=494675, bsz=16431.9, num_updates=50700, lr=0.000280883, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=47038 epoch 031: 106 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=552084, ups=1.12, wpb=494675, bsz=16431.9, num_updates=50700, lr=0.000280883, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=47038 epoch 031: 106 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=552084, ups=1.12, wpb=494675, bsz=16431.9, num_updates=50700, lr=0.000280883, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=47038 epoch 031: 106 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=552084, ups=1.12, wpb=494675, bsz=16431.9, num_updates=50700, lr=0.000280883, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=47038 epoch 031: 106 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=552084, ups=1.12, wpb=494675, bsz=16431.9, num_updates=50700, lr=0.000280883, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=47038 epoch 031: 106 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=552084, ups=1.12, wpb=494675, bsz=16431.9, num_updates=50700, lr=0.000280883, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=47038 epoch 031: 106 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=552084, ups=1.12, wpb=494675, bsz=16431.9, num_updates=50700, lr=0.000280883, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=47038 epoch 031: 106 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=552084, ups=1.12, wpb=494675, bsz=16431.9, num_updates=50700, lr=0.000280883, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=47038 epoch 031: 106 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=552084, ups=1.12, wpb=494675, bsz=16431.9, num_updates=50700, lr=0.000280883, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=47038 epoch 031: 106 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=552084, ups=1.12, wpb=494675, bsz=16431.9, num_updates=50700, lr=0.000280883, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=47038 epoch 031: 106 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=552084, ups=1.12, wpb=494675, bsz=16431.9, num_updates=50700, lr=0.000280883, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=47038 epoch 031: 106 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=552084, ups=1.12, wpb=494675, bsz=16431.9, num_updates=50700, lr=0.000280883, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=47038 epoch 031: 106 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=552084, ups=1.12, wpb=494675, bsz=16431.9, num_updates=50700, lr=0.000280883, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=47038 epoch 031: 106 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=552084, ups=1.12, wpb=494675, bsz=16431.9, num_updates=50700, lr=0.000280883, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=47038 epoch 031: 106 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=552084, ups=1.12, wpb=494675, bsz=16431.9, num_updates=50700, lr=0.000280883, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=47038 epoch 031: 106 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=552084, ups=1.12, wpb=494675, bsz=16431.9, num_updates=50700, lr=0.000280883, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=47038 epoch 031: 106 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=552084, ups=1.12, wpb=494675, bsz=16431.9, num_updates=50700, lr=0.000280883, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=47038 epoch 031: 106 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=552084, ups=1.12, wpb=494675, bsz=16431.9, num_updates=50700, lr=0.000280883, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=47038 epoch 031: 106 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=552084, ups=1.12, wpb=494675, bsz=16431.9, num_updates=50700, lr=0.000280883, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=47038 epoch 031: 106 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=552084, ups=1.12, wpb=494675, bsz=16431.9, num_updates=50700, lr=0.000280883, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=47038 epoch 031: 106 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=552084, ups=1.12, wpb=494675, bsz=16431.9, num_updates=50700, lr=0.000280883, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=47038 epoch 031: 106 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=552084, ups=1.12, wpb=494675, bsz=16431.9, num_updates=50700, lr=0.000280883, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=47038 epoch 031: 106 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=552084, ups=1.12, wpb=494675, bsz=16431.9, num_updates=50700, lr=0.000280883, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=47038 epoch 031: 106 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=552084, ups=1.12, wpb=494675, bsz=16431.9, num_updates=50700, lr=0.000280883, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=47038 epoch 031: 106 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=552084, ups=1.12, wpb=494675, bsz=16431.9, num_updates=50700, lr=0.000280883, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=47038 epoch 031: 106 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=552084, ups=1.12, wpb=494675, bsz=16431.9, num_updates=50700, lr=0.000280883, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=47038 epoch 031: 106 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=552084, ups=1.12, wpb=494675, bsz=16431.9, num_updates=50700, lr=0.000280883, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=47038 epoch 031: 106 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=552084, ups=1.12, wpb=494675, bsz=16431.9, num_updates=50700, lr=0.000280883, gnorm=0.163, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=47038 epoch 031: 206 / 1689 loss=3.658, nll_loss=2.119, ppl=4.34, wps=546602, ups=1.1, wpb=495390, bsz=16532, num_updates=50800, lr=0.000280607, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=47128 epoch 031: 206 / 1689 loss=3.658, nll_loss=2.119, ppl=4.34, wps=546602, ups=1.1, wpb=495390, bsz=16532, num_updates=50800, lr=0.000280607, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=47128 epoch 031: 206 / 1689 loss=3.658, nll_loss=2.119, ppl=4.34, wps=546602, ups=1.1, wpb=495390, bsz=16532, num_updates=50800, lr=0.000280607, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=47128 epoch 031: 206 / 1689 loss=3.658, nll_loss=2.119, ppl=4.34, wps=546602, ups=1.1, wpb=495390, bsz=16532, num_updates=50800, lr=0.000280607, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=47128 epoch 031: 206 / 1689 loss=3.658, nll_loss=2.119, ppl=4.34, wps=546602, ups=1.1, wpb=495390, bsz=16532, num_updates=50800, lr=0.000280607, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=47128 epoch 031: 206 / 1689 loss=3.658, nll_loss=2.119, ppl=4.34, wps=546602, ups=1.1, wpb=495390, bsz=16532, num_updates=50800, lr=0.000280607, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=47128 epoch 031: 206 / 1689 loss=3.658, nll_loss=2.119, ppl=4.34, wps=546602, ups=1.1, wpb=495390, bsz=16532, num_updates=50800, lr=0.000280607, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=47128 epoch 031: 206 / 1689 loss=3.658, nll_loss=2.119, ppl=4.34, wps=546602, ups=1.1, wpb=495390, bsz=16532, num_updates=50800, lr=0.000280607, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=47128 epoch 031: 206 / 1689 loss=3.658, nll_loss=2.119, ppl=4.34, wps=546602, ups=1.1, wpb=495390, bsz=16532, num_updates=50800, lr=0.000280607, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=47128 epoch 031: 206 / 1689 loss=3.658, nll_loss=2.119, ppl=4.34, wps=546602, ups=1.1, wpb=495390, bsz=16532, num_updates=50800, lr=0.000280607, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=47128 epoch 031: 206 / 1689 loss=3.658, nll_loss=2.119, ppl=4.34, wps=546602, ups=1.1, wpb=495390, bsz=16532, num_updates=50800, lr=0.000280607, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=47128 epoch 031: 206 / 1689 loss=3.658, nll_loss=2.119, ppl=4.34, wps=546602, ups=1.1, wpb=495390, bsz=16532, num_updates=50800, lr=0.000280607, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=47128 epoch 031: 206 / 1689 loss=3.658, nll_loss=2.119, ppl=4.34, wps=546602, ups=1.1, wpb=495390, bsz=16532, num_updates=50800, lr=0.000280607, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=47128 epoch 031: 206 / 1689 loss=3.658, nll_loss=2.119, ppl=4.34, wps=546602, ups=1.1, wpb=495390, bsz=16532, num_updates=50800, lr=0.000280607, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=47128 epoch 031: 206 / 1689 loss=3.658, nll_loss=2.119, ppl=4.34, wps=546602, ups=1.1, wpb=495390, bsz=16532, num_updates=50800, lr=0.000280607, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=47128 epoch 031: 206 / 1689 loss=3.658, nll_loss=2.119, ppl=4.34, wps=546602, ups=1.1, wpb=495390, bsz=16532, num_updates=50800, lr=0.000280607, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=47128 epoch 031: 206 / 1689 loss=3.658, nll_loss=2.119, ppl=4.34, wps=546602, ups=1.1, wpb=495390, bsz=16532, num_updates=50800, lr=0.000280607, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=47128 epoch 031: 206 / 1689 loss=3.658, nll_loss=2.119, ppl=4.34, wps=546602, ups=1.1, wpb=495390, bsz=16532, num_updates=50800, lr=0.000280607, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=47128 epoch 031: 206 / 1689 loss=3.658, nll_loss=2.119, ppl=4.34, wps=546602, ups=1.1, wpb=495390, bsz=16532, num_updates=50800, lr=0.000280607, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=47128 epoch 031: 206 / 1689 loss=3.658, nll_loss=2.119, ppl=4.34, wps=546602, ups=1.1, wpb=495390, bsz=16532, num_updates=50800, lr=0.000280607, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=47128 epoch 031: 206 / 1689 loss=3.658, nll_loss=2.119, ppl=4.34, wps=546602, ups=1.1, wpb=495390, bsz=16532, num_updates=50800, lr=0.000280607, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=47128 epoch 031: 206 / 1689 loss=3.658, nll_loss=2.119, ppl=4.34, wps=546602, ups=1.1, wpb=495390, bsz=16532, num_updates=50800, lr=0.000280607, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=47128 epoch 031: 206 / 1689 loss=3.658, nll_loss=2.119, ppl=4.34, wps=546602, ups=1.1, wpb=495390, bsz=16532, num_updates=50800, lr=0.000280607, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=47128 epoch 031: 206 / 1689 loss=3.658, nll_loss=2.119, ppl=4.34, wps=546602, ups=1.1, wpb=495390, bsz=16532, num_updates=50800, lr=0.000280607, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=47128 epoch 031: 206 / 1689 loss=3.658, nll_loss=2.119, ppl=4.34, wps=546602, ups=1.1, wpb=495390, bsz=16532, num_updates=50800, lr=0.000280607, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=47128 epoch 031: 206 / 1689 loss=3.658, nll_loss=2.119, ppl=4.34, wps=546602, ups=1.1, wpb=495390, bsz=16532, num_updates=50800, lr=0.000280607, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=47128 epoch 031: 206 / 1689 loss=3.658, nll_loss=2.119, ppl=4.34, wps=546602, ups=1.1, wpb=495390, bsz=16532, num_updates=50800, lr=0.000280607, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=47128 epoch 031: 206 / 1689 loss=3.658, nll_loss=2.119, ppl=4.34, wps=546602, ups=1.1, wpb=495390, bsz=16532, num_updates=50800, lr=0.000280607, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=47128 epoch 031: 206 / 1689 loss=3.658, nll_loss=2.119, ppl=4.34, wps=546602, ups=1.1, wpb=495390, bsz=16532, num_updates=50800, lr=0.000280607, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=47128 epoch 031: 206 / 1689 loss=3.658, nll_loss=2.119, ppl=4.34, wps=546602, ups=1.1, wpb=495390, bsz=16532, num_updates=50800, lr=0.000280607, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=47128 epoch 031: 206 / 1689 loss=3.658, nll_loss=2.119, ppl=4.34, wps=546602, ups=1.1, wpb=495390, bsz=16532, num_updates=50800, lr=0.000280607, gnorm=0.167, clip=0, loss_scale=4, train_wall=89, gb_free=22.1, wall=47128 epoch 031: 306 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=555876, ups=1.12, wpb=495516, bsz=16305.4, num_updates=50900, lr=0.000280331, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=47217 epoch 031: 306 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=555876, ups=1.12, wpb=495516, bsz=16305.4, num_updates=50900, lr=0.000280331, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=47217 epoch 031: 306 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=555876, ups=1.12, wpb=495516, bsz=16305.4, num_updates=50900, lr=0.000280331, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=47217 epoch 031: 306 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=555876, ups=1.12, wpb=495516, bsz=16305.4, num_updates=50900, lr=0.000280331, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=47217 epoch 031: 306 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=555876, ups=1.12, wpb=495516, bsz=16305.4, num_updates=50900, lr=0.000280331, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=47217 epoch 031: 306 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=555876, ups=1.12, wpb=495516, bsz=16305.4, num_updates=50900, lr=0.000280331, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=47217 epoch 031: 306 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=555876, ups=1.12, wpb=495516, bsz=16305.4, num_updates=50900, lr=0.000280331, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=47217 epoch 031: 306 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=555876, ups=1.12, wpb=495516, bsz=16305.4, num_updates=50900, lr=0.000280331, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=47217 epoch 031: 306 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=555876, ups=1.12, wpb=495516, bsz=16305.4, num_updates=50900, lr=0.000280331, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=47217 epoch 031: 306 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=555876, ups=1.12, wpb=495516, bsz=16305.4, num_updates=50900, lr=0.000280331, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=47217 epoch 031: 306 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=555876, ups=1.12, wpb=495516, bsz=16305.4, num_updates=50900, lr=0.000280331, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=47217 epoch 031: 306 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=555876, ups=1.12, wpb=495516, bsz=16305.4, num_updates=50900, lr=0.000280331, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=47217 epoch 031: 306 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=555876, ups=1.12, wpb=495516, bsz=16305.4, num_updates=50900, lr=0.000280331, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=47217 epoch 031: 306 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=555876, ups=1.12, wpb=495516, bsz=16305.4, num_updates=50900, lr=0.000280331, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=47217 epoch 031: 306 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=555876, ups=1.12, wpb=495516, bsz=16305.4, num_updates=50900, lr=0.000280331, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=47217 epoch 031: 306 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=555876, ups=1.12, wpb=495516, bsz=16305.4, num_updates=50900, lr=0.000280331, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=47217 epoch 031: 306 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=555876, ups=1.12, wpb=495516, bsz=16305.4, num_updates=50900, lr=0.000280331, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=47217 epoch 031: 306 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=555876, ups=1.12, wpb=495516, bsz=16305.4, num_updates=50900, lr=0.000280331, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=47217 epoch 031: 306 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=555876, ups=1.12, wpb=495516, bsz=16305.4, num_updates=50900, lr=0.000280331, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=47217 epoch 031: 306 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=555876, ups=1.12, wpb=495516, bsz=16305.4, num_updates=50900, lr=0.000280331, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=47217 epoch 031: 306 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=555876, ups=1.12, wpb=495516, bsz=16305.4, num_updates=50900, lr=0.000280331, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=47217 epoch 031: 306 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=555876, ups=1.12, wpb=495516, bsz=16305.4, num_updates=50900, lr=0.000280331, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=47217 epoch 031: 306 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=555876, ups=1.12, wpb=495516, bsz=16305.4, num_updates=50900, lr=0.000280331, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=47217 epoch 031: 306 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=555876, ups=1.12, wpb=495516, bsz=16305.4, num_updates=50900, lr=0.000280331, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=47217 epoch 031: 306 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=555876, ups=1.12, wpb=495516, bsz=16305.4, num_updates=50900, lr=0.000280331, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=47217 epoch 031: 306 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=555876, ups=1.12, wpb=495516, bsz=16305.4, num_updates=50900, lr=0.000280331, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=47217 epoch 031: 306 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=555876, ups=1.12, wpb=495516, bsz=16305.4, num_updates=50900, lr=0.000280331, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=47217 epoch 031: 306 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=555876, ups=1.12, wpb=495516, bsz=16305.4, num_updates=50900, lr=0.000280331, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=47217 epoch 031: 306 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=555876, ups=1.12, wpb=495516, bsz=16305.4, num_updates=50900, lr=0.000280331, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=47217 epoch 031: 306 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=555876, ups=1.12, wpb=495516, bsz=16305.4, num_updates=50900, lr=0.000280331, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=47217 epoch 031: 306 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=555876, ups=1.12, wpb=495516, bsz=16305.4, num_updates=50900, lr=0.000280331, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=47217 epoch 031: 406 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=548966, ups=1.11, wpb=493473, bsz=16433.9, num_updates=51000, lr=0.000280056, gnorm=0.172, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=47307 epoch 031: 406 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=548966, ups=1.11, wpb=493473, bsz=16433.9, num_updates=51000, lr=0.000280056, gnorm=0.172, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=47307 epoch 031: 406 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=548966, ups=1.11, wpb=493473, bsz=16433.9, num_updates=51000, lr=0.000280056, gnorm=0.172, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=47307 epoch 031: 406 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=548966, ups=1.11, wpb=493473, bsz=16433.9, num_updates=51000, lr=0.000280056, gnorm=0.172, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=47307 epoch 031: 406 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=548966, ups=1.11, wpb=493473, bsz=16433.9, num_updates=51000, lr=0.000280056, gnorm=0.172, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=47307 epoch 031: 406 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=548966, ups=1.11, wpb=493473, bsz=16433.9, num_updates=51000, lr=0.000280056, gnorm=0.172, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=47307 epoch 031: 406 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=548966, ups=1.11, wpb=493473, bsz=16433.9, num_updates=51000, lr=0.000280056, gnorm=0.172, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=47307 epoch 031: 406 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=548966, ups=1.11, wpb=493473, bsz=16433.9, num_updates=51000, lr=0.000280056, gnorm=0.172, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=47307 epoch 031: 406 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=548966, ups=1.11, wpb=493473, bsz=16433.9, num_updates=51000, lr=0.000280056, gnorm=0.172, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=47307 epoch 031: 406 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=548966, ups=1.11, wpb=493473, bsz=16433.9, num_updates=51000, lr=0.000280056, gnorm=0.172, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=47307 epoch 031: 406 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=548966, ups=1.11, wpb=493473, bsz=16433.9, num_updates=51000, lr=0.000280056, gnorm=0.172, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=47307 epoch 031: 406 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=548966, ups=1.11, wpb=493473, bsz=16433.9, num_updates=51000, lr=0.000280056, gnorm=0.172, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=47307 epoch 031: 406 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=548966, ups=1.11, wpb=493473, bsz=16433.9, num_updates=51000, lr=0.000280056, gnorm=0.172, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=47307 epoch 031: 406 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=548966, ups=1.11, wpb=493473, bsz=16433.9, num_updates=51000, lr=0.000280056, gnorm=0.172, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=47307 epoch 031: 406 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=548966, ups=1.11, wpb=493473, bsz=16433.9, num_updates=51000, lr=0.000280056, gnorm=0.172, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=47307 epoch 031: 406 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=548966, ups=1.11, wpb=493473, bsz=16433.9, num_updates=51000, lr=0.000280056, gnorm=0.172, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=47307 epoch 031: 406 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=548966, ups=1.11, wpb=493473, bsz=16433.9, num_updates=51000, lr=0.000280056, gnorm=0.172, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=47307 epoch 031: 406 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=548966, ups=1.11, wpb=493473, bsz=16433.9, num_updates=51000, lr=0.000280056, gnorm=0.172, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=47307 epoch 031: 406 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=548966, ups=1.11, wpb=493473, bsz=16433.9, num_updates=51000, lr=0.000280056, gnorm=0.172, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=47307 epoch 031: 406 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=548966, ups=1.11, wpb=493473, bsz=16433.9, num_updates=51000, lr=0.000280056, gnorm=0.172, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=47307 epoch 031: 406 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=548966, ups=1.11, wpb=493473, bsz=16433.9, num_updates=51000, lr=0.000280056, gnorm=0.172, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=47307 epoch 031: 406 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=548966, ups=1.11, wpb=493473, bsz=16433.9, num_updates=51000, lr=0.000280056, gnorm=0.172, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=47307 epoch 031: 406 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=548966, ups=1.11, wpb=493473, bsz=16433.9, num_updates=51000, lr=0.000280056, gnorm=0.172, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=47307 epoch 031: 406 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=548966, ups=1.11, wpb=493473, bsz=16433.9, num_updates=51000, lr=0.000280056, gnorm=0.172, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=47307 epoch 031: 406 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=548966, ups=1.11, wpb=493473, bsz=16433.9, num_updates=51000, lr=0.000280056, gnorm=0.172, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=47307 epoch 031: 406 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=548966, ups=1.11, wpb=493473, bsz=16433.9, num_updates=51000, lr=0.000280056, gnorm=0.172, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=47307 epoch 031: 406 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=548966, ups=1.11, wpb=493473, bsz=16433.9, num_updates=51000, lr=0.000280056, gnorm=0.172, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=47307 epoch 031: 406 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=548966, ups=1.11, wpb=493473, bsz=16433.9, num_updates=51000, lr=0.000280056, gnorm=0.172, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=47307 epoch 031: 406 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=548966, ups=1.11, wpb=493473, bsz=16433.9, num_updates=51000, lr=0.000280056, gnorm=0.172, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=47307 epoch 031: 406 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=548966, ups=1.11, wpb=493473, bsz=16433.9, num_updates=51000, lr=0.000280056, gnorm=0.172, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=47307 epoch 031: 406 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=548966, ups=1.11, wpb=493473, bsz=16433.9, num_updates=51000, lr=0.000280056, gnorm=0.172, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=47307 begin validation on "valid" subset epoch 031 | valid on 'valid' subset | loss 3.708 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.702 epoch 031 | valid on 'valid' subset | loss 3.708 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.702 epoch 031 | valid on 'valid' subset | loss 3.708 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.702 epoch 031 | valid on 'valid' subset | loss 3.708 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.702 epoch 031 | valid on 'valid' subset | loss 3.708 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.702 epoch 031 | valid on 'valid' subset | loss 3.708 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.702 epoch 031 | valid on 'valid' subset | loss 3.708 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.702 epoch 031 | valid on 'valid' subset | loss 3.708 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.702 epoch 031 | valid on 'valid' subset | loss 3.708 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.702 epoch 031 | valid on 'valid' subset | loss 3.708 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.702 epoch 031 | valid on 'valid' subset | loss 3.708 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.702 epoch 031 | valid on 'valid' subset | loss 3.708 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.702 epoch 031 | valid on 'valid' subset | loss 3.708 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.702 epoch 031 | valid on 'valid' subset | loss 3.708 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.702 epoch 031 | valid on 'valid' subset | loss 3.708 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.702 epoch 031 | valid on 'valid' subset | loss 3.708 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.702 epoch 031 | valid on 'valid' subset | loss 3.708 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.702 epoch 031 | valid on 'valid' subset | loss 3.708 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.702 epoch 031 | valid on 'valid' subset | loss 3.708 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.702 epoch 031 | valid on 'valid' subset | loss 3.708 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.702 epoch 031 | valid on 'valid' subset | loss 3.708 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.702 epoch 031 | valid on 'valid' subset | loss 3.708 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.702 epoch 031 | valid on 'valid' subset | loss 3.708 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.702 epoch 031 | valid on 'valid' subset | loss 3.708 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.702 epoch 031 | valid on 'valid' subset | loss 3.708 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.702 epoch 031 | valid on 'valid' subset | loss 3.708 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.702 epoch 031 | valid on 'valid' subset | loss 3.708 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.702 epoch 031 | valid on 'valid' subset | loss 3.708 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.702 epoch 031 | valid on 'valid' subset | loss 3.708 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.702 epoch 031 | valid on 'valid' subset | loss 3.708 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.702 epoch 031 | valid on 'valid' subset | loss 3.708 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 51000 | best_loss 3.702 epoch 031: 506 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=481603, ups=0.97, wpb=494565, bsz=16802.2, num_updates=51100, lr=0.000279782, gnorm=0.162, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=47410 epoch 031: 506 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=481603, ups=0.97, wpb=494565, bsz=16802.2, num_updates=51100, lr=0.000279782, gnorm=0.162, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=47410 epoch 031: 506 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=481603, ups=0.97, wpb=494565, bsz=16802.2, num_updates=51100, lr=0.000279782, gnorm=0.162, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=47410 epoch 031: 506 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=481603, ups=0.97, wpb=494565, bsz=16802.2, num_updates=51100, lr=0.000279782, gnorm=0.162, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=47410 epoch 031: 506 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=481603, ups=0.97, wpb=494565, bsz=16802.2, num_updates=51100, lr=0.000279782, gnorm=0.162, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=47410 epoch 031: 506 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=481603, ups=0.97, wpb=494565, bsz=16802.2, num_updates=51100, lr=0.000279782, gnorm=0.162, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=47410 epoch 031: 506 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=481603, ups=0.97, wpb=494565, bsz=16802.2, num_updates=51100, lr=0.000279782, gnorm=0.162, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=47410 epoch 031: 506 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=481603, ups=0.97, wpb=494565, bsz=16802.2, num_updates=51100, lr=0.000279782, gnorm=0.162, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=47410 epoch 031: 506 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=481603, ups=0.97, wpb=494565, bsz=16802.2, num_updates=51100, lr=0.000279782, gnorm=0.162, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=47410 epoch 031: 506 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=481603, ups=0.97, wpb=494565, bsz=16802.2, num_updates=51100, lr=0.000279782, gnorm=0.162, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=47410 epoch 031: 506 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=481603, ups=0.97, wpb=494565, bsz=16802.2, num_updates=51100, lr=0.000279782, gnorm=0.162, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=47410 epoch 031: 506 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=481603, ups=0.97, wpb=494565, bsz=16802.2, num_updates=51100, lr=0.000279782, gnorm=0.162, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=47410 epoch 031: 506 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=481603, ups=0.97, wpb=494565, bsz=16802.2, num_updates=51100, lr=0.000279782, gnorm=0.162, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=47410 epoch 031: 506 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=481603, ups=0.97, wpb=494565, bsz=16802.2, num_updates=51100, lr=0.000279782, gnorm=0.162, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=47410 epoch 031: 506 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=481603, ups=0.97, wpb=494565, bsz=16802.2, num_updates=51100, lr=0.000279782, gnorm=0.162, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=47410 epoch 031: 506 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=481603, ups=0.97, wpb=494565, bsz=16802.2, num_updates=51100, lr=0.000279782, gnorm=0.162, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=47410 epoch 031: 506 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=481603, ups=0.97, wpb=494565, bsz=16802.2, num_updates=51100, lr=0.000279782, gnorm=0.162, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=47410 epoch 031: 506 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=481603, ups=0.97, wpb=494565, bsz=16802.2, num_updates=51100, lr=0.000279782, gnorm=0.162, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=47410 epoch 031: 506 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=481603, ups=0.97, wpb=494565, bsz=16802.2, num_updates=51100, lr=0.000279782, gnorm=0.162, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=47410 epoch 031: 506 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=481603, ups=0.97, wpb=494565, bsz=16802.2, num_updates=51100, lr=0.000279782, gnorm=0.162, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=47410 epoch 031: 506 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=481603, ups=0.97, wpb=494565, bsz=16802.2, num_updates=51100, lr=0.000279782, gnorm=0.162, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=47410 epoch 031: 506 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=481603, ups=0.97, wpb=494565, bsz=16802.2, num_updates=51100, lr=0.000279782, gnorm=0.162, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=47410 epoch 031: 506 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=481603, ups=0.97, wpb=494565, bsz=16802.2, num_updates=51100, lr=0.000279782, gnorm=0.162, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=47410 epoch 031: 506 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=481603, ups=0.97, wpb=494565, bsz=16802.2, num_updates=51100, lr=0.000279782, gnorm=0.162, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=47410 epoch 031: 506 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=481603, ups=0.97, wpb=494565, bsz=16802.2, num_updates=51100, lr=0.000279782, gnorm=0.162, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=47410 epoch 031: 506 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=481603, ups=0.97, wpb=494565, bsz=16802.2, num_updates=51100, lr=0.000279782, gnorm=0.162, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=47410 epoch 031: 506 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=481603, ups=0.97, wpb=494565, bsz=16802.2, num_updates=51100, lr=0.000279782, gnorm=0.162, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=47410 epoch 031: 506 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=481603, ups=0.97, wpb=494565, bsz=16802.2, num_updates=51100, lr=0.000279782, gnorm=0.162, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=47410 epoch 031: 506 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=481603, ups=0.97, wpb=494565, bsz=16802.2, num_updates=51100, lr=0.000279782, gnorm=0.162, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=47410 epoch 031: 506 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=481603, ups=0.97, wpb=494565, bsz=16802.2, num_updates=51100, lr=0.000279782, gnorm=0.162, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=47410 epoch 031: 506 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=481603, ups=0.97, wpb=494565, bsz=16802.2, num_updates=51100, lr=0.000279782, gnorm=0.162, clip=0, loss_scale=4, train_wall=88, gb_free=22, wall=47410 epoch 031: 606 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=555490, ups=1.12, wpb=495461, bsz=16548.2, num_updates=51200, lr=0.000279508, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=47499 epoch 031: 606 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=555490, ups=1.12, wpb=495461, bsz=16548.2, num_updates=51200, lr=0.000279508, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=47499 epoch 031: 606 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=555490, ups=1.12, wpb=495461, bsz=16548.2, num_updates=51200, lr=0.000279508, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=47499 epoch 031: 606 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=555490, ups=1.12, wpb=495461, bsz=16548.2, num_updates=51200, lr=0.000279508, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=47499 epoch 031: 606 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=555490, ups=1.12, wpb=495461, bsz=16548.2, num_updates=51200, lr=0.000279508, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=47499 epoch 031: 606 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=555490, ups=1.12, wpb=495461, bsz=16548.2, num_updates=51200, lr=0.000279508, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=47499 epoch 031: 606 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=555490, ups=1.12, wpb=495461, bsz=16548.2, num_updates=51200, lr=0.000279508, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=47499 epoch 031: 606 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=555490, ups=1.12, wpb=495461, bsz=16548.2, num_updates=51200, lr=0.000279508, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=47499 epoch 031: 606 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=555490, ups=1.12, wpb=495461, bsz=16548.2, num_updates=51200, lr=0.000279508, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=47499 epoch 031: 606 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=555490, ups=1.12, wpb=495461, bsz=16548.2, num_updates=51200, lr=0.000279508, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=47499 epoch 031: 606 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=555490, ups=1.12, wpb=495461, bsz=16548.2, num_updates=51200, lr=0.000279508, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=47499 epoch 031: 606 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=555490, ups=1.12, wpb=495461, bsz=16548.2, num_updates=51200, lr=0.000279508, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=47499 epoch 031: 606 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=555490, ups=1.12, wpb=495461, bsz=16548.2, num_updates=51200, lr=0.000279508, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=47499 epoch 031: 606 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=555490, ups=1.12, wpb=495461, bsz=16548.2, num_updates=51200, lr=0.000279508, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=47499 epoch 031: 606 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=555490, ups=1.12, wpb=495461, bsz=16548.2, num_updates=51200, lr=0.000279508, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=47499 epoch 031: 606 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=555490, ups=1.12, wpb=495461, bsz=16548.2, num_updates=51200, lr=0.000279508, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=47499 epoch 031: 606 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=555490, ups=1.12, wpb=495461, bsz=16548.2, num_updates=51200, lr=0.000279508, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=47499 epoch 031: 606 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=555490, ups=1.12, wpb=495461, bsz=16548.2, num_updates=51200, lr=0.000279508, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=47499 epoch 031: 606 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=555490, ups=1.12, wpb=495461, bsz=16548.2, num_updates=51200, lr=0.000279508, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=47499 epoch 031: 606 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=555490, ups=1.12, wpb=495461, bsz=16548.2, num_updates=51200, lr=0.000279508, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=47499 epoch 031: 606 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=555490, ups=1.12, wpb=495461, bsz=16548.2, num_updates=51200, lr=0.000279508, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=47499 epoch 031: 606 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=555490, ups=1.12, wpb=495461, bsz=16548.2, num_updates=51200, lr=0.000279508, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=47499 epoch 031: 606 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=555490, ups=1.12, wpb=495461, bsz=16548.2, num_updates=51200, lr=0.000279508, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=47499 epoch 031: 606 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=555490, ups=1.12, wpb=495461, bsz=16548.2, num_updates=51200, lr=0.000279508, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=47499 epoch 031: 606 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=555490, ups=1.12, wpb=495461, bsz=16548.2, num_updates=51200, lr=0.000279508, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=47499 epoch 031: 606 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=555490, ups=1.12, wpb=495461, bsz=16548.2, num_updates=51200, lr=0.000279508, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=47499 epoch 031: 606 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=555490, ups=1.12, wpb=495461, bsz=16548.2, num_updates=51200, lr=0.000279508, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=47499 epoch 031: 606 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=555490, ups=1.12, wpb=495461, bsz=16548.2, num_updates=51200, lr=0.000279508, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=47499 epoch 031: 606 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=555490, ups=1.12, wpb=495461, bsz=16548.2, num_updates=51200, lr=0.000279508, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=47499 epoch 031: 606 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=555490, ups=1.12, wpb=495461, bsz=16548.2, num_updates=51200, lr=0.000279508, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=47499 epoch 031: 606 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=555490, ups=1.12, wpb=495461, bsz=16548.2, num_updates=51200, lr=0.000279508, gnorm=0.174, clip=0, loss_scale=4, train_wall=88, gb_free=21.6, wall=47499 epoch 031: 707 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=552196, ups=1.11, wpb=496069, bsz=16603.7, num_updates=51300, lr=0.000279236, gnorm=0.17, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=47589 epoch 031: 707 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=552196, ups=1.11, wpb=496069, bsz=16603.7, num_updates=51300, lr=0.000279236, gnorm=0.17, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=47589 epoch 031: 707 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=552196, ups=1.11, wpb=496069, bsz=16603.7, num_updates=51300, lr=0.000279236, gnorm=0.17, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=47589 epoch 031: 707 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=552196, ups=1.11, wpb=496069, bsz=16603.7, num_updates=51300, lr=0.000279236, gnorm=0.17, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=47589 epoch 031: 707 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=552196, ups=1.11, wpb=496069, bsz=16603.7, num_updates=51300, lr=0.000279236, gnorm=0.17, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=47589 epoch 031: 707 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=552196, ups=1.11, wpb=496069, bsz=16603.7, num_updates=51300, lr=0.000279236, gnorm=0.17, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=47589 epoch 031: 707 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=552196, ups=1.11, wpb=496069, bsz=16603.7, num_updates=51300, lr=0.000279236, gnorm=0.17, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=47589 epoch 031: 707 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=552196, ups=1.11, wpb=496069, bsz=16603.7, num_updates=51300, lr=0.000279236, gnorm=0.17, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=47589 epoch 031: 707 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=552196, ups=1.11, wpb=496069, bsz=16603.7, num_updates=51300, lr=0.000279236, gnorm=0.17, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=47589 epoch 031: 707 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=552196, ups=1.11, wpb=496069, bsz=16603.7, num_updates=51300, lr=0.000279236, gnorm=0.17, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=47589 epoch 031: 707 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=552196, ups=1.11, wpb=496069, bsz=16603.7, num_updates=51300, lr=0.000279236, gnorm=0.17, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=47589 epoch 031: 707 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=552196, ups=1.11, wpb=496069, bsz=16603.7, num_updates=51300, lr=0.000279236, gnorm=0.17, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=47589 epoch 031: 707 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=552196, ups=1.11, wpb=496069, bsz=16603.7, num_updates=51300, lr=0.000279236, gnorm=0.17, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=47589 epoch 031: 707 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=552196, ups=1.11, wpb=496069, bsz=16603.7, num_updates=51300, lr=0.000279236, gnorm=0.17, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=47589 epoch 031: 707 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=552196, ups=1.11, wpb=496069, bsz=16603.7, num_updates=51300, lr=0.000279236, gnorm=0.17, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=47589 epoch 031: 707 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=552196, ups=1.11, wpb=496069, bsz=16603.7, num_updates=51300, lr=0.000279236, gnorm=0.17, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=47589 epoch 031: 707 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=552196, ups=1.11, wpb=496069, bsz=16603.7, num_updates=51300, lr=0.000279236, gnorm=0.17, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=47589 epoch 031: 707 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=552196, ups=1.11, wpb=496069, bsz=16603.7, num_updates=51300, lr=0.000279236, gnorm=0.17, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=47589 epoch 031: 707 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=552196, ups=1.11, wpb=496069, bsz=16603.7, num_updates=51300, lr=0.000279236, gnorm=0.17, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=47589 epoch 031: 707 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=552196, ups=1.11, wpb=496069, bsz=16603.7, num_updates=51300, lr=0.000279236, gnorm=0.17, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=47589 epoch 031: 707 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=552196, ups=1.11, wpb=496069, bsz=16603.7, num_updates=51300, lr=0.000279236, gnorm=0.17, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=47589 epoch 031: 707 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=552196, ups=1.11, wpb=496069, bsz=16603.7, num_updates=51300, lr=0.000279236, gnorm=0.17, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=47589 epoch 031: 707 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=552196, ups=1.11, wpb=496069, bsz=16603.7, num_updates=51300, lr=0.000279236, gnorm=0.17, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=47589 epoch 031: 707 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=552196, ups=1.11, wpb=496069, bsz=16603.7, num_updates=51300, lr=0.000279236, gnorm=0.17, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=47589 epoch 031: 707 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=552196, ups=1.11, wpb=496069, bsz=16603.7, num_updates=51300, lr=0.000279236, gnorm=0.17, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=47589 epoch 031: 707 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=552196, ups=1.11, wpb=496069, bsz=16603.7, num_updates=51300, lr=0.000279236, gnorm=0.17, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=47589 epoch 031: 707 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=552196, ups=1.11, wpb=496069, bsz=16603.7, num_updates=51300, lr=0.000279236, gnorm=0.17, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=47589 epoch 031: 707 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=552196, ups=1.11, wpb=496069, bsz=16603.7, num_updates=51300, lr=0.000279236, gnorm=0.17, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=47589 epoch 031: 707 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=552196, ups=1.11, wpb=496069, bsz=16603.7, num_updates=51300, lr=0.000279236, gnorm=0.17, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=47589 epoch 031: 707 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=552196, ups=1.11, wpb=496069, bsz=16603.7, num_updates=51300, lr=0.000279236, gnorm=0.17, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=47589 epoch 031: 707 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=552196, ups=1.11, wpb=496069, bsz=16603.7, num_updates=51300, lr=0.000279236, gnorm=0.17, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=47589 epoch 031: 807 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=560035, ups=1.13, wpb=496121, bsz=16330.7, num_updates=51400, lr=0.000278964, gnorm=0.159, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=47677 epoch 031: 807 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=560035, ups=1.13, wpb=496121, bsz=16330.7, num_updates=51400, lr=0.000278964, gnorm=0.159, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=47677 epoch 031: 807 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=560035, ups=1.13, wpb=496121, bsz=16330.7, num_updates=51400, lr=0.000278964, gnorm=0.159, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=47677 epoch 031: 807 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=560035, ups=1.13, wpb=496121, bsz=16330.7, num_updates=51400, lr=0.000278964, gnorm=0.159, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=47677 epoch 031: 807 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=560035, ups=1.13, wpb=496121, bsz=16330.7, num_updates=51400, lr=0.000278964, gnorm=0.159, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=47677 epoch 031: 807 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=560035, ups=1.13, wpb=496121, bsz=16330.7, num_updates=51400, lr=0.000278964, gnorm=0.159, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=47677 epoch 031: 807 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=560035, ups=1.13, wpb=496121, bsz=16330.7, num_updates=51400, lr=0.000278964, gnorm=0.159, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=47677 epoch 031: 807 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=560035, ups=1.13, wpb=496121, bsz=16330.7, num_updates=51400, lr=0.000278964, gnorm=0.159, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=47677 epoch 031: 807 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=560035, ups=1.13, wpb=496121, bsz=16330.7, num_updates=51400, lr=0.000278964, gnorm=0.159, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=47677 epoch 031: 807 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=560035, ups=1.13, wpb=496121, bsz=16330.7, num_updates=51400, lr=0.000278964, gnorm=0.159, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=47677 epoch 031: 807 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=560035, ups=1.13, wpb=496121, bsz=16330.7, num_updates=51400, lr=0.000278964, gnorm=0.159, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=47677 epoch 031: 807 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=560035, ups=1.13, wpb=496121, bsz=16330.7, num_updates=51400, lr=0.000278964, gnorm=0.159, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=47677 epoch 031: 807 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=560035, ups=1.13, wpb=496121, bsz=16330.7, num_updates=51400, lr=0.000278964, gnorm=0.159, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=47677 epoch 031: 807 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=560035, ups=1.13, wpb=496121, bsz=16330.7, num_updates=51400, lr=0.000278964, gnorm=0.159, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=47677 epoch 031: 807 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=560035, ups=1.13, wpb=496121, bsz=16330.7, num_updates=51400, lr=0.000278964, gnorm=0.159, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=47677 epoch 031: 807 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=560035, ups=1.13, wpb=496121, bsz=16330.7, num_updates=51400, lr=0.000278964, gnorm=0.159, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=47677 epoch 031: 807 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=560035, ups=1.13, wpb=496121, bsz=16330.7, num_updates=51400, lr=0.000278964, gnorm=0.159, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=47677 epoch 031: 807 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=560035, ups=1.13, wpb=496121, bsz=16330.7, num_updates=51400, lr=0.000278964, gnorm=0.159, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=47677 epoch 031: 807 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=560035, ups=1.13, wpb=496121, bsz=16330.7, num_updates=51400, lr=0.000278964, gnorm=0.159, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=47677 epoch 031: 807 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=560035, ups=1.13, wpb=496121, bsz=16330.7, num_updates=51400, lr=0.000278964, gnorm=0.159, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=47677 epoch 031: 807 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=560035, ups=1.13, wpb=496121, bsz=16330.7, num_updates=51400, lr=0.000278964, gnorm=0.159, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=47677 epoch 031: 807 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=560035, ups=1.13, wpb=496121, bsz=16330.7, num_updates=51400, lr=0.000278964, gnorm=0.159, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=47677 epoch 031: 807 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=560035, ups=1.13, wpb=496121, bsz=16330.7, num_updates=51400, lr=0.000278964, gnorm=0.159, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=47677 epoch 031: 807 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=560035, ups=1.13, wpb=496121, bsz=16330.7, num_updates=51400, lr=0.000278964, gnorm=0.159, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=47677 epoch 031: 807 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=560035, ups=1.13, wpb=496121, bsz=16330.7, num_updates=51400, lr=0.000278964, gnorm=0.159, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=47677 epoch 031: 807 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=560035, ups=1.13, wpb=496121, bsz=16330.7, num_updates=51400, lr=0.000278964, gnorm=0.159, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=47677 epoch 031: 807 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=560035, ups=1.13, wpb=496121, bsz=16330.7, num_updates=51400, lr=0.000278964, gnorm=0.159, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=47677 epoch 031: 807 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=560035, ups=1.13, wpb=496121, bsz=16330.7, num_updates=51400, lr=0.000278964, gnorm=0.159, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=47677 epoch 031: 807 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=560035, ups=1.13, wpb=496121, bsz=16330.7, num_updates=51400, lr=0.000278964, gnorm=0.159, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=47677 epoch 031: 807 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=560035, ups=1.13, wpb=496121, bsz=16330.7, num_updates=51400, lr=0.000278964, gnorm=0.159, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=47677 epoch 031: 807 / 1689 loss=3.67, nll_loss=2.132, ppl=4.38, wps=560035, ups=1.13, wpb=496121, bsz=16330.7, num_updates=51400, lr=0.000278964, gnorm=0.159, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=47677 epoch 031: 907 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=555901, ups=1.12, wpb=495662, bsz=16421.4, num_updates=51500, lr=0.000278693, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=47767 epoch 031: 907 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=555901, ups=1.12, wpb=495662, bsz=16421.4, num_updates=51500, lr=0.000278693, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=47767 epoch 031: 907 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=555901, ups=1.12, wpb=495662, bsz=16421.4, num_updates=51500, lr=0.000278693, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=47767 epoch 031: 907 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=555901, ups=1.12, wpb=495662, bsz=16421.4, num_updates=51500, lr=0.000278693, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=47767 epoch 031: 907 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=555901, ups=1.12, wpb=495662, bsz=16421.4, num_updates=51500, lr=0.000278693, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=47767 epoch 031: 907 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=555901, ups=1.12, wpb=495662, bsz=16421.4, num_updates=51500, lr=0.000278693, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=47767 epoch 031: 907 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=555901, ups=1.12, wpb=495662, bsz=16421.4, num_updates=51500, lr=0.000278693, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=47767 epoch 031: 907 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=555901, ups=1.12, wpb=495662, bsz=16421.4, num_updates=51500, lr=0.000278693, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=47767 epoch 031: 907 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=555901, ups=1.12, wpb=495662, bsz=16421.4, num_updates=51500, lr=0.000278693, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=47767 epoch 031: 907 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=555901, ups=1.12, wpb=495662, bsz=16421.4, num_updates=51500, lr=0.000278693, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=47767 epoch 031: 907 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=555901, ups=1.12, wpb=495662, bsz=16421.4, num_updates=51500, lr=0.000278693, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=47767 epoch 031: 907 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=555901, ups=1.12, wpb=495662, bsz=16421.4, num_updates=51500, lr=0.000278693, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=47767 epoch 031: 907 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=555901, ups=1.12, wpb=495662, bsz=16421.4, num_updates=51500, lr=0.000278693, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=47767 epoch 031: 907 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=555901, ups=1.12, wpb=495662, bsz=16421.4, num_updates=51500, lr=0.000278693, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=47767 epoch 031: 907 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=555901, ups=1.12, wpb=495662, bsz=16421.4, num_updates=51500, lr=0.000278693, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=47767 epoch 031: 907 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=555901, ups=1.12, wpb=495662, bsz=16421.4, num_updates=51500, lr=0.000278693, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=47767 epoch 031: 907 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=555901, ups=1.12, wpb=495662, bsz=16421.4, num_updates=51500, lr=0.000278693, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=47767 epoch 031: 907 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=555901, ups=1.12, wpb=495662, bsz=16421.4, num_updates=51500, lr=0.000278693, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=47767 epoch 031: 907 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=555901, ups=1.12, wpb=495662, bsz=16421.4, num_updates=51500, lr=0.000278693, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=47767 epoch 031: 907 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=555901, ups=1.12, wpb=495662, bsz=16421.4, num_updates=51500, lr=0.000278693, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=47767 epoch 031: 907 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=555901, ups=1.12, wpb=495662, bsz=16421.4, num_updates=51500, lr=0.000278693, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=47767 epoch 031: 907 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=555901, ups=1.12, wpb=495662, bsz=16421.4, num_updates=51500, lr=0.000278693, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=47767 epoch 031: 907 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=555901, ups=1.12, wpb=495662, bsz=16421.4, num_updates=51500, lr=0.000278693, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=47767 epoch 031: 907 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=555901, ups=1.12, wpb=495662, bsz=16421.4, num_updates=51500, lr=0.000278693, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=47767 epoch 031: 907 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=555901, ups=1.12, wpb=495662, bsz=16421.4, num_updates=51500, lr=0.000278693, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=47767 epoch 031: 907 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=555901, ups=1.12, wpb=495662, bsz=16421.4, num_updates=51500, lr=0.000278693, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=47767 epoch 031: 907 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=555901, ups=1.12, wpb=495662, bsz=16421.4, num_updates=51500, lr=0.000278693, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=47767 epoch 031: 907 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=555901, ups=1.12, wpb=495662, bsz=16421.4, num_updates=51500, lr=0.000278693, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=47767 epoch 031: 907 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=555901, ups=1.12, wpb=495662, bsz=16421.4, num_updates=51500, lr=0.000278693, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=47767 epoch 031: 907 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=555901, ups=1.12, wpb=495662, bsz=16421.4, num_updates=51500, lr=0.000278693, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=47767 epoch 031: 907 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=555901, ups=1.12, wpb=495662, bsz=16421.4, num_updates=51500, lr=0.000278693, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.4, wall=47767 epoch 031: 1008 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=547696, ups=1.11, wpb=495284, bsz=16761.5, num_updates=51600, lr=0.000278423, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22.5, wall=47857 epoch 031: 1008 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=547696, ups=1.11, wpb=495284, bsz=16761.5, num_updates=51600, lr=0.000278423, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22.5, wall=47857 epoch 031: 1008 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=547696, ups=1.11, wpb=495284, bsz=16761.5, num_updates=51600, lr=0.000278423, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22.5, wall=47857 epoch 031: 1008 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=547696, ups=1.11, wpb=495284, bsz=16761.5, num_updates=51600, lr=0.000278423, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22.5, wall=47857 epoch 031: 1008 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=547696, ups=1.11, wpb=495284, bsz=16761.5, num_updates=51600, lr=0.000278423, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22.5, wall=47857 epoch 031: 1008 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=547696, ups=1.11, wpb=495284, bsz=16761.5, num_updates=51600, lr=0.000278423, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22.5, wall=47857 epoch 031: 1008 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=547696, ups=1.11, wpb=495284, bsz=16761.5, num_updates=51600, lr=0.000278423, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22.5, wall=47857 epoch 031: 1008 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=547696, ups=1.11, wpb=495284, bsz=16761.5, num_updates=51600, lr=0.000278423, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22.5, wall=47857 epoch 031: 1008 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=547696, ups=1.11, wpb=495284, bsz=16761.5, num_updates=51600, lr=0.000278423, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22.5, wall=47857 epoch 031: 1008 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=547696, ups=1.11, wpb=495284, bsz=16761.5, num_updates=51600, lr=0.000278423, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22.5, wall=47857 epoch 031: 1008 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=547696, ups=1.11, wpb=495284, bsz=16761.5, num_updates=51600, lr=0.000278423, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22.5, wall=47857 epoch 031: 1008 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=547696, ups=1.11, wpb=495284, bsz=16761.5, num_updates=51600, lr=0.000278423, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22.5, wall=47857 epoch 031: 1008 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=547696, ups=1.11, wpb=495284, bsz=16761.5, num_updates=51600, lr=0.000278423, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22.5, wall=47857 epoch 031: 1008 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=547696, ups=1.11, wpb=495284, bsz=16761.5, num_updates=51600, lr=0.000278423, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22.5, wall=47857 epoch 031: 1008 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=547696, ups=1.11, wpb=495284, bsz=16761.5, num_updates=51600, lr=0.000278423, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22.5, wall=47857 epoch 031: 1008 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=547696, ups=1.11, wpb=495284, bsz=16761.5, num_updates=51600, lr=0.000278423, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22.5, wall=47857 epoch 031: 1008 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=547696, ups=1.11, wpb=495284, bsz=16761.5, num_updates=51600, lr=0.000278423, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22.5, wall=47857 epoch 031: 1008 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=547696, ups=1.11, wpb=495284, bsz=16761.5, num_updates=51600, lr=0.000278423, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22.5, wall=47857 epoch 031: 1008 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=547696, ups=1.11, wpb=495284, bsz=16761.5, num_updates=51600, lr=0.000278423, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22.5, wall=47857 epoch 031: 1008 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=547696, ups=1.11, wpb=495284, bsz=16761.5, num_updates=51600, lr=0.000278423, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22.5, wall=47857 epoch 031: 1008 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=547696, ups=1.11, wpb=495284, bsz=16761.5, num_updates=51600, lr=0.000278423, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22.5, wall=47857 epoch 031: 1008 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=547696, ups=1.11, wpb=495284, bsz=16761.5, num_updates=51600, lr=0.000278423, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22.5, wall=47857 epoch 031: 1008 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=547696, ups=1.11, wpb=495284, bsz=16761.5, num_updates=51600, lr=0.000278423, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22.5, wall=47857 epoch 031: 1008 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=547696, ups=1.11, wpb=495284, bsz=16761.5, num_updates=51600, lr=0.000278423, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22.5, wall=47857 epoch 031: 1008 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=547696, ups=1.11, wpb=495284, bsz=16761.5, num_updates=51600, lr=0.000278423, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22.5, wall=47857 epoch 031: 1008 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=547696, ups=1.11, wpb=495284, bsz=16761.5, num_updates=51600, lr=0.000278423, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22.5, wall=47857 epoch 031: 1008 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=547696, ups=1.11, wpb=495284, bsz=16761.5, num_updates=51600, lr=0.000278423, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22.5, wall=47857 epoch 031: 1008 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=547696, ups=1.11, wpb=495284, bsz=16761.5, num_updates=51600, lr=0.000278423, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22.5, wall=47857 epoch 031: 1008 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=547696, ups=1.11, wpb=495284, bsz=16761.5, num_updates=51600, lr=0.000278423, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22.5, wall=47857 epoch 031: 1008 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=547696, ups=1.11, wpb=495284, bsz=16761.5, num_updates=51600, lr=0.000278423, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22.5, wall=47857 epoch 031: 1008 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=547696, ups=1.11, wpb=495284, bsz=16761.5, num_updates=51600, lr=0.000278423, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=22.5, wall=47857 epoch 031: 1108 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=557534, ups=1.12, wpb=496180, bsz=15956.7, num_updates=51700, lr=0.000278154, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=47946 epoch 031: 1108 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=557534, ups=1.12, wpb=496180, bsz=15956.7, num_updates=51700, lr=0.000278154, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=47946 epoch 031: 1108 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=557534, ups=1.12, wpb=496180, bsz=15956.7, num_updates=51700, lr=0.000278154, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=47946 epoch 031: 1108 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=557534, ups=1.12, wpb=496180, bsz=15956.7, num_updates=51700, lr=0.000278154, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=47946 epoch 031: 1108 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=557534, ups=1.12, wpb=496180, bsz=15956.7, num_updates=51700, lr=0.000278154, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=47946 epoch 031: 1108 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=557534, ups=1.12, wpb=496180, bsz=15956.7, num_updates=51700, lr=0.000278154, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=47946 epoch 031: 1108 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=557534, ups=1.12, wpb=496180, bsz=15956.7, num_updates=51700, lr=0.000278154, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=47946 epoch 031: 1108 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=557534, ups=1.12, wpb=496180, bsz=15956.7, num_updates=51700, lr=0.000278154, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=47946 epoch 031: 1108 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=557534, ups=1.12, wpb=496180, bsz=15956.7, num_updates=51700, lr=0.000278154, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=47946 epoch 031: 1108 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=557534, ups=1.12, wpb=496180, bsz=15956.7, num_updates=51700, lr=0.000278154, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=47946 epoch 031: 1108 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=557534, ups=1.12, wpb=496180, bsz=15956.7, num_updates=51700, lr=0.000278154, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=47946 epoch 031: 1108 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=557534, ups=1.12, wpb=496180, bsz=15956.7, num_updates=51700, lr=0.000278154, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=47946 epoch 031: 1108 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=557534, ups=1.12, wpb=496180, bsz=15956.7, num_updates=51700, lr=0.000278154, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=47946 epoch 031: 1108 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=557534, ups=1.12, wpb=496180, bsz=15956.7, num_updates=51700, lr=0.000278154, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=47946 epoch 031: 1108 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=557534, ups=1.12, wpb=496180, bsz=15956.7, num_updates=51700, lr=0.000278154, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=47946 epoch 031: 1108 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=557534, ups=1.12, wpb=496180, bsz=15956.7, num_updates=51700, lr=0.000278154, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=47946 epoch 031: 1108 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=557534, ups=1.12, wpb=496180, bsz=15956.7, num_updates=51700, lr=0.000278154, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=47946 epoch 031: 1108 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=557534, ups=1.12, wpb=496180, bsz=15956.7, num_updates=51700, lr=0.000278154, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=47946 epoch 031: 1108 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=557534, ups=1.12, wpb=496180, bsz=15956.7, num_updates=51700, lr=0.000278154, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=47946 epoch 031: 1108 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=557534, ups=1.12, wpb=496180, bsz=15956.7, num_updates=51700, lr=0.000278154, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=47946 epoch 031: 1108 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=557534, ups=1.12, wpb=496180, bsz=15956.7, num_updates=51700, lr=0.000278154, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=47946 epoch 031: 1108 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=557534, ups=1.12, wpb=496180, bsz=15956.7, num_updates=51700, lr=0.000278154, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=47946 epoch 031: 1108 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=557534, ups=1.12, wpb=496180, bsz=15956.7, num_updates=51700, lr=0.000278154, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=47946 epoch 031: 1108 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=557534, ups=1.12, wpb=496180, bsz=15956.7, num_updates=51700, lr=0.000278154, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=47946 epoch 031: 1108 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=557534, ups=1.12, wpb=496180, bsz=15956.7, num_updates=51700, lr=0.000278154, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=47946 epoch 031: 1108 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=557534, ups=1.12, wpb=496180, bsz=15956.7, num_updates=51700, lr=0.000278154, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=47946 epoch 031: 1108 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=557534, ups=1.12, wpb=496180, bsz=15956.7, num_updates=51700, lr=0.000278154, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=47946 epoch 031: 1108 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=557534, ups=1.12, wpb=496180, bsz=15956.7, num_updates=51700, lr=0.000278154, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=47946 epoch 031: 1108 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=557534, ups=1.12, wpb=496180, bsz=15956.7, num_updates=51700, lr=0.000278154, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=47946 epoch 031: 1108 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=557534, ups=1.12, wpb=496180, bsz=15956.7, num_updates=51700, lr=0.000278154, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=47946 epoch 031: 1108 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=557534, ups=1.12, wpb=496180, bsz=15956.7, num_updates=51700, lr=0.000278154, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=47946 epoch 031: 1208 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=552941, ups=1.12, wpb=493579, bsz=16719.8, num_updates=51800, lr=0.000277885, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=48035 epoch 031: 1208 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=552941, ups=1.12, wpb=493579, bsz=16719.8, num_updates=51800, lr=0.000277885, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=48035 epoch 031: 1208 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=552941, ups=1.12, wpb=493579, bsz=16719.8, num_updates=51800, lr=0.000277885, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=48035 epoch 031: 1208 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=552941, ups=1.12, wpb=493579, bsz=16719.8, num_updates=51800, lr=0.000277885, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=48035 epoch 031: 1208 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=552941, ups=1.12, wpb=493579, bsz=16719.8, num_updates=51800, lr=0.000277885, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=48035 epoch 031: 1208 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=552941, ups=1.12, wpb=493579, bsz=16719.8, num_updates=51800, lr=0.000277885, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=48035 epoch 031: 1208 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=552941, ups=1.12, wpb=493579, bsz=16719.8, num_updates=51800, lr=0.000277885, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=48035 epoch 031: 1208 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=552941, ups=1.12, wpb=493579, bsz=16719.8, num_updates=51800, lr=0.000277885, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=48035 epoch 031: 1208 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=552941, ups=1.12, wpb=493579, bsz=16719.8, num_updates=51800, lr=0.000277885, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=48035 epoch 031: 1208 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=552941, ups=1.12, wpb=493579, bsz=16719.8, num_updates=51800, lr=0.000277885, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=48035 epoch 031: 1208 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=552941, ups=1.12, wpb=493579, bsz=16719.8, num_updates=51800, lr=0.000277885, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=48035 epoch 031: 1208 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=552941, ups=1.12, wpb=493579, bsz=16719.8, num_updates=51800, lr=0.000277885, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=48035 epoch 031: 1208 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=552941, ups=1.12, wpb=493579, bsz=16719.8, num_updates=51800, lr=0.000277885, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=48035 epoch 031: 1208 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=552941, ups=1.12, wpb=493579, bsz=16719.8, num_updates=51800, lr=0.000277885, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=48035 epoch 031: 1208 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=552941, ups=1.12, wpb=493579, bsz=16719.8, num_updates=51800, lr=0.000277885, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=48035 epoch 031: 1208 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=552941, ups=1.12, wpb=493579, bsz=16719.8, num_updates=51800, lr=0.000277885, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=48035 epoch 031: 1208 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=552941, ups=1.12, wpb=493579, bsz=16719.8, num_updates=51800, lr=0.000277885, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=48035 epoch 031: 1208 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=552941, ups=1.12, wpb=493579, bsz=16719.8, num_updates=51800, lr=0.000277885, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=48035 epoch 031: 1208 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=552941, ups=1.12, wpb=493579, bsz=16719.8, num_updates=51800, lr=0.000277885, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=48035 epoch 031: 1208 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=552941, ups=1.12, wpb=493579, bsz=16719.8, num_updates=51800, lr=0.000277885, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=48035 epoch 031: 1208 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=552941, ups=1.12, wpb=493579, bsz=16719.8, num_updates=51800, lr=0.000277885, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=48035 epoch 031: 1208 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=552941, ups=1.12, wpb=493579, bsz=16719.8, num_updates=51800, lr=0.000277885, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=48035 epoch 031: 1208 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=552941, ups=1.12, wpb=493579, bsz=16719.8, num_updates=51800, lr=0.000277885, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=48035 epoch 031: 1208 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=552941, ups=1.12, wpb=493579, bsz=16719.8, num_updates=51800, lr=0.000277885, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=48035 epoch 031: 1208 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=552941, ups=1.12, wpb=493579, bsz=16719.8, num_updates=51800, lr=0.000277885, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=48035 epoch 031: 1208 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=552941, ups=1.12, wpb=493579, bsz=16719.8, num_updates=51800, lr=0.000277885, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=48035 epoch 031: 1208 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=552941, ups=1.12, wpb=493579, bsz=16719.8, num_updates=51800, lr=0.000277885, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=48035 epoch 031: 1208 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=552941, ups=1.12, wpb=493579, bsz=16719.8, num_updates=51800, lr=0.000277885, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=48035 epoch 031: 1208 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=552941, ups=1.12, wpb=493579, bsz=16719.8, num_updates=51800, lr=0.000277885, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=48035 epoch 031: 1208 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=552941, ups=1.12, wpb=493579, bsz=16719.8, num_updates=51800, lr=0.000277885, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=48035 epoch 031: 1208 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=552941, ups=1.12, wpb=493579, bsz=16719.8, num_updates=51800, lr=0.000277885, gnorm=0.165, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=48035 epoch 031: 1308 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=554914, ups=1.12, wpb=495287, bsz=16239, num_updates=51900, lr=0.000277617, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=48125 epoch 031: 1308 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=554914, ups=1.12, wpb=495287, bsz=16239, num_updates=51900, lr=0.000277617, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=48125 epoch 031: 1308 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=554914, ups=1.12, wpb=495287, bsz=16239, num_updates=51900, lr=0.000277617, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=48125 epoch 031: 1308 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=554914, ups=1.12, wpb=495287, bsz=16239, num_updates=51900, lr=0.000277617, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=48125 epoch 031: 1308 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=554914, ups=1.12, wpb=495287, bsz=16239, num_updates=51900, lr=0.000277617, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=48125 epoch 031: 1308 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=554914, ups=1.12, wpb=495287, bsz=16239, num_updates=51900, lr=0.000277617, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=48125 epoch 031: 1308 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=554914, ups=1.12, wpb=495287, bsz=16239, num_updates=51900, lr=0.000277617, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=48125 epoch 031: 1308 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=554914, ups=1.12, wpb=495287, bsz=16239, num_updates=51900, lr=0.000277617, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=48125 epoch 031: 1308 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=554914, ups=1.12, wpb=495287, bsz=16239, num_updates=51900, lr=0.000277617, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=48125 epoch 031: 1308 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=554914, ups=1.12, wpb=495287, bsz=16239, num_updates=51900, lr=0.000277617, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=48125 epoch 031: 1308 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=554914, ups=1.12, wpb=495287, bsz=16239, num_updates=51900, lr=0.000277617, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=48125 epoch 031: 1308 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=554914, ups=1.12, wpb=495287, bsz=16239, num_updates=51900, lr=0.000277617, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=48125 epoch 031: 1308 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=554914, ups=1.12, wpb=495287, bsz=16239, num_updates=51900, lr=0.000277617, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=48125 epoch 031: 1308 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=554914, ups=1.12, wpb=495287, bsz=16239, num_updates=51900, lr=0.000277617, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=48125 epoch 031: 1308 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=554914, ups=1.12, wpb=495287, bsz=16239, num_updates=51900, lr=0.000277617, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=48125 epoch 031: 1308 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=554914, ups=1.12, wpb=495287, bsz=16239, num_updates=51900, lr=0.000277617, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=48125 epoch 031: 1308 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=554914, ups=1.12, wpb=495287, bsz=16239, num_updates=51900, lr=0.000277617, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=48125 epoch 031: 1308 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=554914, ups=1.12, wpb=495287, bsz=16239, num_updates=51900, lr=0.000277617, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=48125 epoch 031: 1308 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=554914, ups=1.12, wpb=495287, bsz=16239, num_updates=51900, lr=0.000277617, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=48125 epoch 031: 1308 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=554914, ups=1.12, wpb=495287, bsz=16239, num_updates=51900, lr=0.000277617, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=48125 epoch 031: 1308 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=554914, ups=1.12, wpb=495287, bsz=16239, num_updates=51900, lr=0.000277617, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=48125 epoch 031: 1308 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=554914, ups=1.12, wpb=495287, bsz=16239, num_updates=51900, lr=0.000277617, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=48125 epoch 031: 1308 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=554914, ups=1.12, wpb=495287, bsz=16239, num_updates=51900, lr=0.000277617, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=48125 epoch 031: 1308 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=554914, ups=1.12, wpb=495287, bsz=16239, num_updates=51900, lr=0.000277617, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=48125 epoch 031: 1308 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=554914, ups=1.12, wpb=495287, bsz=16239, num_updates=51900, lr=0.000277617, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=48125 epoch 031: 1308 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=554914, ups=1.12, wpb=495287, bsz=16239, num_updates=51900, lr=0.000277617, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=48125 epoch 031: 1308 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=554914, ups=1.12, wpb=495287, bsz=16239, num_updates=51900, lr=0.000277617, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=48125 epoch 031: 1308 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=554914, ups=1.12, wpb=495287, bsz=16239, num_updates=51900, lr=0.000277617, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=48125 epoch 031: 1308 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=554914, ups=1.12, wpb=495287, bsz=16239, num_updates=51900, lr=0.000277617, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=48125 epoch 031: 1308 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=554914, ups=1.12, wpb=495287, bsz=16239, num_updates=51900, lr=0.000277617, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=48125 epoch 031: 1308 / 1689 loss=3.68, nll_loss=2.144, ppl=4.42, wps=554914, ups=1.12, wpb=495287, bsz=16239, num_updates=51900, lr=0.000277617, gnorm=0.171, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=48125 epoch 031: 1408 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=549834, ups=1.11, wpb=494806, bsz=16785.9, num_updates=52000, lr=0.00027735, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=48215 epoch 031: 1408 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=549834, ups=1.11, wpb=494806, bsz=16785.9, num_updates=52000, lr=0.00027735, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=48215 epoch 031: 1408 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=549834, ups=1.11, wpb=494806, bsz=16785.9, num_updates=52000, lr=0.00027735, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=48215 epoch 031: 1408 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=549834, ups=1.11, wpb=494806, bsz=16785.9, num_updates=52000, lr=0.00027735, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=48215 epoch 031: 1408 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=549834, ups=1.11, wpb=494806, bsz=16785.9, num_updates=52000, lr=0.00027735, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=48215 epoch 031: 1408 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=549834, ups=1.11, wpb=494806, bsz=16785.9, num_updates=52000, lr=0.00027735, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=48215 epoch 031: 1408 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=549834, ups=1.11, wpb=494806, bsz=16785.9, num_updates=52000, lr=0.00027735, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=48215 epoch 031: 1408 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=549834, ups=1.11, wpb=494806, bsz=16785.9, num_updates=52000, lr=0.00027735, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=48215 epoch 031: 1408 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=549834, ups=1.11, wpb=494806, bsz=16785.9, num_updates=52000, lr=0.00027735, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=48215 epoch 031: 1408 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=549834, ups=1.11, wpb=494806, bsz=16785.9, num_updates=52000, lr=0.00027735, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=48215 epoch 031: 1408 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=549834, ups=1.11, wpb=494806, bsz=16785.9, num_updates=52000, lr=0.00027735, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=48215 epoch 031: 1408 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=549834, ups=1.11, wpb=494806, bsz=16785.9, num_updates=52000, lr=0.00027735, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=48215 epoch 031: 1408 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=549834, ups=1.11, wpb=494806, bsz=16785.9, num_updates=52000, lr=0.00027735, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=48215 epoch 031: 1408 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=549834, ups=1.11, wpb=494806, bsz=16785.9, num_updates=52000, lr=0.00027735, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=48215 epoch 031: 1408 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=549834, ups=1.11, wpb=494806, bsz=16785.9, num_updates=52000, lr=0.00027735, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=48215 epoch 031: 1408 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=549834, ups=1.11, wpb=494806, bsz=16785.9, num_updates=52000, lr=0.00027735, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=48215 epoch 031: 1408 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=549834, ups=1.11, wpb=494806, bsz=16785.9, num_updates=52000, lr=0.00027735, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=48215 epoch 031: 1408 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=549834, ups=1.11, wpb=494806, bsz=16785.9, num_updates=52000, lr=0.00027735, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=48215 epoch 031: 1408 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=549834, ups=1.11, wpb=494806, bsz=16785.9, num_updates=52000, lr=0.00027735, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=48215 epoch 031: 1408 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=549834, ups=1.11, wpb=494806, bsz=16785.9, num_updates=52000, lr=0.00027735, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=48215 epoch 031: 1408 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=549834, ups=1.11, wpb=494806, bsz=16785.9, num_updates=52000, lr=0.00027735, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=48215 epoch 031: 1408 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=549834, ups=1.11, wpb=494806, bsz=16785.9, num_updates=52000, lr=0.00027735, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=48215 epoch 031: 1408 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=549834, ups=1.11, wpb=494806, bsz=16785.9, num_updates=52000, lr=0.00027735, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=48215 epoch 031: 1408 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=549834, ups=1.11, wpb=494806, bsz=16785.9, num_updates=52000, lr=0.00027735, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=48215 epoch 031: 1408 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=549834, ups=1.11, wpb=494806, bsz=16785.9, num_updates=52000, lr=0.00027735, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=48215 epoch 031: 1408 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=549834, ups=1.11, wpb=494806, bsz=16785.9, num_updates=52000, lr=0.00027735, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=48215 epoch 031: 1408 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=549834, ups=1.11, wpb=494806, bsz=16785.9, num_updates=52000, lr=0.00027735, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=48215 epoch 031: 1408 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=549834, ups=1.11, wpb=494806, bsz=16785.9, num_updates=52000, lr=0.00027735, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=48215 epoch 031: 1408 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=549834, ups=1.11, wpb=494806, bsz=16785.9, num_updates=52000, lr=0.00027735, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=48215 epoch 031: 1408 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=549834, ups=1.11, wpb=494806, bsz=16785.9, num_updates=52000, lr=0.00027735, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=48215 epoch 031: 1408 / 1689 loss=3.672, nll_loss=2.136, ppl=4.39, wps=549834, ups=1.11, wpb=494806, bsz=16785.9, num_updates=52000, lr=0.00027735, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=48215 begin validation on "valid" subset epoch 031 | valid on 'valid' subset | loss 3.705 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.702 epoch 031 | valid on 'valid' subset | loss 3.705 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.702 epoch 031 | valid on 'valid' subset | loss 3.705 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.702 epoch 031 | valid on 'valid' subset | loss 3.705 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.702 epoch 031 | valid on 'valid' subset | loss 3.705 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.702 epoch 031 | valid on 'valid' subset | loss 3.705 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.702 epoch 031 | valid on 'valid' subset | loss 3.705 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.702 epoch 031 | valid on 'valid' subset | loss 3.705 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.702 epoch 031 | valid on 'valid' subset | loss 3.705 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.702 epoch 031 | valid on 'valid' subset | loss 3.705 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.702 epoch 031 | valid on 'valid' subset | loss 3.705 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.702 epoch 031 | valid on 'valid' subset | loss 3.705 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.702 epoch 031 | valid on 'valid' subset | loss 3.705 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.702 epoch 031 | valid on 'valid' subset | loss 3.705 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.702 epoch 031 | valid on 'valid' subset | loss 3.705 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.702 epoch 031 | valid on 'valid' subset | loss 3.705 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.702 epoch 031 | valid on 'valid' subset | loss 3.705 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.702 epoch 031 | valid on 'valid' subset | loss 3.705 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.702 epoch 031 | valid on 'valid' subset | loss 3.705 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.702 epoch 031 | valid on 'valid' subset | loss 3.705 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.702 epoch 031 | valid on 'valid' subset | loss 3.705 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.702 epoch 031 | valid on 'valid' subset | loss 3.705 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.702 epoch 031 | valid on 'valid' subset | loss 3.705 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.702 epoch 031 | valid on 'valid' subset | loss 3.705 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.702 epoch 031 | valid on 'valid' subset | loss 3.705 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.702 epoch 031 | valid on 'valid' subset | loss 3.705 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.702 epoch 031 | valid on 'valid' subset | loss 3.705 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.702 epoch 031 | valid on 'valid' subset | loss 3.705 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.702 epoch 031 | valid on 'valid' subset | loss 3.705 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.702 epoch 031 | valid on 'valid' subset | loss 3.705 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.702 epoch 031 | valid on 'valid' subset | loss 3.705 | nll_loss 2.145 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 52000 | best_loss 3.702 epoch 031: 1508 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=485124, ups=0.98, wpb=496860, bsz=16920.3, num_updates=52100, lr=0.000277084, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=48317 epoch 031: 1508 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=485124, ups=0.98, wpb=496860, bsz=16920.3, num_updates=52100, lr=0.000277084, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=48317 epoch 031: 1508 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=485124, ups=0.98, wpb=496860, bsz=16920.3, num_updates=52100, lr=0.000277084, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=48317 epoch 031: 1508 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=485124, ups=0.98, wpb=496860, bsz=16920.3, num_updates=52100, lr=0.000277084, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=48317 epoch 031: 1508 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=485124, ups=0.98, wpb=496860, bsz=16920.3, num_updates=52100, lr=0.000277084, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=48317 epoch 031: 1508 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=485124, ups=0.98, wpb=496860, bsz=16920.3, num_updates=52100, lr=0.000277084, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=48317 epoch 031: 1508 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=485124, ups=0.98, wpb=496860, bsz=16920.3, num_updates=52100, lr=0.000277084, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=48317 epoch 031: 1508 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=485124, ups=0.98, wpb=496860, bsz=16920.3, num_updates=52100, lr=0.000277084, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=48317 epoch 031: 1508 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=485124, ups=0.98, wpb=496860, bsz=16920.3, num_updates=52100, lr=0.000277084, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=48317 epoch 031: 1508 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=485124, ups=0.98, wpb=496860, bsz=16920.3, num_updates=52100, lr=0.000277084, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=48317 epoch 031: 1508 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=485124, ups=0.98, wpb=496860, bsz=16920.3, num_updates=52100, lr=0.000277084, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=48317 epoch 031: 1508 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=485124, ups=0.98, wpb=496860, bsz=16920.3, num_updates=52100, lr=0.000277084, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=48317 epoch 031: 1508 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=485124, ups=0.98, wpb=496860, bsz=16920.3, num_updates=52100, lr=0.000277084, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=48317 epoch 031: 1508 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=485124, ups=0.98, wpb=496860, bsz=16920.3, num_updates=52100, lr=0.000277084, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=48317 epoch 031: 1508 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=485124, ups=0.98, wpb=496860, bsz=16920.3, num_updates=52100, lr=0.000277084, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=48317 epoch 031: 1508 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=485124, ups=0.98, wpb=496860, bsz=16920.3, num_updates=52100, lr=0.000277084, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=48317 epoch 031: 1508 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=485124, ups=0.98, wpb=496860, bsz=16920.3, num_updates=52100, lr=0.000277084, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=48317 epoch 031: 1508 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=485124, ups=0.98, wpb=496860, bsz=16920.3, num_updates=52100, lr=0.000277084, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=48317 epoch 031: 1508 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=485124, ups=0.98, wpb=496860, bsz=16920.3, num_updates=52100, lr=0.000277084, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=48317 epoch 031: 1508 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=485124, ups=0.98, wpb=496860, bsz=16920.3, num_updates=52100, lr=0.000277084, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=48317 epoch 031: 1508 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=485124, ups=0.98, wpb=496860, bsz=16920.3, num_updates=52100, lr=0.000277084, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=48317 epoch 031: 1508 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=485124, ups=0.98, wpb=496860, bsz=16920.3, num_updates=52100, lr=0.000277084, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=48317 epoch 031: 1508 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=485124, ups=0.98, wpb=496860, bsz=16920.3, num_updates=52100, lr=0.000277084, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=48317 epoch 031: 1508 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=485124, ups=0.98, wpb=496860, bsz=16920.3, num_updates=52100, lr=0.000277084, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=48317 epoch 031: 1508 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=485124, ups=0.98, wpb=496860, bsz=16920.3, num_updates=52100, lr=0.000277084, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=48317 epoch 031: 1508 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=485124, ups=0.98, wpb=496860, bsz=16920.3, num_updates=52100, lr=0.000277084, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=48317 epoch 031: 1508 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=485124, ups=0.98, wpb=496860, bsz=16920.3, num_updates=52100, lr=0.000277084, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=48317 epoch 031: 1508 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=485124, ups=0.98, wpb=496860, bsz=16920.3, num_updates=52100, lr=0.000277084, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=48317 epoch 031: 1508 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=485124, ups=0.98, wpb=496860, bsz=16920.3, num_updates=52100, lr=0.000277084, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=48317 epoch 031: 1508 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=485124, ups=0.98, wpb=496860, bsz=16920.3, num_updates=52100, lr=0.000277084, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=48317 epoch 031: 1508 / 1689 loss=3.677, nll_loss=2.141, ppl=4.41, wps=485124, ups=0.98, wpb=496860, bsz=16920.3, num_updates=52100, lr=0.000277084, gnorm=0.171, clip=0, loss_scale=4, train_wall=89, gb_free=21.8, wall=48317 epoch 031: 1608 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=552405, ups=1.12, wpb=495236, bsz=16416.7, num_updates=52200, lr=0.000276818, gnorm=0.158, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=48407 epoch 031: 1608 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=552405, ups=1.12, wpb=495236, bsz=16416.7, num_updates=52200, lr=0.000276818, gnorm=0.158, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=48407 epoch 031: 1608 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=552405, ups=1.12, wpb=495236, bsz=16416.7, num_updates=52200, lr=0.000276818, gnorm=0.158, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=48407 epoch 031: 1608 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=552405, ups=1.12, wpb=495236, bsz=16416.7, num_updates=52200, lr=0.000276818, gnorm=0.158, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=48407 epoch 031: 1608 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=552405, ups=1.12, wpb=495236, bsz=16416.7, num_updates=52200, lr=0.000276818, gnorm=0.158, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=48407 epoch 031: 1608 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=552405, ups=1.12, wpb=495236, bsz=16416.7, num_updates=52200, lr=0.000276818, gnorm=0.158, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=48407 epoch 031: 1608 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=552405, ups=1.12, wpb=495236, bsz=16416.7, num_updates=52200, lr=0.000276818, gnorm=0.158, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=48407 epoch 031: 1608 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=552405, ups=1.12, wpb=495236, bsz=16416.7, num_updates=52200, lr=0.000276818, gnorm=0.158, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=48407 epoch 031: 1608 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=552405, ups=1.12, wpb=495236, bsz=16416.7, num_updates=52200, lr=0.000276818, gnorm=0.158, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=48407 epoch 031: 1608 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=552405, ups=1.12, wpb=495236, bsz=16416.7, num_updates=52200, lr=0.000276818, gnorm=0.158, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=48407 epoch 031: 1608 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=552405, ups=1.12, wpb=495236, bsz=16416.7, num_updates=52200, lr=0.000276818, gnorm=0.158, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=48407 epoch 031: 1608 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=552405, ups=1.12, wpb=495236, bsz=16416.7, num_updates=52200, lr=0.000276818, gnorm=0.158, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=48407 epoch 031: 1608 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=552405, ups=1.12, wpb=495236, bsz=16416.7, num_updates=52200, lr=0.000276818, gnorm=0.158, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=48407 epoch 031: 1608 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=552405, ups=1.12, wpb=495236, bsz=16416.7, num_updates=52200, lr=0.000276818, gnorm=0.158, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=48407 epoch 031: 1608 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=552405, ups=1.12, wpb=495236, bsz=16416.7, num_updates=52200, lr=0.000276818, gnorm=0.158, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=48407 epoch 031: 1608 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=552405, ups=1.12, wpb=495236, bsz=16416.7, num_updates=52200, lr=0.000276818, gnorm=0.158, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=48407 epoch 031: 1608 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=552405, ups=1.12, wpb=495236, bsz=16416.7, num_updates=52200, lr=0.000276818, gnorm=0.158, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=48407 epoch 031: 1608 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=552405, ups=1.12, wpb=495236, bsz=16416.7, num_updates=52200, lr=0.000276818, gnorm=0.158, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=48407 epoch 031: 1608 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=552405, ups=1.12, wpb=495236, bsz=16416.7, num_updates=52200, lr=0.000276818, gnorm=0.158, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=48407 epoch 031: 1608 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=552405, ups=1.12, wpb=495236, bsz=16416.7, num_updates=52200, lr=0.000276818, gnorm=0.158, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=48407 epoch 031: 1608 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=552405, ups=1.12, wpb=495236, bsz=16416.7, num_updates=52200, lr=0.000276818, gnorm=0.158, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=48407 epoch 031: 1608 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=552405, ups=1.12, wpb=495236, bsz=16416.7, num_updates=52200, lr=0.000276818, gnorm=0.158, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=48407 epoch 031: 1608 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=552405, ups=1.12, wpb=495236, bsz=16416.7, num_updates=52200, lr=0.000276818, gnorm=0.158, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=48407 epoch 031: 1608 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=552405, ups=1.12, wpb=495236, bsz=16416.7, num_updates=52200, lr=0.000276818, gnorm=0.158, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=48407 epoch 031: 1608 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=552405, ups=1.12, wpb=495236, bsz=16416.7, num_updates=52200, lr=0.000276818, gnorm=0.158, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=48407 epoch 031: 1608 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=552405, ups=1.12, wpb=495236, bsz=16416.7, num_updates=52200, lr=0.000276818, gnorm=0.158, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=48407 epoch 031: 1608 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=552405, ups=1.12, wpb=495236, bsz=16416.7, num_updates=52200, lr=0.000276818, gnorm=0.158, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=48407 epoch 031: 1608 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=552405, ups=1.12, wpb=495236, bsz=16416.7, num_updates=52200, lr=0.000276818, gnorm=0.158, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=48407 epoch 031: 1608 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=552405, ups=1.12, wpb=495236, bsz=16416.7, num_updates=52200, lr=0.000276818, gnorm=0.158, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=48407 epoch 031: 1608 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=552405, ups=1.12, wpb=495236, bsz=16416.7, num_updates=52200, lr=0.000276818, gnorm=0.158, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=48407 epoch 031: 1608 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=552405, ups=1.12, wpb=495236, bsz=16416.7, num_updates=52200, lr=0.000276818, gnorm=0.158, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=48407 end of epoch 31 (average epoch stats below) epoch 031 | loss 3.67 | nll_loss 2.133 | ppl 4.39 | wps 542867 | ups 1.1 | wpb 495112 | bsz 16504.6 | num_updates 52281 | lr 0.000276604 | gnorm 0.166 | clip 0 | loss_scale 4 | train_wall 1490 | gb_free 22.7 | wall 48479 epoch 031 | loss 3.67 | nll_loss 2.133 | ppl 4.39 | wps 542867 | ups 1.1 | wpb 495112 | bsz 16504.6 | num_updates 52281 | lr 0.000276604 | gnorm 0.166 | clip 0 | loss_scale 4 | train_wall 1490 | gb_free 22.7 | wall 48479 epoch 031 | loss 3.67 | nll_loss 2.133 | ppl 4.39 | wps 542867 | ups 1.1 | wpb 495112 | bsz 16504.6 | num_updates 52281 | lr 0.000276604 | gnorm 0.166 | clip 0 | loss_scale 4 | train_wall 1490 | gb_free 22.7 | wall 48479 epoch 031 | loss 3.67 | nll_loss 2.133 | ppl 4.39 | wps 542867 | ups 1.1 | wpb 495112 | bsz 16504.6 | num_updates 52281 | lr 0.000276604 | gnorm 0.166 | clip 0 | loss_scale 4 | train_wall 1490 | gb_free 22.7 | wall 48479 epoch 031 | loss 3.67 | nll_loss 2.133 | ppl 4.39 | wps 542867 | ups 1.1 | wpb 495112 | bsz 16504.6 | num_updates 52281 | lr 0.000276604 | gnorm 0.166 | clip 0 | loss_scale 4 | train_wall 1490 | gb_free 22.7 | wall 48479 epoch 031 | loss 3.67 | nll_loss 2.133 | ppl 4.39 | wps 542867 | ups 1.1 | wpb 495112 | bsz 16504.6 | num_updates 52281 | lr 0.000276604 | gnorm 0.166 | clip 0 | loss_scale 4 | train_wall 1490 | gb_free 22.7 | wall 48479 epoch 031 | loss 3.67 | nll_loss 2.133 | ppl 4.39 | wps 542867 | ups 1.1 | wpb 495112 | bsz 16504.6 | num_updates 52281 | lr 0.000276604 | gnorm 0.166 | clip 0 | loss_scale 4 | train_wall 1490 | gb_free 22.7 | wall 48479 epoch 031 | loss 3.67 | nll_loss 2.133 | ppl 4.39 | wps 542867 | ups 1.1 | wpb 495112 | bsz 16504.6 | num_updates 52281 | lr 0.000276604 | gnorm 0.166 | clip 0 | loss_scale 4 | train_wall 1490 | gb_free 22.7 | wall 48479 epoch 031 | loss 3.67 | nll_loss 2.133 | ppl 4.39 | wps 542867 | ups 1.1 | wpb 495112 | bsz 16504.6 | num_updates 52281 | lr 0.000276604 | gnorm 0.166 | clip 0 | loss_scale 4 | train_wall 1490 | gb_free 22.7 | wall 48479 epoch 031 | loss 3.67 | nll_loss 2.133 | ppl 4.39 | wps 542867 | ups 1.1 | wpb 495112 | bsz 16504.6 | num_updates 52281 | lr 0.000276604 | gnorm 0.166 | clip 0 | loss_scale 4 | train_wall 1490 | gb_free 22.7 | wall 48479 epoch 031 | loss 3.67 | nll_loss 2.133 | ppl 4.39 | wps 542867 | ups 1.1 | wpb 495112 | bsz 16504.6 | num_updates 52281 | lr 0.000276604 | gnorm 0.166 | clip 0 | loss_scale 4 | train_wall 1490 | gb_free 22.7 | wall 48479 epoch 031 | loss 3.67 | nll_loss 2.133 | ppl 4.39 | wps 542867 | ups 1.1 | wpb 495112 | bsz 16504.6 | num_updates 52281 | lr 0.000276604 | gnorm 0.166 | clip 0 | loss_scale 4 | train_wall 1490 | gb_free 22.7 | wall 48479 epoch 031 | loss 3.67 | nll_loss 2.133 | ppl 4.39 | wps 542867 | ups 1.1 | wpb 495112 | bsz 16504.6 | num_updates 52281 | lr 0.000276604 | gnorm 0.166 | clip 0 | loss_scale 4 | train_wall 1490 | gb_free 22.7 | wall 48479 epoch 031 | loss 3.67 | nll_loss 2.133 | ppl 4.39 | wps 542867 | ups 1.1 | wpb 495112 | bsz 16504.6 | num_updates 52281 | lr 0.000276604 | gnorm 0.166 | clip 0 | loss_scale 4 | train_wall 1490 | gb_free 22.7 | wall 48479 epoch 031 | loss 3.67 | nll_loss 2.133 | ppl 4.39 | wps 542867 | ups 1.1 | wpb 495112 | bsz 16504.6 | num_updates 52281 | lr 0.000276604 | gnorm 0.166 | clip 0 | loss_scale 4 | train_wall 1490 | gb_free 22.7 | wall 48479 epoch 031 | loss 3.67 | nll_loss 2.133 | ppl 4.39 | wps 542867 | ups 1.1 | wpb 495112 | bsz 16504.6 | num_updates 52281 | lr 0.000276604 | gnorm 0.166 | clip 0 | loss_scale 4 | train_wall 1490 | gb_free 22.7 | wall 48479 epoch 031 | loss 3.67 | nll_loss 2.133 | ppl 4.39 | wps 542867 | ups 1.1 | wpb 495112 | bsz 16504.6 | num_updates 52281 | lr 0.000276604 | gnorm 0.166 | clip 0 | loss_scale 4 | train_wall 1490 | gb_free 22.7 | wall 48479 epoch 031 | loss 3.67 | nll_loss 2.133 | ppl 4.39 | wps 542867 | ups 1.1 | wpb 495112 | bsz 16504.6 | num_updates 52281 | lr 0.000276604 | gnorm 0.166 | clip 0 | loss_scale 4 | train_wall 1490 | gb_free 22.7 | wall 48479 epoch 031 | loss 3.67 | nll_loss 2.133 | ppl 4.39 | wps 542867 | ups 1.1 | wpb 495112 | bsz 16504.6 | num_updates 52281 | lr 0.000276604 | gnorm 0.166 | clip 0 | loss_scale 4 | train_wall 1490 | gb_free 22.7 | wall 48479 epoch 031 | loss 3.67 | nll_loss 2.133 | ppl 4.39 | wps 542867 | ups 1.1 | wpb 495112 | bsz 16504.6 | num_updates 52281 | lr 0.000276604 | gnorm 0.166 | clip 0 | loss_scale 4 | train_wall 1490 | gb_free 22.7 | wall 48479 epoch 031 | loss 3.67 | nll_loss 2.133 | ppl 4.39 | wps 542867 | ups 1.1 | wpb 495112 | bsz 16504.6 | num_updates 52281 | lr 0.000276604 | gnorm 0.166 | clip 0 | loss_scale 4 | train_wall 1490 | gb_free 22.7 | wall 48479 epoch 031 | loss 3.67 | nll_loss 2.133 | ppl 4.39 | wps 542867 | ups 1.1 | wpb 495112 | bsz 16504.6 | num_updates 52281 | lr 0.000276604 | gnorm 0.166 | clip 0 | loss_scale 4 | train_wall 1490 | gb_free 22.7 | wall 48479 epoch 031 | loss 3.67 | nll_loss 2.133 | ppl 4.39 | wps 542867 | ups 1.1 | wpb 495112 | bsz 16504.6 | num_updates 52281 | lr 0.000276604 | gnorm 0.166 | clip 0 | loss_scale 4 | train_wall 1490 | gb_free 22.7 | wall 48479 epoch 031 | loss 3.67 | nll_loss 2.133 | ppl 4.39 | wps 542867 | ups 1.1 | wpb 495112 | bsz 16504.6 | num_updates 52281 | lr 0.000276604 | gnorm 0.166 | clip 0 | loss_scale 4 | train_wall 1490 | gb_free 22.7 | wall 48479 epoch 031 | loss 3.67 | nll_loss 2.133 | ppl 4.39 | wps 542867 | ups 1.1 | wpb 495112 | bsz 16504.6 | num_updates 52281 | lr 0.000276604 | gnorm 0.166 | clip 0 | loss_scale 4 | train_wall 1490 | gb_free 22.7 | wall 48479 epoch 031 | loss 3.67 | nll_loss 2.133 | ppl 4.39 | wps 542867 | ups 1.1 | wpb 495112 | bsz 16504.6 | num_updates 52281 | lr 0.000276604 | gnorm 0.166 | clip 0 | loss_scale 4 | train_wall 1490 | gb_free 22.7 | wall 48479 epoch 031 | loss 3.67 | nll_loss 2.133 | ppl 4.39 | wps 542867 | ups 1.1 | wpb 495112 | bsz 16504.6 | num_updates 52281 | lr 0.000276604 | gnorm 0.166 | clip 0 | loss_scale 4 | train_wall 1490 | gb_free 22.7 | wall 48479 epoch 031 | loss 3.67 | nll_loss 2.133 | ppl 4.39 | wps 542867 | ups 1.1 | wpb 495112 | bsz 16504.6 | num_updates 52281 | lr 0.000276604 | gnorm 0.166 | clip 0 | loss_scale 4 | train_wall 1490 | gb_free 22.7 | wall 48479 epoch 031 | loss 3.67 | nll_loss 2.133 | ppl 4.39 | wps 542867 | ups 1.1 | wpb 495112 | bsz 16504.6 | num_updates 52281 | lr 0.000276604 | gnorm 0.166 | clip 0 | loss_scale 4 | train_wall 1490 | gb_free 22.7 | wall 48479 epoch 031 | loss 3.67 | nll_loss 2.133 | ppl 4.39 | wps 542867 | ups 1.1 | wpb 495112 | bsz 16504.6 | num_updates 52281 | lr 0.000276604 | gnorm 0.166 | clip 0 | loss_scale 4 | train_wall 1490 | gb_free 22.7 | wall 48479 epoch 031 | loss 3.67 | nll_loss 2.133 | ppl 4.39 | wps 542867 | ups 1.1 | wpb 495112 | bsz 16504.6 | num_updates 52281 | lr 0.000276604 | gnorm 0.166 | clip 0 | loss_scale 4 | train_wall 1490 | gb_free 22.7 | wall 48479 Start iterating over samples epoch 032: 19 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=547043, ups=1.11, wpb=493177, bsz=16478.7, num_updates=52300, lr=0.000276553, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=48497 epoch 032: 19 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=547043, ups=1.11, wpb=493177, bsz=16478.7, num_updates=52300, lr=0.000276553, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=48497 epoch 032: 19 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=547043, ups=1.11, wpb=493177, bsz=16478.7, num_updates=52300, lr=0.000276553, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=48497 epoch 032: 19 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=547043, ups=1.11, wpb=493177, bsz=16478.7, num_updates=52300, lr=0.000276553, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=48497 epoch 032: 19 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=547043, ups=1.11, wpb=493177, bsz=16478.7, num_updates=52300, lr=0.000276553, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=48497 epoch 032: 19 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=547043, ups=1.11, wpb=493177, bsz=16478.7, num_updates=52300, lr=0.000276553, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=48497 epoch 032: 19 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=547043, ups=1.11, wpb=493177, bsz=16478.7, num_updates=52300, lr=0.000276553, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=48497 epoch 032: 19 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=547043, ups=1.11, wpb=493177, bsz=16478.7, num_updates=52300, lr=0.000276553, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=48497 epoch 032: 19 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=547043, ups=1.11, wpb=493177, bsz=16478.7, num_updates=52300, lr=0.000276553, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=48497 epoch 032: 19 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=547043, ups=1.11, wpb=493177, bsz=16478.7, num_updates=52300, lr=0.000276553, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=48497 epoch 032: 19 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=547043, ups=1.11, wpb=493177, bsz=16478.7, num_updates=52300, lr=0.000276553, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=48497 epoch 032: 19 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=547043, ups=1.11, wpb=493177, bsz=16478.7, num_updates=52300, lr=0.000276553, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=48497 epoch 032: 19 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=547043, ups=1.11, wpb=493177, bsz=16478.7, num_updates=52300, lr=0.000276553, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=48497 epoch 032: 19 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=547043, ups=1.11, wpb=493177, bsz=16478.7, num_updates=52300, lr=0.000276553, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=48497 epoch 032: 19 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=547043, ups=1.11, wpb=493177, bsz=16478.7, num_updates=52300, lr=0.000276553, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=48497 epoch 032: 19 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=547043, ups=1.11, wpb=493177, bsz=16478.7, num_updates=52300, lr=0.000276553, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=48497 epoch 032: 19 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=547043, ups=1.11, wpb=493177, bsz=16478.7, num_updates=52300, lr=0.000276553, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=48497 epoch 032: 19 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=547043, ups=1.11, wpb=493177, bsz=16478.7, num_updates=52300, lr=0.000276553, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=48497 epoch 032: 19 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=547043, ups=1.11, wpb=493177, bsz=16478.7, num_updates=52300, lr=0.000276553, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=48497 epoch 032: 19 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=547043, ups=1.11, wpb=493177, bsz=16478.7, num_updates=52300, lr=0.000276553, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=48497 epoch 032: 19 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=547043, ups=1.11, wpb=493177, bsz=16478.7, num_updates=52300, lr=0.000276553, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=48497 epoch 032: 19 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=547043, ups=1.11, wpb=493177, bsz=16478.7, num_updates=52300, lr=0.000276553, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=48497 epoch 032: 19 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=547043, ups=1.11, wpb=493177, bsz=16478.7, num_updates=52300, lr=0.000276553, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=48497 epoch 032: 19 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=547043, ups=1.11, wpb=493177, bsz=16478.7, num_updates=52300, lr=0.000276553, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=48497 epoch 032: 19 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=547043, ups=1.11, wpb=493177, bsz=16478.7, num_updates=52300, lr=0.000276553, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=48497 epoch 032: 19 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=547043, ups=1.11, wpb=493177, bsz=16478.7, num_updates=52300, lr=0.000276553, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=48497 epoch 032: 19 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=547043, ups=1.11, wpb=493177, bsz=16478.7, num_updates=52300, lr=0.000276553, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=48497 epoch 032: 19 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=547043, ups=1.11, wpb=493177, bsz=16478.7, num_updates=52300, lr=0.000276553, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=48497 epoch 032: 19 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=547043, ups=1.11, wpb=493177, bsz=16478.7, num_updates=52300, lr=0.000276553, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=48497 epoch 032: 19 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=547043, ups=1.11, wpb=493177, bsz=16478.7, num_updates=52300, lr=0.000276553, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=48497 epoch 032: 19 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=547043, ups=1.11, wpb=493177, bsz=16478.7, num_updates=52300, lr=0.000276553, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=48497 epoch 032: 19 / 1689 loss=3.667, nll_loss=2.129, ppl=4.38, wps=547043, ups=1.11, wpb=493177, bsz=16478.7, num_updates=52300, lr=0.000276553, gnorm=0.169, clip=0, loss_scale=4, train_wall=88, gb_free=21.9, wall=48497 epoch 032: 121 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=534033, ups=1.08, wpb=494795, bsz=16780.2, num_updates=52400, lr=0.000276289, gnorm=0.168, clip=0, loss_scale=1, train_wall=91, gb_free=21.1, wall=48589 epoch 032: 121 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=534033, ups=1.08, wpb=494795, bsz=16780.2, num_updates=52400, lr=0.000276289, gnorm=0.168, clip=0, loss_scale=1, train_wall=91, gb_free=21.1, wall=48589 epoch 032: 121 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=534033, ups=1.08, wpb=494795, bsz=16780.2, num_updates=52400, lr=0.000276289, gnorm=0.168, clip=0, loss_scale=1, train_wall=91, gb_free=21.1, wall=48589 epoch 032: 121 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=534033, ups=1.08, wpb=494795, bsz=16780.2, num_updates=52400, lr=0.000276289, gnorm=0.168, clip=0, loss_scale=1, train_wall=91, gb_free=21.1, wall=48589 epoch 032: 121 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=534033, ups=1.08, wpb=494795, bsz=16780.2, num_updates=52400, lr=0.000276289, gnorm=0.168, clip=0, loss_scale=1, train_wall=91, gb_free=21.1, wall=48589 epoch 032: 121 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=534033, ups=1.08, wpb=494795, bsz=16780.2, num_updates=52400, lr=0.000276289, gnorm=0.168, clip=0, loss_scale=1, train_wall=91, gb_free=21.1, wall=48589 epoch 032: 121 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=534033, ups=1.08, wpb=494795, bsz=16780.2, num_updates=52400, lr=0.000276289, gnorm=0.168, clip=0, loss_scale=1, train_wall=91, gb_free=21.1, wall=48589 epoch 032: 121 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=534033, ups=1.08, wpb=494795, bsz=16780.2, num_updates=52400, lr=0.000276289, gnorm=0.168, clip=0, loss_scale=1, train_wall=91, gb_free=21.1, wall=48589 epoch 032: 121 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=534033, ups=1.08, wpb=494795, bsz=16780.2, num_updates=52400, lr=0.000276289, gnorm=0.168, clip=0, loss_scale=1, train_wall=91, gb_free=21.1, wall=48589 epoch 032: 121 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=534033, ups=1.08, wpb=494795, bsz=16780.2, num_updates=52400, lr=0.000276289, gnorm=0.168, clip=0, loss_scale=1, train_wall=91, gb_free=21.1, wall=48589 epoch 032: 121 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=534033, ups=1.08, wpb=494795, bsz=16780.2, num_updates=52400, lr=0.000276289, gnorm=0.168, clip=0, loss_scale=1, train_wall=91, gb_free=21.1, wall=48589 epoch 032: 121 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=534033, ups=1.08, wpb=494795, bsz=16780.2, num_updates=52400, lr=0.000276289, gnorm=0.168, clip=0, loss_scale=1, train_wall=91, gb_free=21.1, wall=48589 epoch 032: 121 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=534033, ups=1.08, wpb=494795, bsz=16780.2, num_updates=52400, lr=0.000276289, gnorm=0.168, clip=0, loss_scale=1, train_wall=91, gb_free=21.1, wall=48589 epoch 032: 121 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=534033, ups=1.08, wpb=494795, bsz=16780.2, num_updates=52400, lr=0.000276289, gnorm=0.168, clip=0, loss_scale=1, train_wall=91, gb_free=21.1, wall=48589 epoch 032: 121 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=534033, ups=1.08, wpb=494795, bsz=16780.2, num_updates=52400, lr=0.000276289, gnorm=0.168, clip=0, loss_scale=1, train_wall=91, gb_free=21.1, wall=48589 epoch 032: 121 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=534033, ups=1.08, wpb=494795, bsz=16780.2, num_updates=52400, lr=0.000276289, gnorm=0.168, clip=0, loss_scale=1, train_wall=91, gb_free=21.1, wall=48589 epoch 032: 121 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=534033, ups=1.08, wpb=494795, bsz=16780.2, num_updates=52400, lr=0.000276289, gnorm=0.168, clip=0, loss_scale=1, train_wall=91, gb_free=21.1, wall=48589 epoch 032: 121 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=534033, ups=1.08, wpb=494795, bsz=16780.2, num_updates=52400, lr=0.000276289, gnorm=0.168, clip=0, loss_scale=1, train_wall=91, gb_free=21.1, wall=48589 epoch 032: 121 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=534033, ups=1.08, wpb=494795, bsz=16780.2, num_updates=52400, lr=0.000276289, gnorm=0.168, clip=0, loss_scale=1, train_wall=91, gb_free=21.1, wall=48589 epoch 032: 121 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=534033, ups=1.08, wpb=494795, bsz=16780.2, num_updates=52400, lr=0.000276289, gnorm=0.168, clip=0, loss_scale=1, train_wall=91, gb_free=21.1, wall=48589 epoch 032: 121 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=534033, ups=1.08, wpb=494795, bsz=16780.2, num_updates=52400, lr=0.000276289, gnorm=0.168, clip=0, loss_scale=1, train_wall=91, gb_free=21.1, wall=48589 epoch 032: 121 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=534033, ups=1.08, wpb=494795, bsz=16780.2, num_updates=52400, lr=0.000276289, gnorm=0.168, clip=0, loss_scale=1, train_wall=91, gb_free=21.1, wall=48589 epoch 032: 121 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=534033, ups=1.08, wpb=494795, bsz=16780.2, num_updates=52400, lr=0.000276289, gnorm=0.168, clip=0, loss_scale=1, train_wall=91, gb_free=21.1, wall=48589 epoch 032: 121 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=534033, ups=1.08, wpb=494795, bsz=16780.2, num_updates=52400, lr=0.000276289, gnorm=0.168, clip=0, loss_scale=1, train_wall=91, gb_free=21.1, wall=48589 epoch 032: 121 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=534033, ups=1.08, wpb=494795, bsz=16780.2, num_updates=52400, lr=0.000276289, gnorm=0.168, clip=0, loss_scale=1, train_wall=91, gb_free=21.1, wall=48589 epoch 032: 121 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=534033, ups=1.08, wpb=494795, bsz=16780.2, num_updates=52400, lr=0.000276289, gnorm=0.168, clip=0, loss_scale=1, train_wall=91, gb_free=21.1, wall=48589 epoch 032: 121 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=534033, ups=1.08, wpb=494795, bsz=16780.2, num_updates=52400, lr=0.000276289, gnorm=0.168, clip=0, loss_scale=1, train_wall=91, gb_free=21.1, wall=48589 epoch 032: 121 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=534033, ups=1.08, wpb=494795, bsz=16780.2, num_updates=52400, lr=0.000276289, gnorm=0.168, clip=0, loss_scale=1, train_wall=91, gb_free=21.1, wall=48589 epoch 032: 121 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=534033, ups=1.08, wpb=494795, bsz=16780.2, num_updates=52400, lr=0.000276289, gnorm=0.168, clip=0, loss_scale=1, train_wall=91, gb_free=21.1, wall=48589 epoch 032: 121 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=534033, ups=1.08, wpb=494795, bsz=16780.2, num_updates=52400, lr=0.000276289, gnorm=0.168, clip=0, loss_scale=1, train_wall=91, gb_free=21.1, wall=48589 epoch 032: 121 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=534033, ups=1.08, wpb=494795, bsz=16780.2, num_updates=52400, lr=0.000276289, gnorm=0.168, clip=0, loss_scale=1, train_wall=91, gb_free=21.1, wall=48589 epoch 032: 121 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=534033, ups=1.08, wpb=494795, bsz=16780.2, num_updates=52400, lr=0.000276289, gnorm=0.168, clip=0, loss_scale=1, train_wall=91, gb_free=21.1, wall=48589 epoch 032: 221 / 1689 loss=3.66, nll_loss=2.121, ppl=4.35, wps=542135, ups=1.1, wpb=494316, bsz=16480.2, num_updates=52500, lr=0.000276026, gnorm=0.153, clip=0, loss_scale=1, train_wall=90, gb_free=22.4, wall=48681 epoch 032: 221 / 1689 loss=3.66, nll_loss=2.121, ppl=4.35, wps=542135, ups=1.1, wpb=494316, bsz=16480.2, num_updates=52500, lr=0.000276026, gnorm=0.153, clip=0, loss_scale=1, train_wall=90, gb_free=22.4, wall=48681 epoch 032: 221 / 1689 loss=3.66, nll_loss=2.121, ppl=4.35, wps=542135, ups=1.1, wpb=494316, bsz=16480.2, num_updates=52500, lr=0.000276026, gnorm=0.153, clip=0, loss_scale=1, train_wall=90, gb_free=22.4, wall=48681 epoch 032: 221 / 1689 loss=3.66, nll_loss=2.121, ppl=4.35, wps=542135, ups=1.1, wpb=494316, bsz=16480.2, num_updates=52500, lr=0.000276026, gnorm=0.153, clip=0, loss_scale=1, train_wall=90, gb_free=22.4, wall=48681 epoch 032: 221 / 1689 loss=3.66, nll_loss=2.121, ppl=4.35, wps=542135, ups=1.1, wpb=494316, bsz=16480.2, num_updates=52500, lr=0.000276026, gnorm=0.153, clip=0, loss_scale=1, train_wall=90, gb_free=22.4, wall=48681 epoch 032: 221 / 1689 loss=3.66, nll_loss=2.121, ppl=4.35, wps=542135, ups=1.1, wpb=494316, bsz=16480.2, num_updates=52500, lr=0.000276026, gnorm=0.153, clip=0, loss_scale=1, train_wall=90, gb_free=22.4, wall=48681 epoch 032: 221 / 1689 loss=3.66, nll_loss=2.121, ppl=4.35, wps=542135, ups=1.1, wpb=494316, bsz=16480.2, num_updates=52500, lr=0.000276026, gnorm=0.153, clip=0, loss_scale=1, train_wall=90, gb_free=22.4, wall=48681 epoch 032: 221 / 1689 loss=3.66, nll_loss=2.121, ppl=4.35, wps=542135, ups=1.1, wpb=494316, bsz=16480.2, num_updates=52500, lr=0.000276026, gnorm=0.153, clip=0, loss_scale=1, train_wall=90, gb_free=22.4, wall=48681 epoch 032: 221 / 1689 loss=3.66, nll_loss=2.121, ppl=4.35, wps=542135, ups=1.1, wpb=494316, bsz=16480.2, num_updates=52500, lr=0.000276026, gnorm=0.153, clip=0, loss_scale=1, train_wall=90, gb_free=22.4, wall=48681 epoch 032: 221 / 1689 loss=3.66, nll_loss=2.121, ppl=4.35, wps=542135, ups=1.1, wpb=494316, bsz=16480.2, num_updates=52500, lr=0.000276026, gnorm=0.153, clip=0, loss_scale=1, train_wall=90, gb_free=22.4, wall=48681 epoch 032: 221 / 1689 loss=3.66, nll_loss=2.121, ppl=4.35, wps=542135, ups=1.1, wpb=494316, bsz=16480.2, num_updates=52500, lr=0.000276026, gnorm=0.153, clip=0, loss_scale=1, train_wall=90, gb_free=22.4, wall=48681 epoch 032: 221 / 1689 loss=3.66, nll_loss=2.121, ppl=4.35, wps=542135, ups=1.1, wpb=494316, bsz=16480.2, num_updates=52500, lr=0.000276026, gnorm=0.153, clip=0, loss_scale=1, train_wall=90, gb_free=22.4, wall=48681 epoch 032: 221 / 1689 loss=3.66, nll_loss=2.121, ppl=4.35, wps=542135, ups=1.1, wpb=494316, bsz=16480.2, num_updates=52500, lr=0.000276026, gnorm=0.153, clip=0, loss_scale=1, train_wall=90, gb_free=22.4, wall=48681 epoch 032: 221 / 1689 loss=3.66, nll_loss=2.121, ppl=4.35, wps=542135, ups=1.1, wpb=494316, bsz=16480.2, num_updates=52500, lr=0.000276026, gnorm=0.153, clip=0, loss_scale=1, train_wall=90, gb_free=22.4, wall=48681 epoch 032: 221 / 1689 loss=3.66, nll_loss=2.121, ppl=4.35, wps=542135, ups=1.1, wpb=494316, bsz=16480.2, num_updates=52500, lr=0.000276026, gnorm=0.153, clip=0, loss_scale=1, train_wall=90, gb_free=22.4, wall=48681 epoch 032: 221 / 1689 loss=3.66, nll_loss=2.121, ppl=4.35, wps=542135, ups=1.1, wpb=494316, bsz=16480.2, num_updates=52500, lr=0.000276026, gnorm=0.153, clip=0, loss_scale=1, train_wall=90, gb_free=22.4, wall=48681 epoch 032: 221 / 1689 loss=3.66, nll_loss=2.121, ppl=4.35, wps=542135, ups=1.1, wpb=494316, bsz=16480.2, num_updates=52500, lr=0.000276026, gnorm=0.153, clip=0, loss_scale=1, train_wall=90, gb_free=22.4, wall=48681 epoch 032: 221 / 1689 loss=3.66, nll_loss=2.121, ppl=4.35, wps=542135, ups=1.1, wpb=494316, bsz=16480.2, num_updates=52500, lr=0.000276026, gnorm=0.153, clip=0, loss_scale=1, train_wall=90, gb_free=22.4, wall=48681 epoch 032: 221 / 1689 loss=3.66, nll_loss=2.121, ppl=4.35, wps=542135, ups=1.1, wpb=494316, bsz=16480.2, num_updates=52500, lr=0.000276026, gnorm=0.153, clip=0, loss_scale=1, train_wall=90, gb_free=22.4, wall=48681 epoch 032: 221 / 1689 loss=3.66, nll_loss=2.121, ppl=4.35, wps=542135, ups=1.1, wpb=494316, bsz=16480.2, num_updates=52500, lr=0.000276026, gnorm=0.153, clip=0, loss_scale=1, train_wall=90, gb_free=22.4, wall=48681 epoch 032: 221 / 1689 loss=3.66, nll_loss=2.121, ppl=4.35, wps=542135, ups=1.1, wpb=494316, bsz=16480.2, num_updates=52500, lr=0.000276026, gnorm=0.153, clip=0, loss_scale=1, train_wall=90, gb_free=22.4, wall=48681 epoch 032: 221 / 1689 loss=3.66, nll_loss=2.121, ppl=4.35, wps=542135, ups=1.1, wpb=494316, bsz=16480.2, num_updates=52500, lr=0.000276026, gnorm=0.153, clip=0, loss_scale=1, train_wall=90, gb_free=22.4, wall=48681 epoch 032: 221 / 1689 loss=3.66, nll_loss=2.121, ppl=4.35, wps=542135, ups=1.1, wpb=494316, bsz=16480.2, num_updates=52500, lr=0.000276026, gnorm=0.153, clip=0, loss_scale=1, train_wall=90, gb_free=22.4, wall=48681 epoch 032: 221 / 1689 loss=3.66, nll_loss=2.121, ppl=4.35, wps=542135, ups=1.1, wpb=494316, bsz=16480.2, num_updates=52500, lr=0.000276026, gnorm=0.153, clip=0, loss_scale=1, train_wall=90, gb_free=22.4, wall=48681 epoch 032: 221 / 1689 loss=3.66, nll_loss=2.121, ppl=4.35, wps=542135, ups=1.1, wpb=494316, bsz=16480.2, num_updates=52500, lr=0.000276026, gnorm=0.153, clip=0, loss_scale=1, train_wall=90, gb_free=22.4, wall=48681 epoch 032: 221 / 1689 loss=3.66, nll_loss=2.121, ppl=4.35, wps=542135, ups=1.1, wpb=494316, bsz=16480.2, num_updates=52500, lr=0.000276026, gnorm=0.153, clip=0, loss_scale=1, train_wall=90, gb_free=22.4, wall=48681 epoch 032: 221 / 1689 loss=3.66, nll_loss=2.121, ppl=4.35, wps=542135, ups=1.1, wpb=494316, bsz=16480.2, num_updates=52500, lr=0.000276026, gnorm=0.153, clip=0, loss_scale=1, train_wall=90, gb_free=22.4, wall=48681 epoch 032: 221 / 1689 loss=3.66, nll_loss=2.121, ppl=4.35, wps=542135, ups=1.1, wpb=494316, bsz=16480.2, num_updates=52500, lr=0.000276026, gnorm=0.153, clip=0, loss_scale=1, train_wall=90, gb_free=22.4, wall=48681 epoch 032: 221 / 1689 loss=3.66, nll_loss=2.121, ppl=4.35, wps=542135, ups=1.1, wpb=494316, bsz=16480.2, num_updates=52500, lr=0.000276026, gnorm=0.153, clip=0, loss_scale=1, train_wall=90, gb_free=22.4, wall=48681 epoch 032: 221 / 1689 loss=3.66, nll_loss=2.121, ppl=4.35, wps=542135, ups=1.1, wpb=494316, bsz=16480.2, num_updates=52500, lr=0.000276026, gnorm=0.153, clip=0, loss_scale=1, train_wall=90, gb_free=22.4, wall=48681 epoch 032: 221 / 1689 loss=3.66, nll_loss=2.121, ppl=4.35, wps=542135, ups=1.1, wpb=494316, bsz=16480.2, num_updates=52500, lr=0.000276026, gnorm=0.153, clip=0, loss_scale=1, train_wall=90, gb_free=22.4, wall=48681 epoch 032: 221 / 1689 loss=3.66, nll_loss=2.121, ppl=4.35, wps=542135, ups=1.1, wpb=494316, bsz=16480.2, num_updates=52500, lr=0.000276026, gnorm=0.153, clip=0, loss_scale=1, train_wall=90, gb_free=22.4, wall=48681 epoch 032: 321 / 1689 loss=3.654, nll_loss=2.114, ppl=4.33, wps=550355, ups=1.11, wpb=495928, bsz=16648.3, num_updates=52600, lr=0.000275764, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=48771 epoch 032: 321 / 1689 loss=3.654, nll_loss=2.114, ppl=4.33, wps=550355, ups=1.11, wpb=495928, bsz=16648.3, num_updates=52600, lr=0.000275764, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=48771 epoch 032: 321 / 1689 loss=3.654, nll_loss=2.114, ppl=4.33, wps=550355, ups=1.11, wpb=495928, bsz=16648.3, num_updates=52600, lr=0.000275764, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=48771 epoch 032: 321 / 1689 loss=3.654, nll_loss=2.114, ppl=4.33, wps=550355, ups=1.11, wpb=495928, bsz=16648.3, num_updates=52600, lr=0.000275764, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=48771 epoch 032: 321 / 1689 loss=3.654, nll_loss=2.114, ppl=4.33, wps=550355, ups=1.11, wpb=495928, bsz=16648.3, num_updates=52600, lr=0.000275764, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=48771 epoch 032: 321 / 1689 loss=3.654, nll_loss=2.114, ppl=4.33, wps=550355, ups=1.11, wpb=495928, bsz=16648.3, num_updates=52600, lr=0.000275764, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=48771 epoch 032: 321 / 1689 loss=3.654, nll_loss=2.114, ppl=4.33, wps=550355, ups=1.11, wpb=495928, bsz=16648.3, num_updates=52600, lr=0.000275764, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=48771 epoch 032: 321 / 1689 loss=3.654, nll_loss=2.114, ppl=4.33, wps=550355, ups=1.11, wpb=495928, bsz=16648.3, num_updates=52600, lr=0.000275764, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=48771 epoch 032: 321 / 1689 loss=3.654, nll_loss=2.114, ppl=4.33, wps=550355, ups=1.11, wpb=495928, bsz=16648.3, num_updates=52600, lr=0.000275764, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=48771 epoch 032: 321 / 1689 loss=3.654, nll_loss=2.114, ppl=4.33, wps=550355, ups=1.11, wpb=495928, bsz=16648.3, num_updates=52600, lr=0.000275764, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=48771 epoch 032: 321 / 1689 loss=3.654, nll_loss=2.114, ppl=4.33, wps=550355, ups=1.11, wpb=495928, bsz=16648.3, num_updates=52600, lr=0.000275764, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=48771 epoch 032: 321 / 1689 loss=3.654, nll_loss=2.114, ppl=4.33, wps=550355, ups=1.11, wpb=495928, bsz=16648.3, num_updates=52600, lr=0.000275764, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=48771 epoch 032: 321 / 1689 loss=3.654, nll_loss=2.114, ppl=4.33, wps=550355, ups=1.11, wpb=495928, bsz=16648.3, num_updates=52600, lr=0.000275764, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=48771 epoch 032: 321 / 1689 loss=3.654, nll_loss=2.114, ppl=4.33, wps=550355, ups=1.11, wpb=495928, bsz=16648.3, num_updates=52600, lr=0.000275764, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=48771 epoch 032: 321 / 1689 loss=3.654, nll_loss=2.114, ppl=4.33, wps=550355, ups=1.11, wpb=495928, bsz=16648.3, num_updates=52600, lr=0.000275764, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=48771 epoch 032: 321 / 1689 loss=3.654, nll_loss=2.114, ppl=4.33, wps=550355, ups=1.11, wpb=495928, bsz=16648.3, num_updates=52600, lr=0.000275764, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=48771 epoch 032: 321 / 1689 loss=3.654, nll_loss=2.114, ppl=4.33, wps=550355, ups=1.11, wpb=495928, bsz=16648.3, num_updates=52600, lr=0.000275764, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=48771 epoch 032: 321 / 1689 loss=3.654, nll_loss=2.114, ppl=4.33, wps=550355, ups=1.11, wpb=495928, bsz=16648.3, num_updates=52600, lr=0.000275764, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=48771 epoch 032: 321 / 1689 loss=3.654, nll_loss=2.114, ppl=4.33, wps=550355, ups=1.11, wpb=495928, bsz=16648.3, num_updates=52600, lr=0.000275764, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=48771 epoch 032: 321 / 1689 loss=3.654, nll_loss=2.114, ppl=4.33, wps=550355, ups=1.11, wpb=495928, bsz=16648.3, num_updates=52600, lr=0.000275764, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=48771 epoch 032: 321 / 1689 loss=3.654, nll_loss=2.114, ppl=4.33, wps=550355, ups=1.11, wpb=495928, bsz=16648.3, num_updates=52600, lr=0.000275764, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=48771 epoch 032: 321 / 1689 loss=3.654, nll_loss=2.114, ppl=4.33, wps=550355, ups=1.11, wpb=495928, bsz=16648.3, num_updates=52600, lr=0.000275764, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=48771 epoch 032: 321 / 1689 loss=3.654, nll_loss=2.114, ppl=4.33, wps=550355, ups=1.11, wpb=495928, bsz=16648.3, num_updates=52600, lr=0.000275764, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=48771 epoch 032: 321 / 1689 loss=3.654, nll_loss=2.114, ppl=4.33, wps=550355, ups=1.11, wpb=495928, bsz=16648.3, num_updates=52600, lr=0.000275764, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=48771 epoch 032: 321 / 1689 loss=3.654, nll_loss=2.114, ppl=4.33, wps=550355, ups=1.11, wpb=495928, bsz=16648.3, num_updates=52600, lr=0.000275764, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=48771 epoch 032: 321 / 1689 loss=3.654, nll_loss=2.114, ppl=4.33, wps=550355, ups=1.11, wpb=495928, bsz=16648.3, num_updates=52600, lr=0.000275764, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=48771 epoch 032: 321 / 1689 loss=3.654, nll_loss=2.114, ppl=4.33, wps=550355, ups=1.11, wpb=495928, bsz=16648.3, num_updates=52600, lr=0.000275764, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=48771 epoch 032: 321 / 1689 loss=3.654, nll_loss=2.114, ppl=4.33, wps=550355, ups=1.11, wpb=495928, bsz=16648.3, num_updates=52600, lr=0.000275764, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=48771 epoch 032: 321 / 1689 loss=3.654, nll_loss=2.114, ppl=4.33, wps=550355, ups=1.11, wpb=495928, bsz=16648.3, num_updates=52600, lr=0.000275764, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=48771 epoch 032: 321 / 1689 loss=3.654, nll_loss=2.114, ppl=4.33, wps=550355, ups=1.11, wpb=495928, bsz=16648.3, num_updates=52600, lr=0.000275764, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=48771 epoch 032: 321 / 1689 loss=3.654, nll_loss=2.114, ppl=4.33, wps=550355, ups=1.11, wpb=495928, bsz=16648.3, num_updates=52600, lr=0.000275764, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=48771 epoch 032: 321 / 1689 loss=3.654, nll_loss=2.114, ppl=4.33, wps=550355, ups=1.11, wpb=495928, bsz=16648.3, num_updates=52600, lr=0.000275764, gnorm=0.164, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=48771 epoch 032: 421 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=549997, ups=1.11, wpb=497387, bsz=16576.9, num_updates=52700, lr=0.000275502, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=48861 epoch 032: 421 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=549997, ups=1.11, wpb=497387, bsz=16576.9, num_updates=52700, lr=0.000275502, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=48861 epoch 032: 421 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=549997, ups=1.11, wpb=497387, bsz=16576.9, num_updates=52700, lr=0.000275502, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=48861 epoch 032: 421 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=549997, ups=1.11, wpb=497387, bsz=16576.9, num_updates=52700, lr=0.000275502, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=48861 epoch 032: 421 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=549997, ups=1.11, wpb=497387, bsz=16576.9, num_updates=52700, lr=0.000275502, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=48861 epoch 032: 421 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=549997, ups=1.11, wpb=497387, bsz=16576.9, num_updates=52700, lr=0.000275502, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=48861 epoch 032: 421 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=549997, ups=1.11, wpb=497387, bsz=16576.9, num_updates=52700, lr=0.000275502, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=48861 epoch 032: 421 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=549997, ups=1.11, wpb=497387, bsz=16576.9, num_updates=52700, lr=0.000275502, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=48861 epoch 032: 421 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=549997, ups=1.11, wpb=497387, bsz=16576.9, num_updates=52700, lr=0.000275502, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=48861 epoch 032: 421 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=549997, ups=1.11, wpb=497387, bsz=16576.9, num_updates=52700, lr=0.000275502, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=48861 epoch 032: 421 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=549997, ups=1.11, wpb=497387, bsz=16576.9, num_updates=52700, lr=0.000275502, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=48861 epoch 032: 421 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=549997, ups=1.11, wpb=497387, bsz=16576.9, num_updates=52700, lr=0.000275502, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=48861 epoch 032: 421 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=549997, ups=1.11, wpb=497387, bsz=16576.9, num_updates=52700, lr=0.000275502, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=48861 epoch 032: 421 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=549997, ups=1.11, wpb=497387, bsz=16576.9, num_updates=52700, lr=0.000275502, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=48861 epoch 032: 421 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=549997, ups=1.11, wpb=497387, bsz=16576.9, num_updates=52700, lr=0.000275502, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=48861 epoch 032: 421 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=549997, ups=1.11, wpb=497387, bsz=16576.9, num_updates=52700, lr=0.000275502, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=48861 epoch 032: 421 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=549997, ups=1.11, wpb=497387, bsz=16576.9, num_updates=52700, lr=0.000275502, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=48861 epoch 032: 421 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=549997, ups=1.11, wpb=497387, bsz=16576.9, num_updates=52700, lr=0.000275502, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=48861 epoch 032: 421 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=549997, ups=1.11, wpb=497387, bsz=16576.9, num_updates=52700, lr=0.000275502, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=48861 epoch 032: 421 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=549997, ups=1.11, wpb=497387, bsz=16576.9, num_updates=52700, lr=0.000275502, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=48861 epoch 032: 421 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=549997, ups=1.11, wpb=497387, bsz=16576.9, num_updates=52700, lr=0.000275502, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=48861 epoch 032: 421 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=549997, ups=1.11, wpb=497387, bsz=16576.9, num_updates=52700, lr=0.000275502, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=48861 epoch 032: 421 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=549997, ups=1.11, wpb=497387, bsz=16576.9, num_updates=52700, lr=0.000275502, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=48861 epoch 032: 421 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=549997, ups=1.11, wpb=497387, bsz=16576.9, num_updates=52700, lr=0.000275502, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=48861 epoch 032: 421 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=549997, ups=1.11, wpb=497387, bsz=16576.9, num_updates=52700, lr=0.000275502, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=48861 epoch 032: 421 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=549997, ups=1.11, wpb=497387, bsz=16576.9, num_updates=52700, lr=0.000275502, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=48861 epoch 032: 421 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=549997, ups=1.11, wpb=497387, bsz=16576.9, num_updates=52700, lr=0.000275502, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=48861 epoch 032: 421 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=549997, ups=1.11, wpb=497387, bsz=16576.9, num_updates=52700, lr=0.000275502, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=48861 epoch 032: 421 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=549997, ups=1.11, wpb=497387, bsz=16576.9, num_updates=52700, lr=0.000275502, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=48861 epoch 032: 421 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=549997, ups=1.11, wpb=497387, bsz=16576.9, num_updates=52700, lr=0.000275502, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=48861 epoch 032: 421 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=549997, ups=1.11, wpb=497387, bsz=16576.9, num_updates=52700, lr=0.000275502, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=48861 epoch 032: 421 / 1689 loss=3.659, nll_loss=2.121, ppl=4.35, wps=549997, ups=1.11, wpb=497387, bsz=16576.9, num_updates=52700, lr=0.000275502, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=48861 epoch 032: 521 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=546347, ups=1.11, wpb=494238, bsz=16820.4, num_updates=52800, lr=0.000275241, gnorm=0.171, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=48952 epoch 032: 521 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=546347, ups=1.11, wpb=494238, bsz=16820.4, num_updates=52800, lr=0.000275241, gnorm=0.171, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=48952 epoch 032: 521 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=546347, ups=1.11, wpb=494238, bsz=16820.4, num_updates=52800, lr=0.000275241, gnorm=0.171, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=48952 epoch 032: 521 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=546347, ups=1.11, wpb=494238, bsz=16820.4, num_updates=52800, lr=0.000275241, gnorm=0.171, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=48952 epoch 032: 521 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=546347, ups=1.11, wpb=494238, bsz=16820.4, num_updates=52800, lr=0.000275241, gnorm=0.171, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=48952 epoch 032: 521 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=546347, ups=1.11, wpb=494238, bsz=16820.4, num_updates=52800, lr=0.000275241, gnorm=0.171, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=48952 epoch 032: 521 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=546347, ups=1.11, wpb=494238, bsz=16820.4, num_updates=52800, lr=0.000275241, gnorm=0.171, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=48952 epoch 032: 521 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=546347, ups=1.11, wpb=494238, bsz=16820.4, num_updates=52800, lr=0.000275241, gnorm=0.171, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=48952 epoch 032: 521 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=546347, ups=1.11, wpb=494238, bsz=16820.4, num_updates=52800, lr=0.000275241, gnorm=0.171, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=48952 epoch 032: 521 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=546347, ups=1.11, wpb=494238, bsz=16820.4, num_updates=52800, lr=0.000275241, gnorm=0.171, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=48952 epoch 032: 521 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=546347, ups=1.11, wpb=494238, bsz=16820.4, num_updates=52800, lr=0.000275241, gnorm=0.171, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=48952 epoch 032: 521 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=546347, ups=1.11, wpb=494238, bsz=16820.4, num_updates=52800, lr=0.000275241, gnorm=0.171, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=48952 epoch 032: 521 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=546347, ups=1.11, wpb=494238, bsz=16820.4, num_updates=52800, lr=0.000275241, gnorm=0.171, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=48952 epoch 032: 521 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=546347, ups=1.11, wpb=494238, bsz=16820.4, num_updates=52800, lr=0.000275241, gnorm=0.171, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=48952 epoch 032: 521 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=546347, ups=1.11, wpb=494238, bsz=16820.4, num_updates=52800, lr=0.000275241, gnorm=0.171, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=48952 epoch 032: 521 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=546347, ups=1.11, wpb=494238, bsz=16820.4, num_updates=52800, lr=0.000275241, gnorm=0.171, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=48952 epoch 032: 521 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=546347, ups=1.11, wpb=494238, bsz=16820.4, num_updates=52800, lr=0.000275241, gnorm=0.171, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=48952 epoch 032: 521 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=546347, ups=1.11, wpb=494238, bsz=16820.4, num_updates=52800, lr=0.000275241, gnorm=0.171, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=48952 epoch 032: 521 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=546347, ups=1.11, wpb=494238, bsz=16820.4, num_updates=52800, lr=0.000275241, gnorm=0.171, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=48952 epoch 032: 521 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=546347, ups=1.11, wpb=494238, bsz=16820.4, num_updates=52800, lr=0.000275241, gnorm=0.171, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=48952 epoch 032: 521 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=546347, ups=1.11, wpb=494238, bsz=16820.4, num_updates=52800, lr=0.000275241, gnorm=0.171, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=48952 epoch 032: 521 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=546347, ups=1.11, wpb=494238, bsz=16820.4, num_updates=52800, lr=0.000275241, gnorm=0.171, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=48952 epoch 032: 521 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=546347, ups=1.11, wpb=494238, bsz=16820.4, num_updates=52800, lr=0.000275241, gnorm=0.171, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=48952 epoch 032: 521 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=546347, ups=1.11, wpb=494238, bsz=16820.4, num_updates=52800, lr=0.000275241, gnorm=0.171, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=48952 epoch 032: 521 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=546347, ups=1.11, wpb=494238, bsz=16820.4, num_updates=52800, lr=0.000275241, gnorm=0.171, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=48952 epoch 032: 521 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=546347, ups=1.11, wpb=494238, bsz=16820.4, num_updates=52800, lr=0.000275241, gnorm=0.171, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=48952 epoch 032: 521 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=546347, ups=1.11, wpb=494238, bsz=16820.4, num_updates=52800, lr=0.000275241, gnorm=0.171, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=48952 epoch 032: 521 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=546347, ups=1.11, wpb=494238, bsz=16820.4, num_updates=52800, lr=0.000275241, gnorm=0.171, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=48952 epoch 032: 521 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=546347, ups=1.11, wpb=494238, bsz=16820.4, num_updates=52800, lr=0.000275241, gnorm=0.171, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=48952 epoch 032: 521 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=546347, ups=1.11, wpb=494238, bsz=16820.4, num_updates=52800, lr=0.000275241, gnorm=0.171, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=48952 epoch 032: 521 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=546347, ups=1.11, wpb=494238, bsz=16820.4, num_updates=52800, lr=0.000275241, gnorm=0.171, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=48952 epoch 032: 521 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=546347, ups=1.11, wpb=494238, bsz=16820.4, num_updates=52800, lr=0.000275241, gnorm=0.171, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=48952 epoch 032: 621 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=549810, ups=1.11, wpb=495871, bsz=16469.1, num_updates=52900, lr=0.000274981, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=49042 epoch 032: 621 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=549810, ups=1.11, wpb=495871, bsz=16469.1, num_updates=52900, lr=0.000274981, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=49042 epoch 032: 621 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=549810, ups=1.11, wpb=495871, bsz=16469.1, num_updates=52900, lr=0.000274981, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=49042 epoch 032: 621 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=549810, ups=1.11, wpb=495871, bsz=16469.1, num_updates=52900, lr=0.000274981, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=49042 epoch 032: 621 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=549810, ups=1.11, wpb=495871, bsz=16469.1, num_updates=52900, lr=0.000274981, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=49042 epoch 032: 621 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=549810, ups=1.11, wpb=495871, bsz=16469.1, num_updates=52900, lr=0.000274981, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=49042 epoch 032: 621 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=549810, ups=1.11, wpb=495871, bsz=16469.1, num_updates=52900, lr=0.000274981, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=49042 epoch 032: 621 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=549810, ups=1.11, wpb=495871, bsz=16469.1, num_updates=52900, lr=0.000274981, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=49042 epoch 032: 621 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=549810, ups=1.11, wpb=495871, bsz=16469.1, num_updates=52900, lr=0.000274981, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=49042 epoch 032: 621 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=549810, ups=1.11, wpb=495871, bsz=16469.1, num_updates=52900, lr=0.000274981, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=49042 epoch 032: 621 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=549810, ups=1.11, wpb=495871, bsz=16469.1, num_updates=52900, lr=0.000274981, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=49042 epoch 032: 621 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=549810, ups=1.11, wpb=495871, bsz=16469.1, num_updates=52900, lr=0.000274981, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=49042 epoch 032: 621 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=549810, ups=1.11, wpb=495871, bsz=16469.1, num_updates=52900, lr=0.000274981, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=49042 epoch 032: 621 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=549810, ups=1.11, wpb=495871, bsz=16469.1, num_updates=52900, lr=0.000274981, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=49042 epoch 032: 621 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=549810, ups=1.11, wpb=495871, bsz=16469.1, num_updates=52900, lr=0.000274981, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=49042 epoch 032: 621 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=549810, ups=1.11, wpb=495871, bsz=16469.1, num_updates=52900, lr=0.000274981, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=49042 epoch 032: 621 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=549810, ups=1.11, wpb=495871, bsz=16469.1, num_updates=52900, lr=0.000274981, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=49042 epoch 032: 621 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=549810, ups=1.11, wpb=495871, bsz=16469.1, num_updates=52900, lr=0.000274981, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=49042 epoch 032: 621 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=549810, ups=1.11, wpb=495871, bsz=16469.1, num_updates=52900, lr=0.000274981, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=49042 epoch 032: 621 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=549810, ups=1.11, wpb=495871, bsz=16469.1, num_updates=52900, lr=0.000274981, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=49042 epoch 032: 621 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=549810, ups=1.11, wpb=495871, bsz=16469.1, num_updates=52900, lr=0.000274981, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=49042 epoch 032: 621 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=549810, ups=1.11, wpb=495871, bsz=16469.1, num_updates=52900, lr=0.000274981, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=49042 epoch 032: 621 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=549810, ups=1.11, wpb=495871, bsz=16469.1, num_updates=52900, lr=0.000274981, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=49042 epoch 032: 621 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=549810, ups=1.11, wpb=495871, bsz=16469.1, num_updates=52900, lr=0.000274981, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=49042 epoch 032: 621 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=549810, ups=1.11, wpb=495871, bsz=16469.1, num_updates=52900, lr=0.000274981, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=49042 epoch 032: 621 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=549810, ups=1.11, wpb=495871, bsz=16469.1, num_updates=52900, lr=0.000274981, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=49042 epoch 032: 621 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=549810, ups=1.11, wpb=495871, bsz=16469.1, num_updates=52900, lr=0.000274981, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=49042 epoch 032: 621 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=549810, ups=1.11, wpb=495871, bsz=16469.1, num_updates=52900, lr=0.000274981, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=49042 epoch 032: 621 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=549810, ups=1.11, wpb=495871, bsz=16469.1, num_updates=52900, lr=0.000274981, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=49042 epoch 032: 621 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=549810, ups=1.11, wpb=495871, bsz=16469.1, num_updates=52900, lr=0.000274981, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=49042 epoch 032: 621 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=549810, ups=1.11, wpb=495871, bsz=16469.1, num_updates=52900, lr=0.000274981, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=49042 epoch 032: 621 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=549810, ups=1.11, wpb=495871, bsz=16469.1, num_updates=52900, lr=0.000274981, gnorm=0.163, clip=0, loss_scale=1, train_wall=88, gb_free=21.1, wall=49042 epoch 032: 721 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=550481, ups=1.11, wpb=494146, bsz=16560.2, num_updates=53000, lr=0.000274721, gnorm=0.176, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=49132 epoch 032: 721 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=550481, ups=1.11, wpb=494146, bsz=16560.2, num_updates=53000, lr=0.000274721, gnorm=0.176, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=49132 epoch 032: 721 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=550481, ups=1.11, wpb=494146, bsz=16560.2, num_updates=53000, lr=0.000274721, gnorm=0.176, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=49132 epoch 032: 721 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=550481, ups=1.11, wpb=494146, bsz=16560.2, num_updates=53000, lr=0.000274721, gnorm=0.176, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=49132 epoch 032: 721 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=550481, ups=1.11, wpb=494146, bsz=16560.2, num_updates=53000, lr=0.000274721, gnorm=0.176, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=49132 epoch 032: 721 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=550481, ups=1.11, wpb=494146, bsz=16560.2, num_updates=53000, lr=0.000274721, gnorm=0.176, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=49132 epoch 032: 721 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=550481, ups=1.11, wpb=494146, bsz=16560.2, num_updates=53000, lr=0.000274721, gnorm=0.176, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=49132 epoch 032: 721 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=550481, ups=1.11, wpb=494146, bsz=16560.2, num_updates=53000, lr=0.000274721, gnorm=0.176, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=49132 epoch 032: 721 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=550481, ups=1.11, wpb=494146, bsz=16560.2, num_updates=53000, lr=0.000274721, gnorm=0.176, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=49132 epoch 032: 721 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=550481, ups=1.11, wpb=494146, bsz=16560.2, num_updates=53000, lr=0.000274721, gnorm=0.176, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=49132 epoch 032: 721 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=550481, ups=1.11, wpb=494146, bsz=16560.2, num_updates=53000, lr=0.000274721, gnorm=0.176, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=49132 epoch 032: 721 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=550481, ups=1.11, wpb=494146, bsz=16560.2, num_updates=53000, lr=0.000274721, gnorm=0.176, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=49132 epoch 032: 721 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=550481, ups=1.11, wpb=494146, bsz=16560.2, num_updates=53000, lr=0.000274721, gnorm=0.176, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=49132 epoch 032: 721 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=550481, ups=1.11, wpb=494146, bsz=16560.2, num_updates=53000, lr=0.000274721, gnorm=0.176, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=49132 epoch 032: 721 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=550481, ups=1.11, wpb=494146, bsz=16560.2, num_updates=53000, lr=0.000274721, gnorm=0.176, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=49132 epoch 032: 721 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=550481, ups=1.11, wpb=494146, bsz=16560.2, num_updates=53000, lr=0.000274721, gnorm=0.176, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=49132 epoch 032: 721 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=550481, ups=1.11, wpb=494146, bsz=16560.2, num_updates=53000, lr=0.000274721, gnorm=0.176, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=49132 epoch 032: 721 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=550481, ups=1.11, wpb=494146, bsz=16560.2, num_updates=53000, lr=0.000274721, gnorm=0.176, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=49132 epoch 032: 721 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=550481, ups=1.11, wpb=494146, bsz=16560.2, num_updates=53000, lr=0.000274721, gnorm=0.176, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=49132 epoch 032: 721 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=550481, ups=1.11, wpb=494146, bsz=16560.2, num_updates=53000, lr=0.000274721, gnorm=0.176, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=49132 epoch 032: 721 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=550481, ups=1.11, wpb=494146, bsz=16560.2, num_updates=53000, lr=0.000274721, gnorm=0.176, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=49132 epoch 032: 721 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=550481, ups=1.11, wpb=494146, bsz=16560.2, num_updates=53000, lr=0.000274721, gnorm=0.176, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=49132 epoch 032: 721 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=550481, ups=1.11, wpb=494146, bsz=16560.2, num_updates=53000, lr=0.000274721, gnorm=0.176, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=49132 epoch 032: 721 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=550481, ups=1.11, wpb=494146, bsz=16560.2, num_updates=53000, lr=0.000274721, gnorm=0.176, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=49132 epoch 032: 721 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=550481, ups=1.11, wpb=494146, bsz=16560.2, num_updates=53000, lr=0.000274721, gnorm=0.176, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=49132 epoch 032: 721 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=550481, ups=1.11, wpb=494146, bsz=16560.2, num_updates=53000, lr=0.000274721, gnorm=0.176, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=49132 epoch 032: 721 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=550481, ups=1.11, wpb=494146, bsz=16560.2, num_updates=53000, lr=0.000274721, gnorm=0.176, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=49132 epoch 032: 721 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=550481, ups=1.11, wpb=494146, bsz=16560.2, num_updates=53000, lr=0.000274721, gnorm=0.176, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=49132 epoch 032: 721 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=550481, ups=1.11, wpb=494146, bsz=16560.2, num_updates=53000, lr=0.000274721, gnorm=0.176, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=49132 epoch 032: 721 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=550481, ups=1.11, wpb=494146, bsz=16560.2, num_updates=53000, lr=0.000274721, gnorm=0.176, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=49132 epoch 032: 721 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=550481, ups=1.11, wpb=494146, bsz=16560.2, num_updates=53000, lr=0.000274721, gnorm=0.176, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=49132 epoch 032: 721 / 1689 loss=3.668, nll_loss=2.13, ppl=4.38, wps=550481, ups=1.11, wpb=494146, bsz=16560.2, num_updates=53000, lr=0.000274721, gnorm=0.176, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=49132 begin validation on "valid" subset epoch 032 | valid on 'valid' subset | loss 3.701 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.701 epoch 032 | valid on 'valid' subset | loss 3.701 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.701 epoch 032 | valid on 'valid' subset | loss 3.701 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.701 epoch 032 | valid on 'valid' subset | loss 3.701 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.701 epoch 032 | valid on 'valid' subset | loss 3.701 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.701 epoch 032 | valid on 'valid' subset | loss 3.701 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.701 epoch 032 | valid on 'valid' subset | loss 3.701 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.701 epoch 032 | valid on 'valid' subset | loss 3.701 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.701 epoch 032 | valid on 'valid' subset | loss 3.701 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.701 epoch 032 | valid on 'valid' subset | loss 3.701 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.701 epoch 032 | valid on 'valid' subset | loss 3.701 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.701 epoch 032 | valid on 'valid' subset | loss 3.701 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.701 epoch 032 | valid on 'valid' subset | loss 3.701 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.701 epoch 032 | valid on 'valid' subset | loss 3.701 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.701 epoch 032 | valid on 'valid' subset | loss 3.701 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.701 epoch 032 | valid on 'valid' subset | loss 3.701 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.701 epoch 032 | valid on 'valid' subset | loss 3.701 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.701 epoch 032 | valid on 'valid' subset | loss 3.701 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.701 epoch 032 | valid on 'valid' subset | loss 3.701 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.701 epoch 032 | valid on 'valid' subset | loss 3.701 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.701 epoch 032 | valid on 'valid' subset | loss 3.701 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.701 epoch 032 | valid on 'valid' subset | loss 3.701 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.701 epoch 032 | valid on 'valid' subset | loss 3.701 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.701 epoch 032 | valid on 'valid' subset | loss 3.701 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.701 epoch 032 | valid on 'valid' subset | loss 3.701 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.701 epoch 032 | valid on 'valid' subset | loss 3.701 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.701 epoch 032 | valid on 'valid' subset | loss 3.701 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.701 epoch 032 | valid on 'valid' subset | loss 3.701 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.701 epoch 032 | valid on 'valid' subset | loss 3.701 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.701 epoch 032 | valid on 'valid' subset | loss 3.701 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.701 epoch 032 | valid on 'valid' subset | loss 3.701 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.701 epoch 032 | valid on 'valid' subset | loss 3.701 | nll_loss 2.144 | ppl 4.42 | wps 0 | wpb 44526 | bsz 2008 | num_updates 53000 | best_loss 3.701 epoch 032: 821 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=458821, ups=0.93, wpb=495100, bsz=16031, num_updates=53100, lr=0.000274462, gnorm=0.165, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=49240 epoch 032: 821 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=458821, ups=0.93, wpb=495100, bsz=16031, num_updates=53100, lr=0.000274462, gnorm=0.165, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=49240 epoch 032: 821 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=458821, ups=0.93, wpb=495100, bsz=16031, num_updates=53100, lr=0.000274462, gnorm=0.165, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=49240 epoch 032: 821 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=458821, ups=0.93, wpb=495100, bsz=16031, num_updates=53100, lr=0.000274462, gnorm=0.165, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=49240 epoch 032: 821 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=458821, ups=0.93, wpb=495100, bsz=16031, num_updates=53100, lr=0.000274462, gnorm=0.165, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=49240 epoch 032: 821 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=458821, ups=0.93, wpb=495100, bsz=16031, num_updates=53100, lr=0.000274462, gnorm=0.165, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=49240 epoch 032: 821 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=458821, ups=0.93, wpb=495100, bsz=16031, num_updates=53100, lr=0.000274462, gnorm=0.165, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=49240 epoch 032: 821 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=458821, ups=0.93, wpb=495100, bsz=16031, num_updates=53100, lr=0.000274462, gnorm=0.165, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=49240 epoch 032: 821 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=458821, ups=0.93, wpb=495100, bsz=16031, num_updates=53100, lr=0.000274462, gnorm=0.165, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=49240 epoch 032: 821 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=458821, ups=0.93, wpb=495100, bsz=16031, num_updates=53100, lr=0.000274462, gnorm=0.165, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=49240 epoch 032: 821 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=458821, ups=0.93, wpb=495100, bsz=16031, num_updates=53100, lr=0.000274462, gnorm=0.165, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=49240 epoch 032: 821 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=458821, ups=0.93, wpb=495100, bsz=16031, num_updates=53100, lr=0.000274462, gnorm=0.165, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=49240 epoch 032: 821 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=458821, ups=0.93, wpb=495100, bsz=16031, num_updates=53100, lr=0.000274462, gnorm=0.165, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=49240 epoch 032: 821 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=458821, ups=0.93, wpb=495100, bsz=16031, num_updates=53100, lr=0.000274462, gnorm=0.165, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=49240 epoch 032: 821 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=458821, ups=0.93, wpb=495100, bsz=16031, num_updates=53100, lr=0.000274462, gnorm=0.165, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=49240 epoch 032: 821 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=458821, ups=0.93, wpb=495100, bsz=16031, num_updates=53100, lr=0.000274462, gnorm=0.165, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=49240 epoch 032: 821 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=458821, ups=0.93, wpb=495100, bsz=16031, num_updates=53100, lr=0.000274462, gnorm=0.165, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=49240 epoch 032: 821 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=458821, ups=0.93, wpb=495100, bsz=16031, num_updates=53100, lr=0.000274462, gnorm=0.165, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=49240 epoch 032: 821 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=458821, ups=0.93, wpb=495100, bsz=16031, num_updates=53100, lr=0.000274462, gnorm=0.165, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=49240 epoch 032: 821 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=458821, ups=0.93, wpb=495100, bsz=16031, num_updates=53100, lr=0.000274462, gnorm=0.165, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=49240 epoch 032: 821 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=458821, ups=0.93, wpb=495100, bsz=16031, num_updates=53100, lr=0.000274462, gnorm=0.165, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=49240 epoch 032: 821 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=458821, ups=0.93, wpb=495100, bsz=16031, num_updates=53100, lr=0.000274462, gnorm=0.165, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=49240 epoch 032: 821 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=458821, ups=0.93, wpb=495100, bsz=16031, num_updates=53100, lr=0.000274462, gnorm=0.165, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=49240 epoch 032: 821 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=458821, ups=0.93, wpb=495100, bsz=16031, num_updates=53100, lr=0.000274462, gnorm=0.165, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=49240 epoch 032: 821 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=458821, ups=0.93, wpb=495100, bsz=16031, num_updates=53100, lr=0.000274462, gnorm=0.165, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=49240 epoch 032: 821 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=458821, ups=0.93, wpb=495100, bsz=16031, num_updates=53100, lr=0.000274462, gnorm=0.165, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=49240 epoch 032: 821 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=458821, ups=0.93, wpb=495100, bsz=16031, num_updates=53100, lr=0.000274462, gnorm=0.165, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=49240 epoch 032: 821 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=458821, ups=0.93, wpb=495100, bsz=16031, num_updates=53100, lr=0.000274462, gnorm=0.165, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=49240 epoch 032: 821 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=458821, ups=0.93, wpb=495100, bsz=16031, num_updates=53100, lr=0.000274462, gnorm=0.165, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=49240 epoch 032: 821 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=458821, ups=0.93, wpb=495100, bsz=16031, num_updates=53100, lr=0.000274462, gnorm=0.165, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=49240 epoch 032: 821 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=458821, ups=0.93, wpb=495100, bsz=16031, num_updates=53100, lr=0.000274462, gnorm=0.165, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=49240 epoch 032: 821 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=458821, ups=0.93, wpb=495100, bsz=16031, num_updates=53100, lr=0.000274462, gnorm=0.165, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=49240 epoch 032: 922 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=548571, ups=1.11, wpb=495897, bsz=16515.8, num_updates=53200, lr=0.000274204, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=49330 epoch 032: 922 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=548571, ups=1.11, wpb=495897, bsz=16515.8, num_updates=53200, lr=0.000274204, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=49330 epoch 032: 922 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=548571, ups=1.11, wpb=495897, bsz=16515.8, num_updates=53200, lr=0.000274204, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=49330 epoch 032: 922 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=548571, ups=1.11, wpb=495897, bsz=16515.8, num_updates=53200, lr=0.000274204, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=49330 epoch 032: 922 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=548571, ups=1.11, wpb=495897, bsz=16515.8, num_updates=53200, lr=0.000274204, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=49330 epoch 032: 922 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=548571, ups=1.11, wpb=495897, bsz=16515.8, num_updates=53200, lr=0.000274204, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=49330 epoch 032: 922 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=548571, ups=1.11, wpb=495897, bsz=16515.8, num_updates=53200, lr=0.000274204, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=49330 epoch 032: 922 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=548571, ups=1.11, wpb=495897, bsz=16515.8, num_updates=53200, lr=0.000274204, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=49330 epoch 032: 922 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=548571, ups=1.11, wpb=495897, bsz=16515.8, num_updates=53200, lr=0.000274204, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=49330 epoch 032: 922 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=548571, ups=1.11, wpb=495897, bsz=16515.8, num_updates=53200, lr=0.000274204, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=49330 epoch 032: 922 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=548571, ups=1.11, wpb=495897, bsz=16515.8, num_updates=53200, lr=0.000274204, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=49330 epoch 032: 922 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=548571, ups=1.11, wpb=495897, bsz=16515.8, num_updates=53200, lr=0.000274204, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=49330 epoch 032: 922 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=548571, ups=1.11, wpb=495897, bsz=16515.8, num_updates=53200, lr=0.000274204, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=49330 epoch 032: 922 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=548571, ups=1.11, wpb=495897, bsz=16515.8, num_updates=53200, lr=0.000274204, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=49330 epoch 032: 922 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=548571, ups=1.11, wpb=495897, bsz=16515.8, num_updates=53200, lr=0.000274204, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=49330 epoch 032: 922 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=548571, ups=1.11, wpb=495897, bsz=16515.8, num_updates=53200, lr=0.000274204, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=49330 epoch 032: 922 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=548571, ups=1.11, wpb=495897, bsz=16515.8, num_updates=53200, lr=0.000274204, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=49330 epoch 032: 922 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=548571, ups=1.11, wpb=495897, bsz=16515.8, num_updates=53200, lr=0.000274204, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=49330 epoch 032: 922 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=548571, ups=1.11, wpb=495897, bsz=16515.8, num_updates=53200, lr=0.000274204, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=49330 epoch 032: 922 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=548571, ups=1.11, wpb=495897, bsz=16515.8, num_updates=53200, lr=0.000274204, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=49330 epoch 032: 922 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=548571, ups=1.11, wpb=495897, bsz=16515.8, num_updates=53200, lr=0.000274204, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=49330 epoch 032: 922 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=548571, ups=1.11, wpb=495897, bsz=16515.8, num_updates=53200, lr=0.000274204, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=49330 epoch 032: 922 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=548571, ups=1.11, wpb=495897, bsz=16515.8, num_updates=53200, lr=0.000274204, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=49330 epoch 032: 922 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=548571, ups=1.11, wpb=495897, bsz=16515.8, num_updates=53200, lr=0.000274204, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=49330 epoch 032: 922 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=548571, ups=1.11, wpb=495897, bsz=16515.8, num_updates=53200, lr=0.000274204, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=49330 epoch 032: 922 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=548571, ups=1.11, wpb=495897, bsz=16515.8, num_updates=53200, lr=0.000274204, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=49330 epoch 032: 922 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=548571, ups=1.11, wpb=495897, bsz=16515.8, num_updates=53200, lr=0.000274204, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=49330 epoch 032: 922 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=548571, ups=1.11, wpb=495897, bsz=16515.8, num_updates=53200, lr=0.000274204, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=49330 epoch 032: 922 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=548571, ups=1.11, wpb=495897, bsz=16515.8, num_updates=53200, lr=0.000274204, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=49330 epoch 032: 922 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=548571, ups=1.11, wpb=495897, bsz=16515.8, num_updates=53200, lr=0.000274204, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=49330 epoch 032: 922 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=548571, ups=1.11, wpb=495897, bsz=16515.8, num_updates=53200, lr=0.000274204, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=49330 epoch 032: 922 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=548571, ups=1.11, wpb=495897, bsz=16515.8, num_updates=53200, lr=0.000274204, gnorm=0.17, clip=0, loss_scale=1, train_wall=89, gb_free=21.6, wall=49330 epoch 032: 1023 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=548474, ups=1.11, wpb=496314, bsz=16003.6, num_updates=53300, lr=0.000273947, gnorm=0.155, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=49420 epoch 032: 1023 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=548474, ups=1.11, wpb=496314, bsz=16003.6, num_updates=53300, lr=0.000273947, gnorm=0.155, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=49420 epoch 032: 1023 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=548474, ups=1.11, wpb=496314, bsz=16003.6, num_updates=53300, lr=0.000273947, gnorm=0.155, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=49420 epoch 032: 1023 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=548474, ups=1.11, wpb=496314, bsz=16003.6, num_updates=53300, lr=0.000273947, gnorm=0.155, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=49420 epoch 032: 1023 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=548474, ups=1.11, wpb=496314, bsz=16003.6, num_updates=53300, lr=0.000273947, gnorm=0.155, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=49420 epoch 032: 1023 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=548474, ups=1.11, wpb=496314, bsz=16003.6, num_updates=53300, lr=0.000273947, gnorm=0.155, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=49420 epoch 032: 1023 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=548474, ups=1.11, wpb=496314, bsz=16003.6, num_updates=53300, lr=0.000273947, gnorm=0.155, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=49420 epoch 032: 1023 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=548474, ups=1.11, wpb=496314, bsz=16003.6, num_updates=53300, lr=0.000273947, gnorm=0.155, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=49420 epoch 032: 1023 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=548474, ups=1.11, wpb=496314, bsz=16003.6, num_updates=53300, lr=0.000273947, gnorm=0.155, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=49420 epoch 032: 1023 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=548474, ups=1.11, wpb=496314, bsz=16003.6, num_updates=53300, lr=0.000273947, gnorm=0.155, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=49420 epoch 032: 1023 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=548474, ups=1.11, wpb=496314, bsz=16003.6, num_updates=53300, lr=0.000273947, gnorm=0.155, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=49420 epoch 032: 1023 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=548474, ups=1.11, wpb=496314, bsz=16003.6, num_updates=53300, lr=0.000273947, gnorm=0.155, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=49420 epoch 032: 1023 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=548474, ups=1.11, wpb=496314, bsz=16003.6, num_updates=53300, lr=0.000273947, gnorm=0.155, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=49420 epoch 032: 1023 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=548474, ups=1.11, wpb=496314, bsz=16003.6, num_updates=53300, lr=0.000273947, gnorm=0.155, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=49420 epoch 032: 1023 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=548474, ups=1.11, wpb=496314, bsz=16003.6, num_updates=53300, lr=0.000273947, gnorm=0.155, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=49420 epoch 032: 1023 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=548474, ups=1.11, wpb=496314, bsz=16003.6, num_updates=53300, lr=0.000273947, gnorm=0.155, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=49420 epoch 032: 1023 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=548474, ups=1.11, wpb=496314, bsz=16003.6, num_updates=53300, lr=0.000273947, gnorm=0.155, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=49420 epoch 032: 1023 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=548474, ups=1.11, wpb=496314, bsz=16003.6, num_updates=53300, lr=0.000273947, gnorm=0.155, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=49420 epoch 032: 1023 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=548474, ups=1.11, wpb=496314, bsz=16003.6, num_updates=53300, lr=0.000273947, gnorm=0.155, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=49420 epoch 032: 1023 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=548474, ups=1.11, wpb=496314, bsz=16003.6, num_updates=53300, lr=0.000273947, gnorm=0.155, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=49420 epoch 032: 1023 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=548474, ups=1.11, wpb=496314, bsz=16003.6, num_updates=53300, lr=0.000273947, gnorm=0.155, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=49420 epoch 032: 1023 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=548474, ups=1.11, wpb=496314, bsz=16003.6, num_updates=53300, lr=0.000273947, gnorm=0.155, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=49420 epoch 032: 1023 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=548474, ups=1.11, wpb=496314, bsz=16003.6, num_updates=53300, lr=0.000273947, gnorm=0.155, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=49420 epoch 032: 1023 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=548474, ups=1.11, wpb=496314, bsz=16003.6, num_updates=53300, lr=0.000273947, gnorm=0.155, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=49420 epoch 032: 1023 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=548474, ups=1.11, wpb=496314, bsz=16003.6, num_updates=53300, lr=0.000273947, gnorm=0.155, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=49420 epoch 032: 1023 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=548474, ups=1.11, wpb=496314, bsz=16003.6, num_updates=53300, lr=0.000273947, gnorm=0.155, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=49420 epoch 032: 1023 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=548474, ups=1.11, wpb=496314, bsz=16003.6, num_updates=53300, lr=0.000273947, gnorm=0.155, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=49420 epoch 032: 1023 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=548474, ups=1.11, wpb=496314, bsz=16003.6, num_updates=53300, lr=0.000273947, gnorm=0.155, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=49420 epoch 032: 1023 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=548474, ups=1.11, wpb=496314, bsz=16003.6, num_updates=53300, lr=0.000273947, gnorm=0.155, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=49420 epoch 032: 1023 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=548474, ups=1.11, wpb=496314, bsz=16003.6, num_updates=53300, lr=0.000273947, gnorm=0.155, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=49420 epoch 032: 1023 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=548474, ups=1.11, wpb=496314, bsz=16003.6, num_updates=53300, lr=0.000273947, gnorm=0.155, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=49420 epoch 032: 1023 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=548474, ups=1.11, wpb=496314, bsz=16003.6, num_updates=53300, lr=0.000273947, gnorm=0.155, clip=0, loss_scale=0.5, train_wall=89, gb_free=22.1, wall=49420 epoch 032: 1123 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=548958, ups=1.11, wpb=494835, bsz=16510.5, num_updates=53400, lr=0.00027369, gnorm=0.164, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.8, wall=49511 epoch 032: 1123 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=548958, ups=1.11, wpb=494835, bsz=16510.5, num_updates=53400, lr=0.00027369, gnorm=0.164, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.8, wall=49511 epoch 032: 1123 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=548958, ups=1.11, wpb=494835, bsz=16510.5, num_updates=53400, lr=0.00027369, gnorm=0.164, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.8, wall=49511 epoch 032: 1123 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=548958, ups=1.11, wpb=494835, bsz=16510.5, num_updates=53400, lr=0.00027369, gnorm=0.164, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.8, wall=49511 epoch 032: 1123 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=548958, ups=1.11, wpb=494835, bsz=16510.5, num_updates=53400, lr=0.00027369, gnorm=0.164, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.8, wall=49511 epoch 032: 1123 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=548958, ups=1.11, wpb=494835, bsz=16510.5, num_updates=53400, lr=0.00027369, gnorm=0.164, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.8, wall=49511 epoch 032: 1123 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=548958, ups=1.11, wpb=494835, bsz=16510.5, num_updates=53400, lr=0.00027369, gnorm=0.164, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.8, wall=49511 epoch 032: 1123 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=548958, ups=1.11, wpb=494835, bsz=16510.5, num_updates=53400, lr=0.00027369, gnorm=0.164, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.8, wall=49511 epoch 032: 1123 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=548958, ups=1.11, wpb=494835, bsz=16510.5, num_updates=53400, lr=0.00027369, gnorm=0.164, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.8, wall=49511 epoch 032: 1123 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=548958, ups=1.11, wpb=494835, bsz=16510.5, num_updates=53400, lr=0.00027369, gnorm=0.164, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.8, wall=49511 epoch 032: 1123 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=548958, ups=1.11, wpb=494835, bsz=16510.5, num_updates=53400, lr=0.00027369, gnorm=0.164, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.8, wall=49511 epoch 032: 1123 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=548958, ups=1.11, wpb=494835, bsz=16510.5, num_updates=53400, lr=0.00027369, gnorm=0.164, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.8, wall=49511 epoch 032: 1123 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=548958, ups=1.11, wpb=494835, bsz=16510.5, num_updates=53400, lr=0.00027369, gnorm=0.164, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.8, wall=49511 epoch 032: 1123 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=548958, ups=1.11, wpb=494835, bsz=16510.5, num_updates=53400, lr=0.00027369, gnorm=0.164, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.8, wall=49511 epoch 032: 1123 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=548958, ups=1.11, wpb=494835, bsz=16510.5, num_updates=53400, lr=0.00027369, gnorm=0.164, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.8, wall=49511 epoch 032: 1123 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=548958, ups=1.11, wpb=494835, bsz=16510.5, num_updates=53400, lr=0.00027369, gnorm=0.164, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.8, wall=49511 epoch 032: 1123 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=548958, ups=1.11, wpb=494835, bsz=16510.5, num_updates=53400, lr=0.00027369, gnorm=0.164, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.8, wall=49511 epoch 032: 1123 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=548958, ups=1.11, wpb=494835, bsz=16510.5, num_updates=53400, lr=0.00027369, gnorm=0.164, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.8, wall=49511 epoch 032: 1123 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=548958, ups=1.11, wpb=494835, bsz=16510.5, num_updates=53400, lr=0.00027369, gnorm=0.164, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.8, wall=49511 epoch 032: 1123 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=548958, ups=1.11, wpb=494835, bsz=16510.5, num_updates=53400, lr=0.00027369, gnorm=0.164, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.8, wall=49511 epoch 032: 1123 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=548958, ups=1.11, wpb=494835, bsz=16510.5, num_updates=53400, lr=0.00027369, gnorm=0.164, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.8, wall=49511 epoch 032: 1123 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=548958, ups=1.11, wpb=494835, bsz=16510.5, num_updates=53400, lr=0.00027369, gnorm=0.164, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.8, wall=49511 epoch 032: 1123 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=548958, ups=1.11, wpb=494835, bsz=16510.5, num_updates=53400, lr=0.00027369, gnorm=0.164, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.8, wall=49511 epoch 032: 1123 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=548958, ups=1.11, wpb=494835, bsz=16510.5, num_updates=53400, lr=0.00027369, gnorm=0.164, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.8, wall=49511 epoch 032: 1123 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=548958, ups=1.11, wpb=494835, bsz=16510.5, num_updates=53400, lr=0.00027369, gnorm=0.164, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.8, wall=49511 epoch 032: 1123 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=548958, ups=1.11, wpb=494835, bsz=16510.5, num_updates=53400, lr=0.00027369, gnorm=0.164, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.8, wall=49511 epoch 032: 1123 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=548958, ups=1.11, wpb=494835, bsz=16510.5, num_updates=53400, lr=0.00027369, gnorm=0.164, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.8, wall=49511 epoch 032: 1123 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=548958, ups=1.11, wpb=494835, bsz=16510.5, num_updates=53400, lr=0.00027369, gnorm=0.164, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.8, wall=49511 epoch 032: 1123 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=548958, ups=1.11, wpb=494835, bsz=16510.5, num_updates=53400, lr=0.00027369, gnorm=0.164, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.8, wall=49511 epoch 032: 1123 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=548958, ups=1.11, wpb=494835, bsz=16510.5, num_updates=53400, lr=0.00027369, gnorm=0.164, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.8, wall=49511 epoch 032: 1123 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=548958, ups=1.11, wpb=494835, bsz=16510.5, num_updates=53400, lr=0.00027369, gnorm=0.164, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.8, wall=49511 epoch 032: 1123 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=548958, ups=1.11, wpb=494835, bsz=16510.5, num_updates=53400, lr=0.00027369, gnorm=0.164, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.8, wall=49511 epoch 032: 1223 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=550476, ups=1.11, wpb=494484, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.174, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.6, wall=49600 epoch 032: 1223 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=550476, ups=1.11, wpb=494484, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.174, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.6, wall=49600 epoch 032: 1223 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=550476, ups=1.11, wpb=494484, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.174, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.6, wall=49600 epoch 032: 1223 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=550476, ups=1.11, wpb=494484, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.174, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.6, wall=49600 epoch 032: 1223 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=550476, ups=1.11, wpb=494484, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.174, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.6, wall=49600 epoch 032: 1223 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=550476, ups=1.11, wpb=494484, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.174, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.6, wall=49600 epoch 032: 1223 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=550476, ups=1.11, wpb=494484, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.174, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.6, wall=49600 epoch 032: 1223 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=550476, ups=1.11, wpb=494484, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.174, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.6, wall=49600 epoch 032: 1223 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=550476, ups=1.11, wpb=494484, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.174, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.6, wall=49600 epoch 032: 1223 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=550476, ups=1.11, wpb=494484, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.174, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.6, wall=49600 epoch 032: 1223 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=550476, ups=1.11, wpb=494484, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.174, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.6, wall=49600 epoch 032: 1223 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=550476, ups=1.11, wpb=494484, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.174, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.6, wall=49600 epoch 032: 1223 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=550476, ups=1.11, wpb=494484, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.174, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.6, wall=49600 epoch 032: 1223 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=550476, ups=1.11, wpb=494484, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.174, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.6, wall=49600 epoch 032: 1223 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=550476, ups=1.11, wpb=494484, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.174, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.6, wall=49600 epoch 032: 1223 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=550476, ups=1.11, wpb=494484, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.174, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.6, wall=49600 epoch 032: 1223 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=550476, ups=1.11, wpb=494484, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.174, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.6, wall=49600 epoch 032: 1223 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=550476, ups=1.11, wpb=494484, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.174, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.6, wall=49600 epoch 032: 1223 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=550476, ups=1.11, wpb=494484, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.174, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.6, wall=49600 epoch 032: 1223 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=550476, ups=1.11, wpb=494484, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.174, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.6, wall=49600 epoch 032: 1223 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=550476, ups=1.11, wpb=494484, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.174, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.6, wall=49600 epoch 032: 1223 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=550476, ups=1.11, wpb=494484, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.174, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.6, wall=49600 epoch 032: 1223 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=550476, ups=1.11, wpb=494484, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.174, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.6, wall=49600 epoch 032: 1223 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=550476, ups=1.11, wpb=494484, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.174, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.6, wall=49600 epoch 032: 1223 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=550476, ups=1.11, wpb=494484, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.174, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.6, wall=49600 epoch 032: 1223 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=550476, ups=1.11, wpb=494484, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.174, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.6, wall=49600 epoch 032: 1223 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=550476, ups=1.11, wpb=494484, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.174, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.6, wall=49600 epoch 032: 1223 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=550476, ups=1.11, wpb=494484, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.174, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.6, wall=49600 epoch 032: 1223 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=550476, ups=1.11, wpb=494484, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.174, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.6, wall=49600 epoch 032: 1223 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=550476, ups=1.11, wpb=494484, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.174, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.6, wall=49600 epoch 032: 1223 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=550476, ups=1.11, wpb=494484, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.174, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.6, wall=49600 epoch 032: 1223 / 1689 loss=3.671, nll_loss=2.135, ppl=4.39, wps=550476, ups=1.11, wpb=494484, bsz=16221.8, num_updates=53500, lr=0.000273434, gnorm=0.174, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.6, wall=49600 epoch 032: 1323 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=551534, ups=1.11, wpb=496174, bsz=16116.5, num_updates=53600, lr=0.000273179, gnorm=0.169, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.7, wall=49690 epoch 032: 1323 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=551534, ups=1.11, wpb=496174, bsz=16116.5, num_updates=53600, lr=0.000273179, gnorm=0.169, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.7, wall=49690 epoch 032: 1323 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=551534, ups=1.11, wpb=496174, bsz=16116.5, num_updates=53600, lr=0.000273179, gnorm=0.169, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.7, wall=49690 epoch 032: 1323 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=551534, ups=1.11, wpb=496174, bsz=16116.5, num_updates=53600, lr=0.000273179, gnorm=0.169, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.7, wall=49690 epoch 032: 1323 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=551534, ups=1.11, wpb=496174, bsz=16116.5, num_updates=53600, lr=0.000273179, gnorm=0.169, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.7, wall=49690 epoch 032: 1323 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=551534, ups=1.11, wpb=496174, bsz=16116.5, num_updates=53600, lr=0.000273179, gnorm=0.169, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.7, wall=49690 epoch 032: 1323 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=551534, ups=1.11, wpb=496174, bsz=16116.5, num_updates=53600, lr=0.000273179, gnorm=0.169, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.7, wall=49690 epoch 032: 1323 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=551534, ups=1.11, wpb=496174, bsz=16116.5, num_updates=53600, lr=0.000273179, gnorm=0.169, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.7, wall=49690 epoch 032: 1323 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=551534, ups=1.11, wpb=496174, bsz=16116.5, num_updates=53600, lr=0.000273179, gnorm=0.169, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.7, wall=49690 epoch 032: 1323 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=551534, ups=1.11, wpb=496174, bsz=16116.5, num_updates=53600, lr=0.000273179, gnorm=0.169, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.7, wall=49690 epoch 032: 1323 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=551534, ups=1.11, wpb=496174, bsz=16116.5, num_updates=53600, lr=0.000273179, gnorm=0.169, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.7, wall=49690 epoch 032: 1323 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=551534, ups=1.11, wpb=496174, bsz=16116.5, num_updates=53600, lr=0.000273179, gnorm=0.169, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.7, wall=49690 epoch 032: 1323 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=551534, ups=1.11, wpb=496174, bsz=16116.5, num_updates=53600, lr=0.000273179, gnorm=0.169, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.7, wall=49690 epoch 032: 1323 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=551534, ups=1.11, wpb=496174, bsz=16116.5, num_updates=53600, lr=0.000273179, gnorm=0.169, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.7, wall=49690 epoch 032: 1323 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=551534, ups=1.11, wpb=496174, bsz=16116.5, num_updates=53600, lr=0.000273179, gnorm=0.169, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.7, wall=49690 epoch 032: 1323 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=551534, ups=1.11, wpb=496174, bsz=16116.5, num_updates=53600, lr=0.000273179, gnorm=0.169, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.7, wall=49690 epoch 032: 1323 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=551534, ups=1.11, wpb=496174, bsz=16116.5, num_updates=53600, lr=0.000273179, gnorm=0.169, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.7, wall=49690 epoch 032: 1323 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=551534, ups=1.11, wpb=496174, bsz=16116.5, num_updates=53600, lr=0.000273179, gnorm=0.169, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.7, wall=49690 epoch 032: 1323 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=551534, ups=1.11, wpb=496174, bsz=16116.5, num_updates=53600, lr=0.000273179, gnorm=0.169, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.7, wall=49690 epoch 032: 1323 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=551534, ups=1.11, wpb=496174, bsz=16116.5, num_updates=53600, lr=0.000273179, gnorm=0.169, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.7, wall=49690 epoch 032: 1323 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=551534, ups=1.11, wpb=496174, bsz=16116.5, num_updates=53600, lr=0.000273179, gnorm=0.169, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.7, wall=49690 epoch 032: 1323 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=551534, ups=1.11, wpb=496174, bsz=16116.5, num_updates=53600, lr=0.000273179, gnorm=0.169, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.7, wall=49690 epoch 032: 1323 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=551534, ups=1.11, wpb=496174, bsz=16116.5, num_updates=53600, lr=0.000273179, gnorm=0.169, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.7, wall=49690 epoch 032: 1323 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=551534, ups=1.11, wpb=496174, bsz=16116.5, num_updates=53600, lr=0.000273179, gnorm=0.169, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.7, wall=49690 epoch 032: 1323 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=551534, ups=1.11, wpb=496174, bsz=16116.5, num_updates=53600, lr=0.000273179, gnorm=0.169, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.7, wall=49690 epoch 032: 1323 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=551534, ups=1.11, wpb=496174, bsz=16116.5, num_updates=53600, lr=0.000273179, gnorm=0.169, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.7, wall=49690 epoch 032: 1323 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=551534, ups=1.11, wpb=496174, bsz=16116.5, num_updates=53600, lr=0.000273179, gnorm=0.169, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.7, wall=49690 epoch 032: 1323 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=551534, ups=1.11, wpb=496174, bsz=16116.5, num_updates=53600, lr=0.000273179, gnorm=0.169, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.7, wall=49690 epoch 032: 1323 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=551534, ups=1.11, wpb=496174, bsz=16116.5, num_updates=53600, lr=0.000273179, gnorm=0.169, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.7, wall=49690 epoch 032: 1323 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=551534, ups=1.11, wpb=496174, bsz=16116.5, num_updates=53600, lr=0.000273179, gnorm=0.169, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.7, wall=49690 epoch 032: 1323 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=551534, ups=1.11, wpb=496174, bsz=16116.5, num_updates=53600, lr=0.000273179, gnorm=0.169, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.7, wall=49690 epoch 032: 1323 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=551534, ups=1.11, wpb=496174, bsz=16116.5, num_updates=53600, lr=0.000273179, gnorm=0.169, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.7, wall=49690 epoch 032: 1423 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=550348, ups=1.11, wpb=494462, bsz=16830.5, num_updates=53700, lr=0.000272925, gnorm=0.17, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.5, wall=49780 epoch 032: 1423 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=550348, ups=1.11, wpb=494462, bsz=16830.5, num_updates=53700, lr=0.000272925, gnorm=0.17, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.5, wall=49780 epoch 032: 1423 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=550348, ups=1.11, wpb=494462, bsz=16830.5, num_updates=53700, lr=0.000272925, gnorm=0.17, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.5, wall=49780 epoch 032: 1423 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=550348, ups=1.11, wpb=494462, bsz=16830.5, num_updates=53700, lr=0.000272925, gnorm=0.17, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.5, wall=49780 epoch 032: 1423 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=550348, ups=1.11, wpb=494462, bsz=16830.5, num_updates=53700, lr=0.000272925, gnorm=0.17, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.5, wall=49780 epoch 032: 1423 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=550348, ups=1.11, wpb=494462, bsz=16830.5, num_updates=53700, lr=0.000272925, gnorm=0.17, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.5, wall=49780 epoch 032: 1423 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=550348, ups=1.11, wpb=494462, bsz=16830.5, num_updates=53700, lr=0.000272925, gnorm=0.17, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.5, wall=49780 epoch 032: 1423 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=550348, ups=1.11, wpb=494462, bsz=16830.5, num_updates=53700, lr=0.000272925, gnorm=0.17, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.5, wall=49780 epoch 032: 1423 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=550348, ups=1.11, wpb=494462, bsz=16830.5, num_updates=53700, lr=0.000272925, gnorm=0.17, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.5, wall=49780 epoch 032: 1423 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=550348, ups=1.11, wpb=494462, bsz=16830.5, num_updates=53700, lr=0.000272925, gnorm=0.17, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.5, wall=49780 epoch 032: 1423 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=550348, ups=1.11, wpb=494462, bsz=16830.5, num_updates=53700, lr=0.000272925, gnorm=0.17, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.5, wall=49780 epoch 032: 1423 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=550348, ups=1.11, wpb=494462, bsz=16830.5, num_updates=53700, lr=0.000272925, gnorm=0.17, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.5, wall=49780 epoch 032: 1423 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=550348, ups=1.11, wpb=494462, bsz=16830.5, num_updates=53700, lr=0.000272925, gnorm=0.17, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.5, wall=49780 epoch 032: 1423 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=550348, ups=1.11, wpb=494462, bsz=16830.5, num_updates=53700, lr=0.000272925, gnorm=0.17, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.5, wall=49780 epoch 032: 1423 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=550348, ups=1.11, wpb=494462, bsz=16830.5, num_updates=53700, lr=0.000272925, gnorm=0.17, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.5, wall=49780 epoch 032: 1423 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=550348, ups=1.11, wpb=494462, bsz=16830.5, num_updates=53700, lr=0.000272925, gnorm=0.17, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.5, wall=49780 epoch 032: 1423 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=550348, ups=1.11, wpb=494462, bsz=16830.5, num_updates=53700, lr=0.000272925, gnorm=0.17, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.5, wall=49780 epoch 032: 1423 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=550348, ups=1.11, wpb=494462, bsz=16830.5, num_updates=53700, lr=0.000272925, gnorm=0.17, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.5, wall=49780 epoch 032: 1423 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=550348, ups=1.11, wpb=494462, bsz=16830.5, num_updates=53700, lr=0.000272925, gnorm=0.17, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.5, wall=49780 epoch 032: 1423 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=550348, ups=1.11, wpb=494462, bsz=16830.5, num_updates=53700, lr=0.000272925, gnorm=0.17, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.5, wall=49780 epoch 032: 1423 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=550348, ups=1.11, wpb=494462, bsz=16830.5, num_updates=53700, lr=0.000272925, gnorm=0.17, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.5, wall=49780 epoch 032: 1423 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=550348, ups=1.11, wpb=494462, bsz=16830.5, num_updates=53700, lr=0.000272925, gnorm=0.17, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.5, wall=49780 epoch 032: 1423 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=550348, ups=1.11, wpb=494462, bsz=16830.5, num_updates=53700, lr=0.000272925, gnorm=0.17, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.5, wall=49780 epoch 032: 1423 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=550348, ups=1.11, wpb=494462, bsz=16830.5, num_updates=53700, lr=0.000272925, gnorm=0.17, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.5, wall=49780 epoch 032: 1423 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=550348, ups=1.11, wpb=494462, bsz=16830.5, num_updates=53700, lr=0.000272925, gnorm=0.17, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.5, wall=49780 epoch 032: 1423 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=550348, ups=1.11, wpb=494462, bsz=16830.5, num_updates=53700, lr=0.000272925, gnorm=0.17, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.5, wall=49780 epoch 032: 1423 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=550348, ups=1.11, wpb=494462, bsz=16830.5, num_updates=53700, lr=0.000272925, gnorm=0.17, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.5, wall=49780 epoch 032: 1423 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=550348, ups=1.11, wpb=494462, bsz=16830.5, num_updates=53700, lr=0.000272925, gnorm=0.17, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.5, wall=49780 epoch 032: 1423 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=550348, ups=1.11, wpb=494462, bsz=16830.5, num_updates=53700, lr=0.000272925, gnorm=0.17, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.5, wall=49780 epoch 032: 1423 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=550348, ups=1.11, wpb=494462, bsz=16830.5, num_updates=53700, lr=0.000272925, gnorm=0.17, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.5, wall=49780 epoch 032: 1423 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=550348, ups=1.11, wpb=494462, bsz=16830.5, num_updates=53700, lr=0.000272925, gnorm=0.17, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.5, wall=49780 epoch 032: 1423 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=550348, ups=1.11, wpb=494462, bsz=16830.5, num_updates=53700, lr=0.000272925, gnorm=0.17, clip=0, loss_scale=0.5, train_wall=88, gb_free=20.5, wall=49780 epoch 032: 1523 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=552475, ups=1.11, wpb=496851, bsz=16677.4, num_updates=53800, lr=0.000272671, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22.8, wall=49870 epoch 032: 1523 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=552475, ups=1.11, wpb=496851, bsz=16677.4, num_updates=53800, lr=0.000272671, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22.8, wall=49870 epoch 032: 1523 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=552475, ups=1.11, wpb=496851, bsz=16677.4, num_updates=53800, lr=0.000272671, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22.8, wall=49870 epoch 032: 1523 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=552475, ups=1.11, wpb=496851, bsz=16677.4, num_updates=53800, lr=0.000272671, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22.8, wall=49870 epoch 032: 1523 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=552475, ups=1.11, wpb=496851, bsz=16677.4, num_updates=53800, lr=0.000272671, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22.8, wall=49870 epoch 032: 1523 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=552475, ups=1.11, wpb=496851, bsz=16677.4, num_updates=53800, lr=0.000272671, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22.8, wall=49870 epoch 032: 1523 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=552475, ups=1.11, wpb=496851, bsz=16677.4, num_updates=53800, lr=0.000272671, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22.8, wall=49870 epoch 032: 1523 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=552475, ups=1.11, wpb=496851, bsz=16677.4, num_updates=53800, lr=0.000272671, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22.8, wall=49870 epoch 032: 1523 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=552475, ups=1.11, wpb=496851, bsz=16677.4, num_updates=53800, lr=0.000272671, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22.8, wall=49870 epoch 032: 1523 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=552475, ups=1.11, wpb=496851, bsz=16677.4, num_updates=53800, lr=0.000272671, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22.8, wall=49870 epoch 032: 1523 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=552475, ups=1.11, wpb=496851, bsz=16677.4, num_updates=53800, lr=0.000272671, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22.8, wall=49870 epoch 032: 1523 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=552475, ups=1.11, wpb=496851, bsz=16677.4, num_updates=53800, lr=0.000272671, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22.8, wall=49870 epoch 032: 1523 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=552475, ups=1.11, wpb=496851, bsz=16677.4, num_updates=53800, lr=0.000272671, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22.8, wall=49870 epoch 032: 1523 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=552475, ups=1.11, wpb=496851, bsz=16677.4, num_updates=53800, lr=0.000272671, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22.8, wall=49870 epoch 032: 1523 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=552475, ups=1.11, wpb=496851, bsz=16677.4, num_updates=53800, lr=0.000272671, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22.8, wall=49870 epoch 032: 1523 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=552475, ups=1.11, wpb=496851, bsz=16677.4, num_updates=53800, lr=0.000272671, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22.8, wall=49870 epoch 032: 1523 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=552475, ups=1.11, wpb=496851, bsz=16677.4, num_updates=53800, lr=0.000272671, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22.8, wall=49870 epoch 032: 1523 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=552475, ups=1.11, wpb=496851, bsz=16677.4, num_updates=53800, lr=0.000272671, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22.8, wall=49870 epoch 032: 1523 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=552475, ups=1.11, wpb=496851, bsz=16677.4, num_updates=53800, lr=0.000272671, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22.8, wall=49870 epoch 032: 1523 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=552475, ups=1.11, wpb=496851, bsz=16677.4, num_updates=53800, lr=0.000272671, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22.8, wall=49870 epoch 032: 1523 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=552475, ups=1.11, wpb=496851, bsz=16677.4, num_updates=53800, lr=0.000272671, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22.8, wall=49870 epoch 032: 1523 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=552475, ups=1.11, wpb=496851, bsz=16677.4, num_updates=53800, lr=0.000272671, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22.8, wall=49870 epoch 032: 1523 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=552475, ups=1.11, wpb=496851, bsz=16677.4, num_updates=53800, lr=0.000272671, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22.8, wall=49870 epoch 032: 1523 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=552475, ups=1.11, wpb=496851, bsz=16677.4, num_updates=53800, lr=0.000272671, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22.8, wall=49870 epoch 032: 1523 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=552475, ups=1.11, wpb=496851, bsz=16677.4, num_updates=53800, lr=0.000272671, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22.8, wall=49870 epoch 032: 1523 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=552475, ups=1.11, wpb=496851, bsz=16677.4, num_updates=53800, lr=0.000272671, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22.8, wall=49870 epoch 032: 1523 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=552475, ups=1.11, wpb=496851, bsz=16677.4, num_updates=53800, lr=0.000272671, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22.8, wall=49870 epoch 032: 1523 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=552475, ups=1.11, wpb=496851, bsz=16677.4, num_updates=53800, lr=0.000272671, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22.8, wall=49870 epoch 032: 1523 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=552475, ups=1.11, wpb=496851, bsz=16677.4, num_updates=53800, lr=0.000272671, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22.8, wall=49870 epoch 032: 1523 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=552475, ups=1.11, wpb=496851, bsz=16677.4, num_updates=53800, lr=0.000272671, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22.8, wall=49870 epoch 032: 1523 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=552475, ups=1.11, wpb=496851, bsz=16677.4, num_updates=53800, lr=0.000272671, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22.8, wall=49870 epoch 032: 1523 / 1689 loss=3.67, nll_loss=2.133, ppl=4.39, wps=552475, ups=1.11, wpb=496851, bsz=16677.4, num_updates=53800, lr=0.000272671, gnorm=0.172, clip=0, loss_scale=1, train_wall=88, gb_free=22.8, wall=49870 epoch 032: 1623 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=547567, ups=1.11, wpb=494392, bsz=16443, num_updates=53900, lr=0.000272418, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=49960 epoch 032: 1623 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=547567, ups=1.11, wpb=494392, bsz=16443, num_updates=53900, lr=0.000272418, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=49960 epoch 032: 1623 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=547567, ups=1.11, wpb=494392, bsz=16443, num_updates=53900, lr=0.000272418, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=49960 epoch 032: 1623 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=547567, ups=1.11, wpb=494392, bsz=16443, num_updates=53900, lr=0.000272418, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=49960 epoch 032: 1623 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=547567, ups=1.11, wpb=494392, bsz=16443, num_updates=53900, lr=0.000272418, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=49960 epoch 032: 1623 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=547567, ups=1.11, wpb=494392, bsz=16443, num_updates=53900, lr=0.000272418, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=49960 epoch 032: 1623 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=547567, ups=1.11, wpb=494392, bsz=16443, num_updates=53900, lr=0.000272418, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=49960 epoch 032: 1623 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=547567, ups=1.11, wpb=494392, bsz=16443, num_updates=53900, lr=0.000272418, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=49960 epoch 032: 1623 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=547567, ups=1.11, wpb=494392, bsz=16443, num_updates=53900, lr=0.000272418, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=49960 epoch 032: 1623 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=547567, ups=1.11, wpb=494392, bsz=16443, num_updates=53900, lr=0.000272418, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=49960 epoch 032: 1623 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=547567, ups=1.11, wpb=494392, bsz=16443, num_updates=53900, lr=0.000272418, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=49960 epoch 032: 1623 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=547567, ups=1.11, wpb=494392, bsz=16443, num_updates=53900, lr=0.000272418, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=49960 epoch 032: 1623 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=547567, ups=1.11, wpb=494392, bsz=16443, num_updates=53900, lr=0.000272418, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=49960 epoch 032: 1623 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=547567, ups=1.11, wpb=494392, bsz=16443, num_updates=53900, lr=0.000272418, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=49960 epoch 032: 1623 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=547567, ups=1.11, wpb=494392, bsz=16443, num_updates=53900, lr=0.000272418, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=49960 epoch 032: 1623 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=547567, ups=1.11, wpb=494392, bsz=16443, num_updates=53900, lr=0.000272418, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=49960 epoch 032: 1623 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=547567, ups=1.11, wpb=494392, bsz=16443, num_updates=53900, lr=0.000272418, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=49960 epoch 032: 1623 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=547567, ups=1.11, wpb=494392, bsz=16443, num_updates=53900, lr=0.000272418, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=49960 epoch 032: 1623 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=547567, ups=1.11, wpb=494392, bsz=16443, num_updates=53900, lr=0.000272418, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=49960 epoch 032: 1623 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=547567, ups=1.11, wpb=494392, bsz=16443, num_updates=53900, lr=0.000272418, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=49960 epoch 032: 1623 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=547567, ups=1.11, wpb=494392, bsz=16443, num_updates=53900, lr=0.000272418, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=49960 epoch 032: 1623 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=547567, ups=1.11, wpb=494392, bsz=16443, num_updates=53900, lr=0.000272418, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=49960 epoch 032: 1623 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=547567, ups=1.11, wpb=494392, bsz=16443, num_updates=53900, lr=0.000272418, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=49960 epoch 032: 1623 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=547567, ups=1.11, wpb=494392, bsz=16443, num_updates=53900, lr=0.000272418, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=49960 epoch 032: 1623 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=547567, ups=1.11, wpb=494392, bsz=16443, num_updates=53900, lr=0.000272418, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=49960 epoch 032: 1623 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=547567, ups=1.11, wpb=494392, bsz=16443, num_updates=53900, lr=0.000272418, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=49960 epoch 032: 1623 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=547567, ups=1.11, wpb=494392, bsz=16443, num_updates=53900, lr=0.000272418, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=49960 epoch 032: 1623 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=547567, ups=1.11, wpb=494392, bsz=16443, num_updates=53900, lr=0.000272418, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=49960 epoch 032: 1623 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=547567, ups=1.11, wpb=494392, bsz=16443, num_updates=53900, lr=0.000272418, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=49960 epoch 032: 1623 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=547567, ups=1.11, wpb=494392, bsz=16443, num_updates=53900, lr=0.000272418, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=49960 epoch 032: 1623 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=547567, ups=1.11, wpb=494392, bsz=16443, num_updates=53900, lr=0.000272418, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=49960 epoch 032: 1623 / 1689 loss=3.673, nll_loss=2.137, ppl=4.4, wps=547567, ups=1.11, wpb=494392, bsz=16443, num_updates=53900, lr=0.000272418, gnorm=0.161, clip=0, loss_scale=1, train_wall=88, gb_free=22.2, wall=49960 end of epoch 32 (average epoch stats below) epoch 032 | loss 3.667 | nll_loss 2.129 | ppl 4.37 | wps 541573 | ups 1.09 | wpb 495134 | bsz 16502.5 | num_updates 53966 | lr 0.000272251 | gnorm 0.166 | clip 0 | loss_scale 1 | train_wall 1493 | gb_free 22.5 | wall 50019 epoch 032 | loss 3.667 | nll_loss 2.129 | ppl 4.37 | wps 541573 | ups 1.09 | wpb 495134 | bsz 16502.5 | num_updates 53966 | lr 0.000272251 | gnorm 0.166 | clip 0 | loss_scale 1 | train_wall 1493 | gb_free 22.5 | wall 50019 epoch 032 | loss 3.667 | nll_loss 2.129 | ppl 4.37 | wps 541573 | ups 1.09 | wpb 495134 | bsz 16502.5 | num_updates 53966 | lr 0.000272251 | gnorm 0.166 | clip 0 | loss_scale 1 | train_wall 1493 | gb_free 22.5 | wall 50019 epoch 032 | loss 3.667 | nll_loss 2.129 | ppl 4.37 | wps 541573 | ups 1.09 | wpb 495134 | bsz 16502.5 | num_updates 53966 | lr 0.000272251 | gnorm 0.166 | clip 0 | loss_scale 1 | train_wall 1493 | gb_free 22.5 | wall 50019 epoch 032 | loss 3.667 | nll_loss 2.129 | ppl 4.37 | wps 541573 | ups 1.09 | wpb 495134 | bsz 16502.5 | num_updates 53966 | lr 0.000272251 | gnorm 0.166 | clip 0 | loss_scale 1 | train_wall 1493 | gb_free 22.5 | wall 50019 epoch 032 | loss 3.667 | nll_loss 2.129 | ppl 4.37 | wps 541573 | ups 1.09 | wpb 495134 | bsz 16502.5 | num_updates 53966 | lr 0.000272251 | gnorm 0.166 | clip 0 | loss_scale 1 | train_wall 1493 | gb_free 22.5 | wall 50019 epoch 032 | loss 3.667 | nll_loss 2.129 | ppl 4.37 | wps 541573 | ups 1.09 | wpb 495134 | bsz 16502.5 | num_updates 53966 | lr 0.000272251 | gnorm 0.166 | clip 0 | loss_scale 1 | train_wall 1493 | gb_free 22.5 | wall 50019 epoch 032 | loss 3.667 | nll_loss 2.129 | ppl 4.37 | wps 541573 | ups 1.09 | wpb 495134 | bsz 16502.5 | num_updates 53966 | lr 0.000272251 | gnorm 0.166 | clip 0 | loss_scale 1 | train_wall 1493 | gb_free 22.5 | wall 50019 epoch 032 | loss 3.667 | nll_loss 2.129 | ppl 4.37 | wps 541573 | ups 1.09 | wpb 495134 | bsz 16502.5 | num_updates 53966 | lr 0.000272251 | gnorm 0.166 | clip 0 | loss_scale 1 | train_wall 1493 | gb_free 22.5 | wall 50019 epoch 032 | loss 3.667 | nll_loss 2.129 | ppl 4.37 | wps 541573 | ups 1.09 | wpb 495134 | bsz 16502.5 | num_updates 53966 | lr 0.000272251 | gnorm 0.166 | clip 0 | loss_scale 1 | train_wall 1493 | gb_free 22.5 | wall 50019 epoch 032 | loss 3.667 | nll_loss 2.129 | ppl 4.37 | wps 541573 | ups 1.09 | wpb 495134 | bsz 16502.5 | num_updates 53966 | lr 0.000272251 | gnorm 0.166 | clip 0 | loss_scale 1 | train_wall 1493 | gb_free 22.5 | wall 50019 epoch 032 | loss 3.667 | nll_loss 2.129 | ppl 4.37 | wps 541573 | ups 1.09 | wpb 495134 | bsz 16502.5 | num_updates 53966 | lr 0.000272251 | gnorm 0.166 | clip 0 | loss_scale 1 | train_wall 1493 | gb_free 22.5 | wall 50019 epoch 032 | loss 3.667 | nll_loss 2.129 | ppl 4.37 | wps 541573 | ups 1.09 | wpb 495134 | bsz 16502.5 | num_updates 53966 | lr 0.000272251 | gnorm 0.166 | clip 0 | loss_scale 1 | train_wall 1493 | gb_free 22.5 | wall 50019 epoch 032 | loss 3.667 | nll_loss 2.129 | ppl 4.37 | wps 541573 | ups 1.09 | wpb 495134 | bsz 16502.5 | num_updates 53966 | lr 0.000272251 | gnorm 0.166 | clip 0 | loss_scale 1 | train_wall 1493 | gb_free 22.5 | wall 50019 epoch 032 | loss 3.667 | nll_loss 2.129 | ppl 4.37 | wps 541573 | ups 1.09 | wpb 495134 | bsz 16502.5 | num_updates 53966 | lr 0.000272251 | gnorm 0.166 | clip 0 | loss_scale 1 | train_wall 1493 | gb_free 22.5 | wall 50019 epoch 032 | loss 3.667 | nll_loss 2.129 | ppl 4.37 | wps 541573 | ups 1.09 | wpb 495134 | bsz 16502.5 | num_updates 53966 | lr 0.000272251 | gnorm 0.166 | clip 0 | loss_scale 1 | train_wall 1493 | gb_free 22.5 | wall 50019 epoch 032 | loss 3.667 | nll_loss 2.129 | ppl 4.37 | wps 541573 | ups 1.09 | wpb 495134 | bsz 16502.5 | num_updates 53966 | lr 0.000272251 | gnorm 0.166 | clip 0 | loss_scale 1 | train_wall 1493 | gb_free 22.5 | wall 50019 epoch 032 | loss 3.667 | nll_loss 2.129 | ppl 4.37 | wps 541573 | ups 1.09 | wpb 495134 | bsz 16502.5 | num_updates 53966 | lr 0.000272251 | gnorm 0.166 | clip 0 | loss_scale 1 | train_wall 1493 | gb_free 22.5 | wall 50019 epoch 032 | loss 3.667 | nll_loss 2.129 | ppl 4.37 | wps 541573 | ups 1.09 | wpb 495134 | bsz 16502.5 | num_updates 53966 | lr 0.000272251 | gnorm 0.166 | clip 0 | loss_scale 1 | train_wall 1493 | gb_free 22.5 | wall 50019 epoch 032 | loss 3.667 | nll_loss 2.129 | ppl 4.37 | wps 541573 | ups 1.09 | wpb 495134 | bsz 16502.5 | num_updates 53966 | lr 0.000272251 | gnorm 0.166 | clip 0 | loss_scale 1 | train_wall 1493 | gb_free 22.5 | wall 50019 epoch 032 | loss 3.667 | nll_loss 2.129 | ppl 4.37 | wps 541573 | ups 1.09 | wpb 495134 | bsz 16502.5 | num_updates 53966 | lr 0.000272251 | gnorm 0.166 | clip 0 | loss_scale 1 | train_wall 1493 | gb_free 22.5 | wall 50019 epoch 032 | loss 3.667 | nll_loss 2.129 | ppl 4.37 | wps 541573 | ups 1.09 | wpb 495134 | bsz 16502.5 | num_updates 53966 | lr 0.000272251 | gnorm 0.166 | clip 0 | loss_scale 1 | train_wall 1493 | gb_free 22.5 | wall 50019 epoch 032 | loss 3.667 | nll_loss 2.129 | ppl 4.37 | wps 541573 | ups 1.09 | wpb 495134 | bsz 16502.5 | num_updates 53966 | lr 0.000272251 | gnorm 0.166 | clip 0 | loss_scale 1 | train_wall 1493 | gb_free 22.5 | wall 50019 epoch 032 | loss 3.667 | nll_loss 2.129 | ppl 4.37 | wps 541573 | ups 1.09 | wpb 495134 | bsz 16502.5 | num_updates 53966 | lr 0.000272251 | gnorm 0.166 | clip 0 | loss_scale 1 | train_wall 1493 | gb_free 22.5 | wall 50019 epoch 032 | loss 3.667 | nll_loss 2.129 | ppl 4.37 | wps 541573 | ups 1.09 | wpb 495134 | bsz 16502.5 | num_updates 53966 | lr 0.000272251 | gnorm 0.166 | clip 0 | loss_scale 1 | train_wall 1493 | gb_free 22.5 | wall 50019 epoch 032 | loss 3.667 | nll_loss 2.129 | ppl 4.37 | wps 541573 | ups 1.09 | wpb 495134 | bsz 16502.5 | num_updates 53966 | lr 0.000272251 | gnorm 0.166 | clip 0 | loss_scale 1 | train_wall 1493 | gb_free 22.5 | wall 50019 epoch 032 | loss 3.667 | nll_loss 2.129 | ppl 4.37 | wps 541573 | ups 1.09 | wpb 495134 | bsz 16502.5 | num_updates 53966 | lr 0.000272251 | gnorm 0.166 | clip 0 | loss_scale 1 | train_wall 1493 | gb_free 22.5 | wall 50019 epoch 032 | loss 3.667 | nll_loss 2.129 | ppl 4.37 | wps 541573 | ups 1.09 | wpb 495134 | bsz 16502.5 | num_updates 53966 | lr 0.000272251 | gnorm 0.166 | clip 0 | loss_scale 1 | train_wall 1493 | gb_free 22.5 | wall 50019 epoch 032 | loss 3.667 | nll_loss 2.129 | ppl 4.37 | wps 541573 | ups 1.09 | wpb 495134 | bsz 16502.5 | num_updates 53966 | lr 0.000272251 | gnorm 0.166 | clip 0 | loss_scale 1 | train_wall 1493 | gb_free 22.5 | wall 50019 epoch 032 | loss 3.667 | nll_loss 2.129 | ppl 4.37 | wps 541573 | ups 1.09 | wpb 495134 | bsz 16502.5 | num_updates 53966 | lr 0.000272251 | gnorm 0.166 | clip 0 | loss_scale 1 | train_wall 1493 | gb_free 22.5 | wall 50019 epoch 032 | loss 3.667 | nll_loss 2.129 | ppl 4.37 | wps 541573 | ups 1.09 | wpb 495134 | bsz 16502.5 | num_updates 53966 | lr 0.000272251 | gnorm 0.166 | clip 0 | loss_scale 1 | train_wall 1493 | gb_free 22.5 | wall 50019 epoch 032 | loss 3.667 | nll_loss 2.129 | ppl 4.37 | wps 541573 | ups 1.09 | wpb 495134 | bsz 16502.5 | num_updates 53966 | lr 0.000272251 | gnorm 0.166 | clip 0 | loss_scale 1 | train_wall 1493 | gb_free 22.5 | wall 50019 Start iterating over samples epoch 033: 34 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=385424, ups=0.78, wpb=491936, bsz=16573, num_updates=54000, lr=0.000272166, gnorm=0.162, clip=0, loss_scale=1, train_wall=99, gb_free=21.4, wall=50088 epoch 033: 34 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=385424, ups=0.78, wpb=491936, bsz=16573, num_updates=54000, lr=0.000272166, gnorm=0.162, clip=0, loss_scale=1, train_wall=99, gb_free=21.4, wall=50088 epoch 033: 34 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=385424, ups=0.78, wpb=491936, bsz=16573, num_updates=54000, lr=0.000272166, gnorm=0.162, clip=0, loss_scale=1, train_wall=99, gb_free=21.4, wall=50088 epoch 033: 34 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=385424, ups=0.78, wpb=491936, bsz=16573, num_updates=54000, lr=0.000272166, gnorm=0.162, clip=0, loss_scale=1, train_wall=99, gb_free=21.4, wall=50088 epoch 033: 34 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=385424, ups=0.78, wpb=491936, bsz=16573, num_updates=54000, lr=0.000272166, gnorm=0.162, clip=0, loss_scale=1, train_wall=99, gb_free=21.4, wall=50088 epoch 033: 34 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=385424, ups=0.78, wpb=491936, bsz=16573, num_updates=54000, lr=0.000272166, gnorm=0.162, clip=0, loss_scale=1, train_wall=99, gb_free=21.4, wall=50088 epoch 033: 34 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=385424, ups=0.78, wpb=491936, bsz=16573, num_updates=54000, lr=0.000272166, gnorm=0.162, clip=0, loss_scale=1, train_wall=99, gb_free=21.4, wall=50088 epoch 033: 34 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=385424, ups=0.78, wpb=491936, bsz=16573, num_updates=54000, lr=0.000272166, gnorm=0.162, clip=0, loss_scale=1, train_wall=99, gb_free=21.4, wall=50088 epoch 033: 34 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=385424, ups=0.78, wpb=491936, bsz=16573, num_updates=54000, lr=0.000272166, gnorm=0.162, clip=0, loss_scale=1, train_wall=99, gb_free=21.4, wall=50088 epoch 033: 34 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=385424, ups=0.78, wpb=491936, bsz=16573, num_updates=54000, lr=0.000272166, gnorm=0.162, clip=0, loss_scale=1, train_wall=99, gb_free=21.4, wall=50088 epoch 033: 34 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=385424, ups=0.78, wpb=491936, bsz=16573, num_updates=54000, lr=0.000272166, gnorm=0.162, clip=0, loss_scale=1, train_wall=99, gb_free=21.4, wall=50088 epoch 033: 34 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=385424, ups=0.78, wpb=491936, bsz=16573, num_updates=54000, lr=0.000272166, gnorm=0.162, clip=0, loss_scale=1, train_wall=99, gb_free=21.4, wall=50088 epoch 033: 34 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=385424, ups=0.78, wpb=491936, bsz=16573, num_updates=54000, lr=0.000272166, gnorm=0.162, clip=0, loss_scale=1, train_wall=99, gb_free=21.4, wall=50088 epoch 033: 34 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=385424, ups=0.78, wpb=491936, bsz=16573, num_updates=54000, lr=0.000272166, gnorm=0.162, clip=0, loss_scale=1, train_wall=99, gb_free=21.4, wall=50088 epoch 033: 34 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=385424, ups=0.78, wpb=491936, bsz=16573, num_updates=54000, lr=0.000272166, gnorm=0.162, clip=0, loss_scale=1, train_wall=99, gb_free=21.4, wall=50088 epoch 033: 34 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=385424, ups=0.78, wpb=491936, bsz=16573, num_updates=54000, lr=0.000272166, gnorm=0.162, clip=0, loss_scale=1, train_wall=99, gb_free=21.4, wall=50088 epoch 033: 34 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=385424, ups=0.78, wpb=491936, bsz=16573, num_updates=54000, lr=0.000272166, gnorm=0.162, clip=0, loss_scale=1, train_wall=99, gb_free=21.4, wall=50088 epoch 033: 34 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=385424, ups=0.78, wpb=491936, bsz=16573, num_updates=54000, lr=0.000272166, gnorm=0.162, clip=0, loss_scale=1, train_wall=99, gb_free=21.4, wall=50088 epoch 033: 34 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=385424, ups=0.78, wpb=491936, bsz=16573, num_updates=54000, lr=0.000272166, gnorm=0.162, clip=0, loss_scale=1, train_wall=99, gb_free=21.4, wall=50088 epoch 033: 34 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=385424, ups=0.78, wpb=491936, bsz=16573, num_updates=54000, lr=0.000272166, gnorm=0.162, clip=0, loss_scale=1, train_wall=99, gb_free=21.4, wall=50088 epoch 033: 34 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=385424, ups=0.78, wpb=491936, bsz=16573, num_updates=54000, lr=0.000272166, gnorm=0.162, clip=0, loss_scale=1, train_wall=99, gb_free=21.4, wall=50088 epoch 033: 34 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=385424, ups=0.78, wpb=491936, bsz=16573, num_updates=54000, lr=0.000272166, gnorm=0.162, clip=0, loss_scale=1, train_wall=99, gb_free=21.4, wall=50088 epoch 033: 34 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=385424, ups=0.78, wpb=491936, bsz=16573, num_updates=54000, lr=0.000272166, gnorm=0.162, clip=0, loss_scale=1, train_wall=99, gb_free=21.4, wall=50088 epoch 033: 34 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=385424, ups=0.78, wpb=491936, bsz=16573, num_updates=54000, lr=0.000272166, gnorm=0.162, clip=0, loss_scale=1, train_wall=99, gb_free=21.4, wall=50088 epoch 033: 34 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=385424, ups=0.78, wpb=491936, bsz=16573, num_updates=54000, lr=0.000272166, gnorm=0.162, clip=0, loss_scale=1, train_wall=99, gb_free=21.4, wall=50088 epoch 033: 34 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=385424, ups=0.78, wpb=491936, bsz=16573, num_updates=54000, lr=0.000272166, gnorm=0.162, clip=0, loss_scale=1, train_wall=99, gb_free=21.4, wall=50088 epoch 033: 34 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=385424, ups=0.78, wpb=491936, bsz=16573, num_updates=54000, lr=0.000272166, gnorm=0.162, clip=0, loss_scale=1, train_wall=99, gb_free=21.4, wall=50088 epoch 033: 34 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=385424, ups=0.78, wpb=491936, bsz=16573, num_updates=54000, lr=0.000272166, gnorm=0.162, clip=0, loss_scale=1, train_wall=99, gb_free=21.4, wall=50088 epoch 033: 34 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=385424, ups=0.78, wpb=491936, bsz=16573, num_updates=54000, lr=0.000272166, gnorm=0.162, clip=0, loss_scale=1, train_wall=99, gb_free=21.4, wall=50088 epoch 033: 34 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=385424, ups=0.78, wpb=491936, bsz=16573, num_updates=54000, lr=0.000272166, gnorm=0.162, clip=0, loss_scale=1, train_wall=99, gb_free=21.4, wall=50088 epoch 033: 34 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=385424, ups=0.78, wpb=491936, bsz=16573, num_updates=54000, lr=0.000272166, gnorm=0.162, clip=0, loss_scale=1, train_wall=99, gb_free=21.4, wall=50088 epoch 033: 34 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=385424, ups=0.78, wpb=491936, bsz=16573, num_updates=54000, lr=0.000272166, gnorm=0.162, clip=0, loss_scale=1, train_wall=99, gb_free=21.4, wall=50088 epoch 033: 34 / 1689 loss=3.671, nll_loss=2.134, ppl=4.39, wps=385424, ups=0.78, wpb=491936, bsz=16573, num_updates=54000, lr=0.000272166, gnorm=0.162, clip=0, loss_scale=1, train_wall=99, gb_free=21.4, wall=50088 begin validation on "valid" subset epoch 033 | valid on 'valid' subset | loss 3.706 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.701 epoch 033 | valid on 'valid' subset | loss 3.706 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.701 epoch 033 | valid on 'valid' subset | loss 3.706 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.701 epoch 033 | valid on 'valid' subset | loss 3.706 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.701 epoch 033 | valid on 'valid' subset | loss 3.706 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.701 epoch 033 | valid on 'valid' subset | loss 3.706 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.701 epoch 033 | valid on 'valid' subset | loss 3.706 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.701 epoch 033 | valid on 'valid' subset | loss 3.706 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.701 epoch 033 | valid on 'valid' subset | loss 3.706 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.701 epoch 033 | valid on 'valid' subset | loss 3.706 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.701 epoch 033 | valid on 'valid' subset | loss 3.706 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.701 epoch 033 | valid on 'valid' subset | loss 3.706 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.701 epoch 033 | valid on 'valid' subset | loss 3.706 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.701 epoch 033 | valid on 'valid' subset | loss 3.706 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.701 epoch 033 | valid on 'valid' subset | loss 3.706 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.701 epoch 033 | valid on 'valid' subset | loss 3.706 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.701 epoch 033 | valid on 'valid' subset | loss 3.706 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.701 epoch 033 | valid on 'valid' subset | loss 3.706 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.701 epoch 033 | valid on 'valid' subset | loss 3.706 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.701 epoch 033 | valid on 'valid' subset | loss 3.706 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.701 epoch 033 | valid on 'valid' subset | loss 3.706 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.701 epoch 033 | valid on 'valid' subset | loss 3.706 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.701 epoch 033 | valid on 'valid' subset | loss 3.706 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.701 epoch 033 | valid on 'valid' subset | loss 3.706 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.701 epoch 033 | valid on 'valid' subset | loss 3.706 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.701 epoch 033 | valid on 'valid' subset | loss 3.706 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.701 epoch 033 | valid on 'valid' subset | loss 3.706 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.701 epoch 033 | valid on 'valid' subset | loss 3.706 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.701 epoch 033 | valid on 'valid' subset | loss 3.706 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.701 epoch 033 | valid on 'valid' subset | loss 3.706 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.701 epoch 033 | valid on 'valid' subset | loss 3.706 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.701 epoch 033 | valid on 'valid' subset | loss 3.706 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.701 epoch 033 | valid on 'valid' subset | loss 3.706 | nll_loss 2.142 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 54000 | best_loss 3.701 epoch 033: 134 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=367230, ups=0.74, wpb=495051, bsz=16581.6, num_updates=54100, lr=0.000271914, gnorm=0.168, clip=0, loss_scale=1, train_wall=116, gb_free=20.9, wall=50223 epoch 033: 134 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=367230, ups=0.74, wpb=495051, bsz=16581.6, num_updates=54100, lr=0.000271914, gnorm=0.168, clip=0, loss_scale=1, train_wall=116, gb_free=20.9, wall=50223 epoch 033: 134 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=367230, ups=0.74, wpb=495051, bsz=16581.6, num_updates=54100, lr=0.000271914, gnorm=0.168, clip=0, loss_scale=1, train_wall=116, gb_free=20.9, wall=50223 epoch 033: 134 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=367230, ups=0.74, wpb=495051, bsz=16581.6, num_updates=54100, lr=0.000271914, gnorm=0.168, clip=0, loss_scale=1, train_wall=116, gb_free=20.9, wall=50223 epoch 033: 134 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=367230, ups=0.74, wpb=495051, bsz=16581.6, num_updates=54100, lr=0.000271914, gnorm=0.168, clip=0, loss_scale=1, train_wall=116, gb_free=20.9, wall=50223 epoch 033: 134 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=367230, ups=0.74, wpb=495051, bsz=16581.6, num_updates=54100, lr=0.000271914, gnorm=0.168, clip=0, loss_scale=1, train_wall=116, gb_free=20.9, wall=50223 epoch 033: 134 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=367230, ups=0.74, wpb=495051, bsz=16581.6, num_updates=54100, lr=0.000271914, gnorm=0.168, clip=0, loss_scale=1, train_wall=116, gb_free=20.9, wall=50223 epoch 033: 134 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=367230, ups=0.74, wpb=495051, bsz=16581.6, num_updates=54100, lr=0.000271914, gnorm=0.168, clip=0, loss_scale=1, train_wall=116, gb_free=20.9, wall=50223 epoch 033: 134 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=367230, ups=0.74, wpb=495051, bsz=16581.6, num_updates=54100, lr=0.000271914, gnorm=0.168, clip=0, loss_scale=1, train_wall=116, gb_free=20.9, wall=50223 epoch 033: 134 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=367230, ups=0.74, wpb=495051, bsz=16581.6, num_updates=54100, lr=0.000271914, gnorm=0.168, clip=0, loss_scale=1, train_wall=116, gb_free=20.9, wall=50223 epoch 033: 134 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=367230, ups=0.74, wpb=495051, bsz=16581.6, num_updates=54100, lr=0.000271914, gnorm=0.168, clip=0, loss_scale=1, train_wall=116, gb_free=20.9, wall=50223 epoch 033: 134 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=367230, ups=0.74, wpb=495051, bsz=16581.6, num_updates=54100, lr=0.000271914, gnorm=0.168, clip=0, loss_scale=1, train_wall=116, gb_free=20.9, wall=50223 epoch 033: 134 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=367230, ups=0.74, wpb=495051, bsz=16581.6, num_updates=54100, lr=0.000271914, gnorm=0.168, clip=0, loss_scale=1, train_wall=116, gb_free=20.9, wall=50223 epoch 033: 134 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=367230, ups=0.74, wpb=495051, bsz=16581.6, num_updates=54100, lr=0.000271914, gnorm=0.168, clip=0, loss_scale=1, train_wall=116, gb_free=20.9, wall=50223 epoch 033: 134 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=367230, ups=0.74, wpb=495051, bsz=16581.6, num_updates=54100, lr=0.000271914, gnorm=0.168, clip=0, loss_scale=1, train_wall=116, gb_free=20.9, wall=50223 epoch 033: 134 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=367230, ups=0.74, wpb=495051, bsz=16581.6, num_updates=54100, lr=0.000271914, gnorm=0.168, clip=0, loss_scale=1, train_wall=116, gb_free=20.9, wall=50223 epoch 033: 134 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=367230, ups=0.74, wpb=495051, bsz=16581.6, num_updates=54100, lr=0.000271914, gnorm=0.168, clip=0, loss_scale=1, train_wall=116, gb_free=20.9, wall=50223 epoch 033: 134 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=367230, ups=0.74, wpb=495051, bsz=16581.6, num_updates=54100, lr=0.000271914, gnorm=0.168, clip=0, loss_scale=1, train_wall=116, gb_free=20.9, wall=50223 epoch 033: 134 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=367230, ups=0.74, wpb=495051, bsz=16581.6, num_updates=54100, lr=0.000271914, gnorm=0.168, clip=0, loss_scale=1, train_wall=116, gb_free=20.9, wall=50223 epoch 033: 134 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=367230, ups=0.74, wpb=495051, bsz=16581.6, num_updates=54100, lr=0.000271914, gnorm=0.168, clip=0, loss_scale=1, train_wall=116, gb_free=20.9, wall=50223 epoch 033: 134 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=367230, ups=0.74, wpb=495051, bsz=16581.6, num_updates=54100, lr=0.000271914, gnorm=0.168, clip=0, loss_scale=1, train_wall=116, gb_free=20.9, wall=50223 epoch 033: 134 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=367230, ups=0.74, wpb=495051, bsz=16581.6, num_updates=54100, lr=0.000271914, gnorm=0.168, clip=0, loss_scale=1, train_wall=116, gb_free=20.9, wall=50223 epoch 033: 134 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=367230, ups=0.74, wpb=495051, bsz=16581.6, num_updates=54100, lr=0.000271914, gnorm=0.168, clip=0, loss_scale=1, train_wall=116, gb_free=20.9, wall=50223 epoch 033: 134 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=367230, ups=0.74, wpb=495051, bsz=16581.6, num_updates=54100, lr=0.000271914, gnorm=0.168, clip=0, loss_scale=1, train_wall=116, gb_free=20.9, wall=50223 epoch 033: 134 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=367230, ups=0.74, wpb=495051, bsz=16581.6, num_updates=54100, lr=0.000271914, gnorm=0.168, clip=0, loss_scale=1, train_wall=116, gb_free=20.9, wall=50223 epoch 033: 134 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=367230, ups=0.74, wpb=495051, bsz=16581.6, num_updates=54100, lr=0.000271914, gnorm=0.168, clip=0, loss_scale=1, train_wall=116, gb_free=20.9, wall=50223 epoch 033: 134 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=367230, ups=0.74, wpb=495051, bsz=16581.6, num_updates=54100, lr=0.000271914, gnorm=0.168, clip=0, loss_scale=1, train_wall=116, gb_free=20.9, wall=50223 epoch 033: 134 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=367230, ups=0.74, wpb=495051, bsz=16581.6, num_updates=54100, lr=0.000271914, gnorm=0.168, clip=0, loss_scale=1, train_wall=116, gb_free=20.9, wall=50223 epoch 033: 134 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=367230, ups=0.74, wpb=495051, bsz=16581.6, num_updates=54100, lr=0.000271914, gnorm=0.168, clip=0, loss_scale=1, train_wall=116, gb_free=20.9, wall=50223 epoch 033: 134 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=367230, ups=0.74, wpb=495051, bsz=16581.6, num_updates=54100, lr=0.000271914, gnorm=0.168, clip=0, loss_scale=1, train_wall=116, gb_free=20.9, wall=50223 epoch 033: 134 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=367230, ups=0.74, wpb=495051, bsz=16581.6, num_updates=54100, lr=0.000271914, gnorm=0.168, clip=0, loss_scale=1, train_wall=116, gb_free=20.9, wall=50223 epoch 033: 134 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=367230, ups=0.74, wpb=495051, bsz=16581.6, num_updates=54100, lr=0.000271914, gnorm=0.168, clip=0, loss_scale=1, train_wall=116, gb_free=20.9, wall=50223 epoch 033: 134 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=367230, ups=0.74, wpb=495051, bsz=16581.6, num_updates=54100, lr=0.000271914, gnorm=0.168, clip=0, loss_scale=1, train_wall=116, gb_free=20.9, wall=50223 epoch 033: 234 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=550710, ups=1.11, wpb=496821, bsz=16987.5, num_updates=54200, lr=0.000271663, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=50313 epoch 033: 234 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=550710, ups=1.11, wpb=496821, bsz=16987.5, num_updates=54200, lr=0.000271663, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=50313 epoch 033: 234 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=550710, ups=1.11, wpb=496821, bsz=16987.5, num_updates=54200, lr=0.000271663, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=50313 epoch 033: 234 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=550710, ups=1.11, wpb=496821, bsz=16987.5, num_updates=54200, lr=0.000271663, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=50313 epoch 033: 234 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=550710, ups=1.11, wpb=496821, bsz=16987.5, num_updates=54200, lr=0.000271663, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=50313 epoch 033: 234 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=550710, ups=1.11, wpb=496821, bsz=16987.5, num_updates=54200, lr=0.000271663, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=50313 epoch 033: 234 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=550710, ups=1.11, wpb=496821, bsz=16987.5, num_updates=54200, lr=0.000271663, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=50313 epoch 033: 234 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=550710, ups=1.11, wpb=496821, bsz=16987.5, num_updates=54200, lr=0.000271663, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=50313 epoch 033: 234 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=550710, ups=1.11, wpb=496821, bsz=16987.5, num_updates=54200, lr=0.000271663, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=50313 epoch 033: 234 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=550710, ups=1.11, wpb=496821, bsz=16987.5, num_updates=54200, lr=0.000271663, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=50313 epoch 033: 234 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=550710, ups=1.11, wpb=496821, bsz=16987.5, num_updates=54200, lr=0.000271663, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=50313 epoch 033: 234 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=550710, ups=1.11, wpb=496821, bsz=16987.5, num_updates=54200, lr=0.000271663, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=50313 epoch 033: 234 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=550710, ups=1.11, wpb=496821, bsz=16987.5, num_updates=54200, lr=0.000271663, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=50313 epoch 033: 234 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=550710, ups=1.11, wpb=496821, bsz=16987.5, num_updates=54200, lr=0.000271663, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=50313 epoch 033: 234 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=550710, ups=1.11, wpb=496821, bsz=16987.5, num_updates=54200, lr=0.000271663, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=50313 epoch 033: 234 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=550710, ups=1.11, wpb=496821, bsz=16987.5, num_updates=54200, lr=0.000271663, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=50313 epoch 033: 234 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=550710, ups=1.11, wpb=496821, bsz=16987.5, num_updates=54200, lr=0.000271663, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=50313 epoch 033: 234 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=550710, ups=1.11, wpb=496821, bsz=16987.5, num_updates=54200, lr=0.000271663, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=50313 epoch 033: 234 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=550710, ups=1.11, wpb=496821, bsz=16987.5, num_updates=54200, lr=0.000271663, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=50313 epoch 033: 234 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=550710, ups=1.11, wpb=496821, bsz=16987.5, num_updates=54200, lr=0.000271663, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=50313 epoch 033: 234 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=550710, ups=1.11, wpb=496821, bsz=16987.5, num_updates=54200, lr=0.000271663, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=50313 epoch 033: 234 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=550710, ups=1.11, wpb=496821, bsz=16987.5, num_updates=54200, lr=0.000271663, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=50313 epoch 033: 234 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=550710, ups=1.11, wpb=496821, bsz=16987.5, num_updates=54200, lr=0.000271663, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=50313 epoch 033: 234 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=550710, ups=1.11, wpb=496821, bsz=16987.5, num_updates=54200, lr=0.000271663, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=50313 epoch 033: 234 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=550710, ups=1.11, wpb=496821, bsz=16987.5, num_updates=54200, lr=0.000271663, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=50313 epoch 033: 234 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=550710, ups=1.11, wpb=496821, bsz=16987.5, num_updates=54200, lr=0.000271663, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=50313 epoch 033: 234 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=550710, ups=1.11, wpb=496821, bsz=16987.5, num_updates=54200, lr=0.000271663, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=50313 epoch 033: 234 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=550710, ups=1.11, wpb=496821, bsz=16987.5, num_updates=54200, lr=0.000271663, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=50313 epoch 033: 234 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=550710, ups=1.11, wpb=496821, bsz=16987.5, num_updates=54200, lr=0.000271663, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=50313 epoch 033: 234 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=550710, ups=1.11, wpb=496821, bsz=16987.5, num_updates=54200, lr=0.000271663, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=50313 epoch 033: 234 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=550710, ups=1.11, wpb=496821, bsz=16987.5, num_updates=54200, lr=0.000271663, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=50313 epoch 033: 234 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=550710, ups=1.11, wpb=496821, bsz=16987.5, num_updates=54200, lr=0.000271663, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=50313 epoch 033: 234 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=550710, ups=1.11, wpb=496821, bsz=16987.5, num_updates=54200, lr=0.000271663, gnorm=0.16, clip=0, loss_scale=1, train_wall=88, gb_free=20.9, wall=50313 epoch 033: 334 / 1689 loss=3.654, nll_loss=2.114, ppl=4.33, wps=546958, ups=1.11, wpb=494154, bsz=16198.3, num_updates=54300, lr=0.000271413, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=20.8, wall=50403 epoch 033: 334 / 1689 loss=3.654, nll_loss=2.114, ppl=4.33, wps=546958, ups=1.11, wpb=494154, bsz=16198.3, num_updates=54300, lr=0.000271413, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=20.8, wall=50403 epoch 033: 334 / 1689 loss=3.654, nll_loss=2.114, ppl=4.33, wps=546958, ups=1.11, wpb=494154, bsz=16198.3, num_updates=54300, lr=0.000271413, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=20.8, wall=50403 epoch 033: 334 / 1689 loss=3.654, nll_loss=2.114, ppl=4.33, wps=546958, ups=1.11, wpb=494154, bsz=16198.3, num_updates=54300, lr=0.000271413, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=20.8, wall=50403 epoch 033: 334 / 1689 loss=3.654, nll_loss=2.114, ppl=4.33, wps=546958, ups=1.11, wpb=494154, bsz=16198.3, num_updates=54300, lr=0.000271413, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=20.8, wall=50403 epoch 033: 334 / 1689 loss=3.654, nll_loss=2.114, ppl=4.33, wps=546958, ups=1.11, wpb=494154, bsz=16198.3, num_updates=54300, lr=0.000271413, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=20.8, wall=50403 epoch 033: 334 / 1689 loss=3.654, nll_loss=2.114, ppl=4.33, wps=546958, ups=1.11, wpb=494154, bsz=16198.3, num_updates=54300, lr=0.000271413, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=20.8, wall=50403 epoch 033: 334 / 1689 loss=3.654, nll_loss=2.114, ppl=4.33, wps=546958, ups=1.11, wpb=494154, bsz=16198.3, num_updates=54300, lr=0.000271413, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=20.8, wall=50403 epoch 033: 334 / 1689 loss=3.654, nll_loss=2.114, ppl=4.33, wps=546958, ups=1.11, wpb=494154, bsz=16198.3, num_updates=54300, lr=0.000271413, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=20.8, wall=50403 epoch 033: 334 / 1689 loss=3.654, nll_loss=2.114, ppl=4.33, wps=546958, ups=1.11, wpb=494154, bsz=16198.3, num_updates=54300, lr=0.000271413, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=20.8, wall=50403 epoch 033: 334 / 1689 loss=3.654, nll_loss=2.114, ppl=4.33, wps=546958, ups=1.11, wpb=494154, bsz=16198.3, num_updates=54300, lr=0.000271413, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=20.8, wall=50403 epoch 033: 334 / 1689 loss=3.654, nll_loss=2.114, ppl=4.33, wps=546958, ups=1.11, wpb=494154, bsz=16198.3, num_updates=54300, lr=0.000271413, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=20.8, wall=50403 epoch 033: 334 / 1689 loss=3.654, nll_loss=2.114, ppl=4.33, wps=546958, ups=1.11, wpb=494154, bsz=16198.3, num_updates=54300, lr=0.000271413, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=20.8, wall=50403 epoch 033: 334 / 1689 loss=3.654, nll_loss=2.114, ppl=4.33, wps=546958, ups=1.11, wpb=494154, bsz=16198.3, num_updates=54300, lr=0.000271413, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=20.8, wall=50403 epoch 033: 334 / 1689 loss=3.654, nll_loss=2.114, ppl=4.33, wps=546958, ups=1.11, wpb=494154, bsz=16198.3, num_updates=54300, lr=0.000271413, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=20.8, wall=50403 epoch 033: 334 / 1689 loss=3.654, nll_loss=2.114, ppl=4.33, wps=546958, ups=1.11, wpb=494154, bsz=16198.3, num_updates=54300, lr=0.000271413, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=20.8, wall=50403 epoch 033: 334 / 1689 loss=3.654, nll_loss=2.114, ppl=4.33, wps=546958, ups=1.11, wpb=494154, bsz=16198.3, num_updates=54300, lr=0.000271413, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=20.8, wall=50403 epoch 033: 334 / 1689 loss=3.654, nll_loss=2.114, ppl=4.33, wps=546958, ups=1.11, wpb=494154, bsz=16198.3, num_updates=54300, lr=0.000271413, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=20.8, wall=50403 epoch 033: 334 / 1689 loss=3.654, nll_loss=2.114, ppl=4.33, wps=546958, ups=1.11, wpb=494154, bsz=16198.3, num_updates=54300, lr=0.000271413, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=20.8, wall=50403 epoch 033: 334 / 1689 loss=3.654, nll_loss=2.114, ppl=4.33, wps=546958, ups=1.11, wpb=494154, bsz=16198.3, num_updates=54300, lr=0.000271413, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=20.8, wall=50403 epoch 033: 334 / 1689 loss=3.654, nll_loss=2.114, ppl=4.33, wps=546958, ups=1.11, wpb=494154, bsz=16198.3, num_updates=54300, lr=0.000271413, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=20.8, wall=50403 epoch 033: 334 / 1689 loss=3.654, nll_loss=2.114, ppl=4.33, wps=546958, ups=1.11, wpb=494154, bsz=16198.3, num_updates=54300, lr=0.000271413, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=20.8, wall=50403 epoch 033: 334 / 1689 loss=3.654, nll_loss=2.114, ppl=4.33, wps=546958, ups=1.11, wpb=494154, bsz=16198.3, num_updates=54300, lr=0.000271413, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=20.8, wall=50403 epoch 033: 334 / 1689 loss=3.654, nll_loss=2.114, ppl=4.33, wps=546958, ups=1.11, wpb=494154, bsz=16198.3, num_updates=54300, lr=0.000271413, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=20.8, wall=50403 epoch 033: 334 / 1689 loss=3.654, nll_loss=2.114, ppl=4.33, wps=546958, ups=1.11, wpb=494154, bsz=16198.3, num_updates=54300, lr=0.000271413, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=20.8, wall=50403 epoch 033: 334 / 1689 loss=3.654, nll_loss=2.114, ppl=4.33, wps=546958, ups=1.11, wpb=494154, bsz=16198.3, num_updates=54300, lr=0.000271413, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=20.8, wall=50403 epoch 033: 334 / 1689 loss=3.654, nll_loss=2.114, ppl=4.33, wps=546958, ups=1.11, wpb=494154, bsz=16198.3, num_updates=54300, lr=0.000271413, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=20.8, wall=50403 epoch 033: 334 / 1689 loss=3.654, nll_loss=2.114, ppl=4.33, wps=546958, ups=1.11, wpb=494154, bsz=16198.3, num_updates=54300, lr=0.000271413, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=20.8, wall=50403 epoch 033: 334 / 1689 loss=3.654, nll_loss=2.114, ppl=4.33, wps=546958, ups=1.11, wpb=494154, bsz=16198.3, num_updates=54300, lr=0.000271413, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=20.8, wall=50403 epoch 033: 334 / 1689 loss=3.654, nll_loss=2.114, ppl=4.33, wps=546958, ups=1.11, wpb=494154, bsz=16198.3, num_updates=54300, lr=0.000271413, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=20.8, wall=50403 epoch 033: 334 / 1689 loss=3.654, nll_loss=2.114, ppl=4.33, wps=546958, ups=1.11, wpb=494154, bsz=16198.3, num_updates=54300, lr=0.000271413, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=20.8, wall=50403 epoch 033: 334 / 1689 loss=3.654, nll_loss=2.114, ppl=4.33, wps=546958, ups=1.11, wpb=494154, bsz=16198.3, num_updates=54300, lr=0.000271413, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=20.8, wall=50403 epoch 033: 334 / 1689 loss=3.654, nll_loss=2.114, ppl=4.33, wps=546958, ups=1.11, wpb=494154, bsz=16198.3, num_updates=54300, lr=0.000271413, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=20.8, wall=50403 epoch 033: 434 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=553964, ups=1.12, wpb=494630, bsz=16446.1, num_updates=54400, lr=0.000271163, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=50493 epoch 033: 434 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=553964, ups=1.12, wpb=494630, bsz=16446.1, num_updates=54400, lr=0.000271163, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=50493 epoch 033: 434 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=553964, ups=1.12, wpb=494630, bsz=16446.1, num_updates=54400, lr=0.000271163, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=50493 epoch 033: 434 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=553964, ups=1.12, wpb=494630, bsz=16446.1, num_updates=54400, lr=0.000271163, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=50493 epoch 033: 434 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=553964, ups=1.12, wpb=494630, bsz=16446.1, num_updates=54400, lr=0.000271163, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=50493 epoch 033: 434 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=553964, ups=1.12, wpb=494630, bsz=16446.1, num_updates=54400, lr=0.000271163, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=50493 epoch 033: 434 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=553964, ups=1.12, wpb=494630, bsz=16446.1, num_updates=54400, lr=0.000271163, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=50493 epoch 033: 434 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=553964, ups=1.12, wpb=494630, bsz=16446.1, num_updates=54400, lr=0.000271163, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=50493 epoch 033: 434 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=553964, ups=1.12, wpb=494630, bsz=16446.1, num_updates=54400, lr=0.000271163, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=50493 epoch 033: 434 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=553964, ups=1.12, wpb=494630, bsz=16446.1, num_updates=54400, lr=0.000271163, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=50493 epoch 033: 434 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=553964, ups=1.12, wpb=494630, bsz=16446.1, num_updates=54400, lr=0.000271163, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=50493 epoch 033: 434 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=553964, ups=1.12, wpb=494630, bsz=16446.1, num_updates=54400, lr=0.000271163, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=50493 epoch 033: 434 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=553964, ups=1.12, wpb=494630, bsz=16446.1, num_updates=54400, lr=0.000271163, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=50493 epoch 033: 434 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=553964, ups=1.12, wpb=494630, bsz=16446.1, num_updates=54400, lr=0.000271163, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=50493 epoch 033: 434 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=553964, ups=1.12, wpb=494630, bsz=16446.1, num_updates=54400, lr=0.000271163, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=50493 epoch 033: 434 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=553964, ups=1.12, wpb=494630, bsz=16446.1, num_updates=54400, lr=0.000271163, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=50493 epoch 033: 434 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=553964, ups=1.12, wpb=494630, bsz=16446.1, num_updates=54400, lr=0.000271163, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=50493 epoch 033: 434 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=553964, ups=1.12, wpb=494630, bsz=16446.1, num_updates=54400, lr=0.000271163, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=50493 epoch 033: 434 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=553964, ups=1.12, wpb=494630, bsz=16446.1, num_updates=54400, lr=0.000271163, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=50493 epoch 033: 434 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=553964, ups=1.12, wpb=494630, bsz=16446.1, num_updates=54400, lr=0.000271163, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=50493 epoch 033: 434 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=553964, ups=1.12, wpb=494630, bsz=16446.1, num_updates=54400, lr=0.000271163, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=50493 epoch 033: 434 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=553964, ups=1.12, wpb=494630, bsz=16446.1, num_updates=54400, lr=0.000271163, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=50493 epoch 033: 434 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=553964, ups=1.12, wpb=494630, bsz=16446.1, num_updates=54400, lr=0.000271163, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=50493 epoch 033: 434 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=553964, ups=1.12, wpb=494630, bsz=16446.1, num_updates=54400, lr=0.000271163, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=50493 epoch 033: 434 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=553964, ups=1.12, wpb=494630, bsz=16446.1, num_updates=54400, lr=0.000271163, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=50493 epoch 033: 434 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=553964, ups=1.12, wpb=494630, bsz=16446.1, num_updates=54400, lr=0.000271163, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=50493 epoch 033: 434 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=553964, ups=1.12, wpb=494630, bsz=16446.1, num_updates=54400, lr=0.000271163, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=50493 epoch 033: 434 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=553964, ups=1.12, wpb=494630, bsz=16446.1, num_updates=54400, lr=0.000271163, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=50493 epoch 033: 434 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=553964, ups=1.12, wpb=494630, bsz=16446.1, num_updates=54400, lr=0.000271163, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=50493 epoch 033: 434 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=553964, ups=1.12, wpb=494630, bsz=16446.1, num_updates=54400, lr=0.000271163, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=50493 epoch 033: 434 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=553964, ups=1.12, wpb=494630, bsz=16446.1, num_updates=54400, lr=0.000271163, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=50493 epoch 033: 434 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=553964, ups=1.12, wpb=494630, bsz=16446.1, num_updates=54400, lr=0.000271163, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=50493 epoch 033: 434 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=553964, ups=1.12, wpb=494630, bsz=16446.1, num_updates=54400, lr=0.000271163, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=50493 epoch 033: 534 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=554624, ups=1.12, wpb=496262, bsz=16685.6, num_updates=54500, lr=0.000270914, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=50582 epoch 033: 534 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=554624, ups=1.12, wpb=496262, bsz=16685.6, num_updates=54500, lr=0.000270914, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=50582 epoch 033: 534 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=554624, ups=1.12, wpb=496262, bsz=16685.6, num_updates=54500, lr=0.000270914, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=50582 epoch 033: 534 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=554624, ups=1.12, wpb=496262, bsz=16685.6, num_updates=54500, lr=0.000270914, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=50582 epoch 033: 534 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=554624, ups=1.12, wpb=496262, bsz=16685.6, num_updates=54500, lr=0.000270914, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=50582 epoch 033: 534 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=554624, ups=1.12, wpb=496262, bsz=16685.6, num_updates=54500, lr=0.000270914, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=50582 epoch 033: 534 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=554624, ups=1.12, wpb=496262, bsz=16685.6, num_updates=54500, lr=0.000270914, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=50582 epoch 033: 534 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=554624, ups=1.12, wpb=496262, bsz=16685.6, num_updates=54500, lr=0.000270914, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=50582 epoch 033: 534 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=554624, ups=1.12, wpb=496262, bsz=16685.6, num_updates=54500, lr=0.000270914, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=50582 epoch 033: 534 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=554624, ups=1.12, wpb=496262, bsz=16685.6, num_updates=54500, lr=0.000270914, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=50582 epoch 033: 534 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=554624, ups=1.12, wpb=496262, bsz=16685.6, num_updates=54500, lr=0.000270914, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=50582 epoch 033: 534 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=554624, ups=1.12, wpb=496262, bsz=16685.6, num_updates=54500, lr=0.000270914, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=50582 epoch 033: 534 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=554624, ups=1.12, wpb=496262, bsz=16685.6, num_updates=54500, lr=0.000270914, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=50582 epoch 033: 534 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=554624, ups=1.12, wpb=496262, bsz=16685.6, num_updates=54500, lr=0.000270914, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=50582 epoch 033: 534 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=554624, ups=1.12, wpb=496262, bsz=16685.6, num_updates=54500, lr=0.000270914, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=50582 epoch 033: 534 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=554624, ups=1.12, wpb=496262, bsz=16685.6, num_updates=54500, lr=0.000270914, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=50582 epoch 033: 534 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=554624, ups=1.12, wpb=496262, bsz=16685.6, num_updates=54500, lr=0.000270914, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=50582 epoch 033: 534 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=554624, ups=1.12, wpb=496262, bsz=16685.6, num_updates=54500, lr=0.000270914, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=50582 epoch 033: 534 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=554624, ups=1.12, wpb=496262, bsz=16685.6, num_updates=54500, lr=0.000270914, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=50582 epoch 033: 534 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=554624, ups=1.12, wpb=496262, bsz=16685.6, num_updates=54500, lr=0.000270914, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=50582 epoch 033: 534 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=554624, ups=1.12, wpb=496262, bsz=16685.6, num_updates=54500, lr=0.000270914, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=50582 epoch 033: 534 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=554624, ups=1.12, wpb=496262, bsz=16685.6, num_updates=54500, lr=0.000270914, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=50582 epoch 033: 534 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=554624, ups=1.12, wpb=496262, bsz=16685.6, num_updates=54500, lr=0.000270914, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=50582 epoch 033: 534 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=554624, ups=1.12, wpb=496262, bsz=16685.6, num_updates=54500, lr=0.000270914, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=50582 epoch 033: 534 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=554624, ups=1.12, wpb=496262, bsz=16685.6, num_updates=54500, lr=0.000270914, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=50582 epoch 033: 534 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=554624, ups=1.12, wpb=496262, bsz=16685.6, num_updates=54500, lr=0.000270914, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=50582 epoch 033: 534 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=554624, ups=1.12, wpb=496262, bsz=16685.6, num_updates=54500, lr=0.000270914, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=50582 epoch 033: 534 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=554624, ups=1.12, wpb=496262, bsz=16685.6, num_updates=54500, lr=0.000270914, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=50582 epoch 033: 534 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=554624, ups=1.12, wpb=496262, bsz=16685.6, num_updates=54500, lr=0.000270914, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=50582 epoch 033: 534 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=554624, ups=1.12, wpb=496262, bsz=16685.6, num_updates=54500, lr=0.000270914, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=50582 epoch 033: 534 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=554624, ups=1.12, wpb=496262, bsz=16685.6, num_updates=54500, lr=0.000270914, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=50582 epoch 033: 534 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=554624, ups=1.12, wpb=496262, bsz=16685.6, num_updates=54500, lr=0.000270914, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=50582 epoch 033: 534 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=554624, ups=1.12, wpb=496262, bsz=16685.6, num_updates=54500, lr=0.000270914, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=50582 epoch 033: 634 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=551387, ups=1.11, wpb=495352, bsz=16771.7, num_updates=54600, lr=0.000270666, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=50672 epoch 033: 634 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=551387, ups=1.11, wpb=495352, bsz=16771.7, num_updates=54600, lr=0.000270666, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=50672 epoch 033: 634 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=551387, ups=1.11, wpb=495352, bsz=16771.7, num_updates=54600, lr=0.000270666, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=50672 epoch 033: 634 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=551387, ups=1.11, wpb=495352, bsz=16771.7, num_updates=54600, lr=0.000270666, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=50672 epoch 033: 634 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=551387, ups=1.11, wpb=495352, bsz=16771.7, num_updates=54600, lr=0.000270666, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=50672 epoch 033: 634 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=551387, ups=1.11, wpb=495352, bsz=16771.7, num_updates=54600, lr=0.000270666, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=50672 epoch 033: 634 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=551387, ups=1.11, wpb=495352, bsz=16771.7, num_updates=54600, lr=0.000270666, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=50672 epoch 033: 634 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=551387, ups=1.11, wpb=495352, bsz=16771.7, num_updates=54600, lr=0.000270666, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=50672 epoch 033: 634 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=551387, ups=1.11, wpb=495352, bsz=16771.7, num_updates=54600, lr=0.000270666, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=50672 epoch 033: 634 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=551387, ups=1.11, wpb=495352, bsz=16771.7, num_updates=54600, lr=0.000270666, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=50672 epoch 033: 634 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=551387, ups=1.11, wpb=495352, bsz=16771.7, num_updates=54600, lr=0.000270666, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=50672 epoch 033: 634 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=551387, ups=1.11, wpb=495352, bsz=16771.7, num_updates=54600, lr=0.000270666, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=50672 epoch 033: 634 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=551387, ups=1.11, wpb=495352, bsz=16771.7, num_updates=54600, lr=0.000270666, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=50672 epoch 033: 634 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=551387, ups=1.11, wpb=495352, bsz=16771.7, num_updates=54600, lr=0.000270666, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=50672 epoch 033: 634 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=551387, ups=1.11, wpb=495352, bsz=16771.7, num_updates=54600, lr=0.000270666, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=50672 epoch 033: 634 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=551387, ups=1.11, wpb=495352, bsz=16771.7, num_updates=54600, lr=0.000270666, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=50672 epoch 033: 634 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=551387, ups=1.11, wpb=495352, bsz=16771.7, num_updates=54600, lr=0.000270666, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=50672 epoch 033: 634 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=551387, ups=1.11, wpb=495352, bsz=16771.7, num_updates=54600, lr=0.000270666, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=50672 epoch 033: 634 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=551387, ups=1.11, wpb=495352, bsz=16771.7, num_updates=54600, lr=0.000270666, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=50672 epoch 033: 634 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=551387, ups=1.11, wpb=495352, bsz=16771.7, num_updates=54600, lr=0.000270666, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=50672 epoch 033: 634 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=551387, ups=1.11, wpb=495352, bsz=16771.7, num_updates=54600, lr=0.000270666, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=50672 epoch 033: 634 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=551387, ups=1.11, wpb=495352, bsz=16771.7, num_updates=54600, lr=0.000270666, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=50672 epoch 033: 634 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=551387, ups=1.11, wpb=495352, bsz=16771.7, num_updates=54600, lr=0.000270666, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=50672 epoch 033: 634 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=551387, ups=1.11, wpb=495352, bsz=16771.7, num_updates=54600, lr=0.000270666, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=50672 epoch 033: 634 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=551387, ups=1.11, wpb=495352, bsz=16771.7, num_updates=54600, lr=0.000270666, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=50672 epoch 033: 634 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=551387, ups=1.11, wpb=495352, bsz=16771.7, num_updates=54600, lr=0.000270666, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=50672 epoch 033: 634 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=551387, ups=1.11, wpb=495352, bsz=16771.7, num_updates=54600, lr=0.000270666, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=50672 epoch 033: 634 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=551387, ups=1.11, wpb=495352, bsz=16771.7, num_updates=54600, lr=0.000270666, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=50672 epoch 033: 634 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=551387, ups=1.11, wpb=495352, bsz=16771.7, num_updates=54600, lr=0.000270666, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=50672 epoch 033: 634 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=551387, ups=1.11, wpb=495352, bsz=16771.7, num_updates=54600, lr=0.000270666, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=50672 epoch 033: 634 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=551387, ups=1.11, wpb=495352, bsz=16771.7, num_updates=54600, lr=0.000270666, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=50672 epoch 033: 634 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=551387, ups=1.11, wpb=495352, bsz=16771.7, num_updates=54600, lr=0.000270666, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=50672 epoch 033: 634 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=551387, ups=1.11, wpb=495352, bsz=16771.7, num_updates=54600, lr=0.000270666, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.4, wall=50672 epoch 033: 734 / 1689 loss=3.664, nll_loss=2.126, ppl=4.37, wps=552013, ups=1.12, wpb=494197, bsz=16235, num_updates=54700, lr=0.000270418, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=50762 epoch 033: 734 / 1689 loss=3.664, nll_loss=2.126, ppl=4.37, wps=552013, ups=1.12, wpb=494197, bsz=16235, num_updates=54700, lr=0.000270418, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=50762 epoch 033: 734 / 1689 loss=3.664, nll_loss=2.126, ppl=4.37, wps=552013, ups=1.12, wpb=494197, bsz=16235, num_updates=54700, lr=0.000270418, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=50762 epoch 033: 734 / 1689 loss=3.664, nll_loss=2.126, ppl=4.37, wps=552013, ups=1.12, wpb=494197, bsz=16235, num_updates=54700, lr=0.000270418, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=50762 epoch 033: 734 / 1689 loss=3.664, nll_loss=2.126, ppl=4.37, wps=552013, ups=1.12, wpb=494197, bsz=16235, num_updates=54700, lr=0.000270418, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=50762 epoch 033: 734 / 1689 loss=3.664, nll_loss=2.126, ppl=4.37, wps=552013, ups=1.12, wpb=494197, bsz=16235, num_updates=54700, lr=0.000270418, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=50762 epoch 033: 734 / 1689 loss=3.664, nll_loss=2.126, ppl=4.37, wps=552013, ups=1.12, wpb=494197, bsz=16235, num_updates=54700, lr=0.000270418, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=50762 epoch 033: 734 / 1689 loss=3.664, nll_loss=2.126, ppl=4.37, wps=552013, ups=1.12, wpb=494197, bsz=16235, num_updates=54700, lr=0.000270418, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=50762 epoch 033: 734 / 1689 loss=3.664, nll_loss=2.126, ppl=4.37, wps=552013, ups=1.12, wpb=494197, bsz=16235, num_updates=54700, lr=0.000270418, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=50762 epoch 033: 734 / 1689 loss=3.664, nll_loss=2.126, ppl=4.37, wps=552013, ups=1.12, wpb=494197, bsz=16235, num_updates=54700, lr=0.000270418, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=50762 epoch 033: 734 / 1689 loss=3.664, nll_loss=2.126, ppl=4.37, wps=552013, ups=1.12, wpb=494197, bsz=16235, num_updates=54700, lr=0.000270418, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=50762 epoch 033: 734 / 1689 loss=3.664, nll_loss=2.126, ppl=4.37, wps=552013, ups=1.12, wpb=494197, bsz=16235, num_updates=54700, lr=0.000270418, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=50762 epoch 033: 734 / 1689 loss=3.664, nll_loss=2.126, ppl=4.37, wps=552013, ups=1.12, wpb=494197, bsz=16235, num_updates=54700, lr=0.000270418, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=50762 epoch 033: 734 / 1689 loss=3.664, nll_loss=2.126, ppl=4.37, wps=552013, ups=1.12, wpb=494197, bsz=16235, num_updates=54700, lr=0.000270418, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=50762 epoch 033: 734 / 1689 loss=3.664, nll_loss=2.126, ppl=4.37, wps=552013, ups=1.12, wpb=494197, bsz=16235, num_updates=54700, lr=0.000270418, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=50762 epoch 033: 734 / 1689 loss=3.664, nll_loss=2.126, ppl=4.37, wps=552013, ups=1.12, wpb=494197, bsz=16235, num_updates=54700, lr=0.000270418, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=50762 epoch 033: 734 / 1689 loss=3.664, nll_loss=2.126, ppl=4.37, wps=552013, ups=1.12, wpb=494197, bsz=16235, num_updates=54700, lr=0.000270418, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=50762 epoch 033: 734 / 1689 loss=3.664, nll_loss=2.126, ppl=4.37, wps=552013, ups=1.12, wpb=494197, bsz=16235, num_updates=54700, lr=0.000270418, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=50762 epoch 033: 734 / 1689 loss=3.664, nll_loss=2.126, ppl=4.37, wps=552013, ups=1.12, wpb=494197, bsz=16235, num_updates=54700, lr=0.000270418, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=50762 epoch 033: 734 / 1689 loss=3.664, nll_loss=2.126, ppl=4.37, wps=552013, ups=1.12, wpb=494197, bsz=16235, num_updates=54700, lr=0.000270418, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=50762 epoch 033: 734 / 1689 loss=3.664, nll_loss=2.126, ppl=4.37, wps=552013, ups=1.12, wpb=494197, bsz=16235, num_updates=54700, lr=0.000270418, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=50762 epoch 033: 734 / 1689 loss=3.664, nll_loss=2.126, ppl=4.37, wps=552013, ups=1.12, wpb=494197, bsz=16235, num_updates=54700, lr=0.000270418, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=50762 epoch 033: 734 / 1689 loss=3.664, nll_loss=2.126, ppl=4.37, wps=552013, ups=1.12, wpb=494197, bsz=16235, num_updates=54700, lr=0.000270418, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=50762 epoch 033: 734 / 1689 loss=3.664, nll_loss=2.126, ppl=4.37, wps=552013, ups=1.12, wpb=494197, bsz=16235, num_updates=54700, lr=0.000270418, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=50762 epoch 033: 734 / 1689 loss=3.664, nll_loss=2.126, ppl=4.37, wps=552013, ups=1.12, wpb=494197, bsz=16235, num_updates=54700, lr=0.000270418, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=50762 epoch 033: 734 / 1689 loss=3.664, nll_loss=2.126, ppl=4.37, wps=552013, ups=1.12, wpb=494197, bsz=16235, num_updates=54700, lr=0.000270418, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=50762 epoch 033: 734 / 1689 loss=3.664, nll_loss=2.126, ppl=4.37, wps=552013, ups=1.12, wpb=494197, bsz=16235, num_updates=54700, lr=0.000270418, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=50762 epoch 033: 734 / 1689 loss=3.664, nll_loss=2.126, ppl=4.37, wps=552013, ups=1.12, wpb=494197, bsz=16235, num_updates=54700, lr=0.000270418, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=50762 epoch 033: 734 / 1689 loss=3.664, nll_loss=2.126, ppl=4.37, wps=552013, ups=1.12, wpb=494197, bsz=16235, num_updates=54700, lr=0.000270418, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=50762 epoch 033: 734 / 1689 loss=3.664, nll_loss=2.126, ppl=4.37, wps=552013, ups=1.12, wpb=494197, bsz=16235, num_updates=54700, lr=0.000270418, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=50762 epoch 033: 734 / 1689 loss=3.664, nll_loss=2.126, ppl=4.37, wps=552013, ups=1.12, wpb=494197, bsz=16235, num_updates=54700, lr=0.000270418, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=50762 epoch 033: 734 / 1689 loss=3.664, nll_loss=2.126, ppl=4.37, wps=552013, ups=1.12, wpb=494197, bsz=16235, num_updates=54700, lr=0.000270418, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=50762 epoch 033: 734 / 1689 loss=3.664, nll_loss=2.126, ppl=4.37, wps=552013, ups=1.12, wpb=494197, bsz=16235, num_updates=54700, lr=0.000270418, gnorm=0.168, clip=0, loss_scale=2, train_wall=88, gb_free=22, wall=50762 epoch 033: 834 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=547554, ups=1.11, wpb=494047, bsz=16840.2, num_updates=54800, lr=0.000270172, gnorm=0.167, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=50852 epoch 033: 834 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=547554, ups=1.11, wpb=494047, bsz=16840.2, num_updates=54800, lr=0.000270172, gnorm=0.167, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=50852 epoch 033: 834 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=547554, ups=1.11, wpb=494047, bsz=16840.2, num_updates=54800, lr=0.000270172, gnorm=0.167, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=50852 epoch 033: 834 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=547554, ups=1.11, wpb=494047, bsz=16840.2, num_updates=54800, lr=0.000270172, gnorm=0.167, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=50852 epoch 033: 834 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=547554, ups=1.11, wpb=494047, bsz=16840.2, num_updates=54800, lr=0.000270172, gnorm=0.167, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=50852 epoch 033: 834 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=547554, ups=1.11, wpb=494047, bsz=16840.2, num_updates=54800, lr=0.000270172, gnorm=0.167, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=50852 epoch 033: 834 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=547554, ups=1.11, wpb=494047, bsz=16840.2, num_updates=54800, lr=0.000270172, gnorm=0.167, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=50852 epoch 033: 834 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=547554, ups=1.11, wpb=494047, bsz=16840.2, num_updates=54800, lr=0.000270172, gnorm=0.167, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=50852 epoch 033: 834 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=547554, ups=1.11, wpb=494047, bsz=16840.2, num_updates=54800, lr=0.000270172, gnorm=0.167, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=50852 epoch 033: 834 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=547554, ups=1.11, wpb=494047, bsz=16840.2, num_updates=54800, lr=0.000270172, gnorm=0.167, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=50852 epoch 033: 834 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=547554, ups=1.11, wpb=494047, bsz=16840.2, num_updates=54800, lr=0.000270172, gnorm=0.167, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=50852 epoch 033: 834 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=547554, ups=1.11, wpb=494047, bsz=16840.2, num_updates=54800, lr=0.000270172, gnorm=0.167, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=50852 epoch 033: 834 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=547554, ups=1.11, wpb=494047, bsz=16840.2, num_updates=54800, lr=0.000270172, gnorm=0.167, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=50852 epoch 033: 834 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=547554, ups=1.11, wpb=494047, bsz=16840.2, num_updates=54800, lr=0.000270172, gnorm=0.167, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=50852 epoch 033: 834 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=547554, ups=1.11, wpb=494047, bsz=16840.2, num_updates=54800, lr=0.000270172, gnorm=0.167, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=50852 epoch 033: 834 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=547554, ups=1.11, wpb=494047, bsz=16840.2, num_updates=54800, lr=0.000270172, gnorm=0.167, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=50852 epoch 033: 834 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=547554, ups=1.11, wpb=494047, bsz=16840.2, num_updates=54800, lr=0.000270172, gnorm=0.167, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=50852 epoch 033: 834 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=547554, ups=1.11, wpb=494047, bsz=16840.2, num_updates=54800, lr=0.000270172, gnorm=0.167, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=50852 epoch 033: 834 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=547554, ups=1.11, wpb=494047, bsz=16840.2, num_updates=54800, lr=0.000270172, gnorm=0.167, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=50852 epoch 033: 834 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=547554, ups=1.11, wpb=494047, bsz=16840.2, num_updates=54800, lr=0.000270172, gnorm=0.167, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=50852 epoch 033: 834 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=547554, ups=1.11, wpb=494047, bsz=16840.2, num_updates=54800, lr=0.000270172, gnorm=0.167, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=50852 epoch 033: 834 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=547554, ups=1.11, wpb=494047, bsz=16840.2, num_updates=54800, lr=0.000270172, gnorm=0.167, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=50852 epoch 033: 834 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=547554, ups=1.11, wpb=494047, bsz=16840.2, num_updates=54800, lr=0.000270172, gnorm=0.167, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=50852 epoch 033: 834 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=547554, ups=1.11, wpb=494047, bsz=16840.2, num_updates=54800, lr=0.000270172, gnorm=0.167, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=50852 epoch 033: 834 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=547554, ups=1.11, wpb=494047, bsz=16840.2, num_updates=54800, lr=0.000270172, gnorm=0.167, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=50852 epoch 033: 834 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=547554, ups=1.11, wpb=494047, bsz=16840.2, num_updates=54800, lr=0.000270172, gnorm=0.167, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=50852 epoch 033: 834 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=547554, ups=1.11, wpb=494047, bsz=16840.2, num_updates=54800, lr=0.000270172, gnorm=0.167, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=50852 epoch 033: 834 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=547554, ups=1.11, wpb=494047, bsz=16840.2, num_updates=54800, lr=0.000270172, gnorm=0.167, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=50852 epoch 033: 834 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=547554, ups=1.11, wpb=494047, bsz=16840.2, num_updates=54800, lr=0.000270172, gnorm=0.167, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=50852 epoch 033: 834 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=547554, ups=1.11, wpb=494047, bsz=16840.2, num_updates=54800, lr=0.000270172, gnorm=0.167, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=50852 epoch 033: 834 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=547554, ups=1.11, wpb=494047, bsz=16840.2, num_updates=54800, lr=0.000270172, gnorm=0.167, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=50852 epoch 033: 834 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=547554, ups=1.11, wpb=494047, bsz=16840.2, num_updates=54800, lr=0.000270172, gnorm=0.167, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=50852 epoch 033: 834 / 1689 loss=3.668, nll_loss=2.131, ppl=4.38, wps=547554, ups=1.11, wpb=494047, bsz=16840.2, num_updates=54800, lr=0.000270172, gnorm=0.167, clip=0, loss_scale=4, train_wall=88, gb_free=21.2, wall=50852 epoch 033: 935 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=547120, ups=1.1, wpb=496507, bsz=15978.1, num_updates=54900, lr=0.000269925, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=50943 epoch 033: 935 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=547120, ups=1.1, wpb=496507, bsz=15978.1, num_updates=54900, lr=0.000269925, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=50943 epoch 033: 935 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=547120, ups=1.1, wpb=496507, bsz=15978.1, num_updates=54900, lr=0.000269925, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=50943 epoch 033: 935 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=547120, ups=1.1, wpb=496507, bsz=15978.1, num_updates=54900, lr=0.000269925, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=50943 epoch 033: 935 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=547120, ups=1.1, wpb=496507, bsz=15978.1, num_updates=54900, lr=0.000269925, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=50943 epoch 033: 935 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=547120, ups=1.1, wpb=496507, bsz=15978.1, num_updates=54900, lr=0.000269925, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=50943 epoch 033: 935 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=547120, ups=1.1, wpb=496507, bsz=15978.1, num_updates=54900, lr=0.000269925, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=50943 epoch 033: 935 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=547120, ups=1.1, wpb=496507, bsz=15978.1, num_updates=54900, lr=0.000269925, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=50943 epoch 033: 935 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=547120, ups=1.1, wpb=496507, bsz=15978.1, num_updates=54900, lr=0.000269925, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=50943 epoch 033: 935 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=547120, ups=1.1, wpb=496507, bsz=15978.1, num_updates=54900, lr=0.000269925, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=50943 epoch 033: 935 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=547120, ups=1.1, wpb=496507, bsz=15978.1, num_updates=54900, lr=0.000269925, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=50943 epoch 033: 935 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=547120, ups=1.1, wpb=496507, bsz=15978.1, num_updates=54900, lr=0.000269925, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=50943 epoch 033: 935 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=547120, ups=1.1, wpb=496507, bsz=15978.1, num_updates=54900, lr=0.000269925, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=50943 epoch 033: 935 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=547120, ups=1.1, wpb=496507, bsz=15978.1, num_updates=54900, lr=0.000269925, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=50943 epoch 033: 935 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=547120, ups=1.1, wpb=496507, bsz=15978.1, num_updates=54900, lr=0.000269925, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=50943 epoch 033: 935 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=547120, ups=1.1, wpb=496507, bsz=15978.1, num_updates=54900, lr=0.000269925, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=50943 epoch 033: 935 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=547120, ups=1.1, wpb=496507, bsz=15978.1, num_updates=54900, lr=0.000269925, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=50943 epoch 033: 935 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=547120, ups=1.1, wpb=496507, bsz=15978.1, num_updates=54900, lr=0.000269925, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=50943 epoch 033: 935 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=547120, ups=1.1, wpb=496507, bsz=15978.1, num_updates=54900, lr=0.000269925, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=50943 epoch 033: 935 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=547120, ups=1.1, wpb=496507, bsz=15978.1, num_updates=54900, lr=0.000269925, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=50943 epoch 033: 935 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=547120, ups=1.1, wpb=496507, bsz=15978.1, num_updates=54900, lr=0.000269925, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=50943 epoch 033: 935 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=547120, ups=1.1, wpb=496507, bsz=15978.1, num_updates=54900, lr=0.000269925, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=50943 epoch 033: 935 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=547120, ups=1.1, wpb=496507, bsz=15978.1, num_updates=54900, lr=0.000269925, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=50943 epoch 033: 935 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=547120, ups=1.1, wpb=496507, bsz=15978.1, num_updates=54900, lr=0.000269925, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=50943 epoch 033: 935 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=547120, ups=1.1, wpb=496507, bsz=15978.1, num_updates=54900, lr=0.000269925, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=50943 epoch 033: 935 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=547120, ups=1.1, wpb=496507, bsz=15978.1, num_updates=54900, lr=0.000269925, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=50943 epoch 033: 935 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=547120, ups=1.1, wpb=496507, bsz=15978.1, num_updates=54900, lr=0.000269925, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=50943 epoch 033: 935 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=547120, ups=1.1, wpb=496507, bsz=15978.1, num_updates=54900, lr=0.000269925, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=50943 epoch 033: 935 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=547120, ups=1.1, wpb=496507, bsz=15978.1, num_updates=54900, lr=0.000269925, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=50943 epoch 033: 935 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=547120, ups=1.1, wpb=496507, bsz=15978.1, num_updates=54900, lr=0.000269925, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=50943 epoch 033: 935 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=547120, ups=1.1, wpb=496507, bsz=15978.1, num_updates=54900, lr=0.000269925, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=50943 epoch 033: 935 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=547120, ups=1.1, wpb=496507, bsz=15978.1, num_updates=54900, lr=0.000269925, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=50943 epoch 033: 935 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=547120, ups=1.1, wpb=496507, bsz=15978.1, num_updates=54900, lr=0.000269925, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.4, wall=50943 epoch 033: 1035 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=548821, ups=1.11, wpb=495264, bsz=16465.2, num_updates=55000, lr=0.00026968, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51033 epoch 033: 1035 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=548821, ups=1.11, wpb=495264, bsz=16465.2, num_updates=55000, lr=0.00026968, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51033 epoch 033: 1035 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=548821, ups=1.11, wpb=495264, bsz=16465.2, num_updates=55000, lr=0.00026968, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51033 epoch 033: 1035 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=548821, ups=1.11, wpb=495264, bsz=16465.2, num_updates=55000, lr=0.00026968, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51033 epoch 033: 1035 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=548821, ups=1.11, wpb=495264, bsz=16465.2, num_updates=55000, lr=0.00026968, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51033 epoch 033: 1035 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=548821, ups=1.11, wpb=495264, bsz=16465.2, num_updates=55000, lr=0.00026968, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51033 epoch 033: 1035 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=548821, ups=1.11, wpb=495264, bsz=16465.2, num_updates=55000, lr=0.00026968, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51033 epoch 033: 1035 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=548821, ups=1.11, wpb=495264, bsz=16465.2, num_updates=55000, lr=0.00026968, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51033 epoch 033: 1035 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=548821, ups=1.11, wpb=495264, bsz=16465.2, num_updates=55000, lr=0.00026968, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51033 epoch 033: 1035 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=548821, ups=1.11, wpb=495264, bsz=16465.2, num_updates=55000, lr=0.00026968, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51033 epoch 033: 1035 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=548821, ups=1.11, wpb=495264, bsz=16465.2, num_updates=55000, lr=0.00026968, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51033 epoch 033: 1035 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=548821, ups=1.11, wpb=495264, bsz=16465.2, num_updates=55000, lr=0.00026968, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51033 epoch 033: 1035 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=548821, ups=1.11, wpb=495264, bsz=16465.2, num_updates=55000, lr=0.00026968, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51033 epoch 033: 1035 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=548821, ups=1.11, wpb=495264, bsz=16465.2, num_updates=55000, lr=0.00026968, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51033 epoch 033: 1035 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=548821, ups=1.11, wpb=495264, bsz=16465.2, num_updates=55000, lr=0.00026968, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51033 epoch 033: 1035 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=548821, ups=1.11, wpb=495264, bsz=16465.2, num_updates=55000, lr=0.00026968, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51033 epoch 033: 1035 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=548821, ups=1.11, wpb=495264, bsz=16465.2, num_updates=55000, lr=0.00026968, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51033 epoch 033: 1035 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=548821, ups=1.11, wpb=495264, bsz=16465.2, num_updates=55000, lr=0.00026968, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51033 epoch 033: 1035 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=548821, ups=1.11, wpb=495264, bsz=16465.2, num_updates=55000, lr=0.00026968, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51033 epoch 033: 1035 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=548821, ups=1.11, wpb=495264, bsz=16465.2, num_updates=55000, lr=0.00026968, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51033 epoch 033: 1035 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=548821, ups=1.11, wpb=495264, bsz=16465.2, num_updates=55000, lr=0.00026968, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51033 epoch 033: 1035 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=548821, ups=1.11, wpb=495264, bsz=16465.2, num_updates=55000, lr=0.00026968, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51033 epoch 033: 1035 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=548821, ups=1.11, wpb=495264, bsz=16465.2, num_updates=55000, lr=0.00026968, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51033 epoch 033: 1035 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=548821, ups=1.11, wpb=495264, bsz=16465.2, num_updates=55000, lr=0.00026968, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51033 epoch 033: 1035 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=548821, ups=1.11, wpb=495264, bsz=16465.2, num_updates=55000, lr=0.00026968, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51033 epoch 033: 1035 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=548821, ups=1.11, wpb=495264, bsz=16465.2, num_updates=55000, lr=0.00026968, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51033 epoch 033: 1035 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=548821, ups=1.11, wpb=495264, bsz=16465.2, num_updates=55000, lr=0.00026968, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51033 epoch 033: 1035 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=548821, ups=1.11, wpb=495264, bsz=16465.2, num_updates=55000, lr=0.00026968, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51033 epoch 033: 1035 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=548821, ups=1.11, wpb=495264, bsz=16465.2, num_updates=55000, lr=0.00026968, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51033 epoch 033: 1035 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=548821, ups=1.11, wpb=495264, bsz=16465.2, num_updates=55000, lr=0.00026968, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51033 epoch 033: 1035 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=548821, ups=1.11, wpb=495264, bsz=16465.2, num_updates=55000, lr=0.00026968, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51033 epoch 033: 1035 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=548821, ups=1.11, wpb=495264, bsz=16465.2, num_updates=55000, lr=0.00026968, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51033 epoch 033: 1035 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=548821, ups=1.11, wpb=495264, bsz=16465.2, num_updates=55000, lr=0.00026968, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=51033 begin validation on "valid" subset epoch 033 | valid on 'valid' subset | loss 3.712 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.701 epoch 033 | valid on 'valid' subset | loss 3.712 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.701 epoch 033 | valid on 'valid' subset | loss 3.712 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.701 epoch 033 | valid on 'valid' subset | loss 3.712 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.701 epoch 033 | valid on 'valid' subset | loss 3.712 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.701 epoch 033 | valid on 'valid' subset | loss 3.712 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.701 epoch 033 | valid on 'valid' subset | loss 3.712 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.701 epoch 033 | valid on 'valid' subset | loss 3.712 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.701 epoch 033 | valid on 'valid' subset | loss 3.712 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.701 epoch 033 | valid on 'valid' subset | loss 3.712 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.701 epoch 033 | valid on 'valid' subset | loss 3.712 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.701 epoch 033 | valid on 'valid' subset | loss 3.712 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.701 epoch 033 | valid on 'valid' subset | loss 3.712 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.701 epoch 033 | valid on 'valid' subset | loss 3.712 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.701 epoch 033 | valid on 'valid' subset | loss 3.712 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.701 epoch 033 | valid on 'valid' subset | loss 3.712 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.701 epoch 033 | valid on 'valid' subset | loss 3.712 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.701 epoch 033 | valid on 'valid' subset | loss 3.712 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.701 epoch 033 | valid on 'valid' subset | loss 3.712 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.701 epoch 033 | valid on 'valid' subset | loss 3.712 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.701 epoch 033 | valid on 'valid' subset | loss 3.712 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.701 epoch 033 | valid on 'valid' subset | loss 3.712 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.701 epoch 033 | valid on 'valid' subset | loss 3.712 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.701 epoch 033 | valid on 'valid' subset | loss 3.712 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.701 epoch 033 | valid on 'valid' subset | loss 3.712 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.701 epoch 033 | valid on 'valid' subset | loss 3.712 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.701 epoch 033 | valid on 'valid' subset | loss 3.712 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.701 epoch 033 | valid on 'valid' subset | loss 3.712 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.701 epoch 033 | valid on 'valid' subset | loss 3.712 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.701 epoch 033 | valid on 'valid' subset | loss 3.712 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.701 epoch 033 | valid on 'valid' subset | loss 3.712 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.701 epoch 033 | valid on 'valid' subset | loss 3.712 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.701 epoch 033 | valid on 'valid' subset | loss 3.712 | nll_loss 2.152 | ppl 4.44 | wps 0 | wpb 44526 | bsz 2008 | num_updates 55000 | best_loss 3.701 epoch 033: 1135 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=426509, ups=0.86, wpb=494420, bsz=16522, num_updates=55100, lr=0.000269435, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51149 epoch 033: 1135 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=426509, ups=0.86, wpb=494420, bsz=16522, num_updates=55100, lr=0.000269435, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51149 epoch 033: 1135 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=426509, ups=0.86, wpb=494420, bsz=16522, num_updates=55100, lr=0.000269435, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51149 epoch 033: 1135 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=426509, ups=0.86, wpb=494420, bsz=16522, num_updates=55100, lr=0.000269435, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51149 epoch 033: 1135 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=426509, ups=0.86, wpb=494420, bsz=16522, num_updates=55100, lr=0.000269435, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51149 epoch 033: 1135 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=426509, ups=0.86, wpb=494420, bsz=16522, num_updates=55100, lr=0.000269435, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51149 epoch 033: 1135 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=426509, ups=0.86, wpb=494420, bsz=16522, num_updates=55100, lr=0.000269435, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51149 epoch 033: 1135 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=426509, ups=0.86, wpb=494420, bsz=16522, num_updates=55100, lr=0.000269435, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51149 epoch 033: 1135 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=426509, ups=0.86, wpb=494420, bsz=16522, num_updates=55100, lr=0.000269435, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51149 epoch 033: 1135 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=426509, ups=0.86, wpb=494420, bsz=16522, num_updates=55100, lr=0.000269435, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51149 epoch 033: 1135 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=426509, ups=0.86, wpb=494420, bsz=16522, num_updates=55100, lr=0.000269435, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51149 epoch 033: 1135 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=426509, ups=0.86, wpb=494420, bsz=16522, num_updates=55100, lr=0.000269435, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51149 epoch 033: 1135 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=426509, ups=0.86, wpb=494420, bsz=16522, num_updates=55100, lr=0.000269435, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51149 epoch 033: 1135 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=426509, ups=0.86, wpb=494420, bsz=16522, num_updates=55100, lr=0.000269435, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51149 epoch 033: 1135 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=426509, ups=0.86, wpb=494420, bsz=16522, num_updates=55100, lr=0.000269435, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51149 epoch 033: 1135 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=426509, ups=0.86, wpb=494420, bsz=16522, num_updates=55100, lr=0.000269435, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51149 epoch 033: 1135 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=426509, ups=0.86, wpb=494420, bsz=16522, num_updates=55100, lr=0.000269435, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51149 epoch 033: 1135 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=426509, ups=0.86, wpb=494420, bsz=16522, num_updates=55100, lr=0.000269435, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51149 epoch 033: 1135 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=426509, ups=0.86, wpb=494420, bsz=16522, num_updates=55100, lr=0.000269435, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51149 epoch 033: 1135 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=426509, ups=0.86, wpb=494420, bsz=16522, num_updates=55100, lr=0.000269435, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51149 epoch 033: 1135 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=426509, ups=0.86, wpb=494420, bsz=16522, num_updates=55100, lr=0.000269435, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51149 epoch 033: 1135 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=426509, ups=0.86, wpb=494420, bsz=16522, num_updates=55100, lr=0.000269435, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51149 epoch 033: 1135 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=426509, ups=0.86, wpb=494420, bsz=16522, num_updates=55100, lr=0.000269435, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51149 epoch 033: 1135 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=426509, ups=0.86, wpb=494420, bsz=16522, num_updates=55100, lr=0.000269435, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51149 epoch 033: 1135 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=426509, ups=0.86, wpb=494420, bsz=16522, num_updates=55100, lr=0.000269435, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51149 epoch 033: 1135 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=426509, ups=0.86, wpb=494420, bsz=16522, num_updates=55100, lr=0.000269435, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51149 epoch 033: 1135 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=426509, ups=0.86, wpb=494420, bsz=16522, num_updates=55100, lr=0.000269435, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51149 epoch 033: 1135 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=426509, ups=0.86, wpb=494420, bsz=16522, num_updates=55100, lr=0.000269435, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51149 epoch 033: 1135 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=426509, ups=0.86, wpb=494420, bsz=16522, num_updates=55100, lr=0.000269435, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51149 epoch 033: 1135 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=426509, ups=0.86, wpb=494420, bsz=16522, num_updates=55100, lr=0.000269435, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51149 epoch 033: 1135 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=426509, ups=0.86, wpb=494420, bsz=16522, num_updates=55100, lr=0.000269435, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51149 epoch 033: 1135 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=426509, ups=0.86, wpb=494420, bsz=16522, num_updates=55100, lr=0.000269435, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51149 epoch 033: 1135 / 1689 loss=3.672, nll_loss=2.135, ppl=4.39, wps=426509, ups=0.86, wpb=494420, bsz=16522, num_updates=55100, lr=0.000269435, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51149 epoch 033: 1235 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=562354, ups=1.14, wpb=495404, bsz=17047.6, num_updates=55200, lr=0.000269191, gnorm=0.165, clip=0, loss_scale=2, train_wall=87, gb_free=22.4, wall=51237 epoch 033: 1235 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=562354, ups=1.14, wpb=495404, bsz=17047.6, num_updates=55200, lr=0.000269191, gnorm=0.165, clip=0, loss_scale=2, train_wall=87, gb_free=22.4, wall=51237 epoch 033: 1235 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=562354, ups=1.14, wpb=495404, bsz=17047.6, num_updates=55200, lr=0.000269191, gnorm=0.165, clip=0, loss_scale=2, train_wall=87, gb_free=22.4, wall=51237 epoch 033: 1235 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=562354, ups=1.14, wpb=495404, bsz=17047.6, num_updates=55200, lr=0.000269191, gnorm=0.165, clip=0, loss_scale=2, train_wall=87, gb_free=22.4, wall=51237 epoch 033: 1235 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=562354, ups=1.14, wpb=495404, bsz=17047.6, num_updates=55200, lr=0.000269191, gnorm=0.165, clip=0, loss_scale=2, train_wall=87, gb_free=22.4, wall=51237 epoch 033: 1235 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=562354, ups=1.14, wpb=495404, bsz=17047.6, num_updates=55200, lr=0.000269191, gnorm=0.165, clip=0, loss_scale=2, train_wall=87, gb_free=22.4, wall=51237 epoch 033: 1235 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=562354, ups=1.14, wpb=495404, bsz=17047.6, num_updates=55200, lr=0.000269191, gnorm=0.165, clip=0, loss_scale=2, train_wall=87, gb_free=22.4, wall=51237 epoch 033: 1235 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=562354, ups=1.14, wpb=495404, bsz=17047.6, num_updates=55200, lr=0.000269191, gnorm=0.165, clip=0, loss_scale=2, train_wall=87, gb_free=22.4, wall=51237 epoch 033: 1235 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=562354, ups=1.14, wpb=495404, bsz=17047.6, num_updates=55200, lr=0.000269191, gnorm=0.165, clip=0, loss_scale=2, train_wall=87, gb_free=22.4, wall=51237 epoch 033: 1235 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=562354, ups=1.14, wpb=495404, bsz=17047.6, num_updates=55200, lr=0.000269191, gnorm=0.165, clip=0, loss_scale=2, train_wall=87, gb_free=22.4, wall=51237 epoch 033: 1235 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=562354, ups=1.14, wpb=495404, bsz=17047.6, num_updates=55200, lr=0.000269191, gnorm=0.165, clip=0, loss_scale=2, train_wall=87, gb_free=22.4, wall=51237 epoch 033: 1235 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=562354, ups=1.14, wpb=495404, bsz=17047.6, num_updates=55200, lr=0.000269191, gnorm=0.165, clip=0, loss_scale=2, train_wall=87, gb_free=22.4, wall=51237 epoch 033: 1235 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=562354, ups=1.14, wpb=495404, bsz=17047.6, num_updates=55200, lr=0.000269191, gnorm=0.165, clip=0, loss_scale=2, train_wall=87, gb_free=22.4, wall=51237 epoch 033: 1235 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=562354, ups=1.14, wpb=495404, bsz=17047.6, num_updates=55200, lr=0.000269191, gnorm=0.165, clip=0, loss_scale=2, train_wall=87, gb_free=22.4, wall=51237 epoch 033: 1235 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=562354, ups=1.14, wpb=495404, bsz=17047.6, num_updates=55200, lr=0.000269191, gnorm=0.165, clip=0, loss_scale=2, train_wall=87, gb_free=22.4, wall=51237 epoch 033: 1235 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=562354, ups=1.14, wpb=495404, bsz=17047.6, num_updates=55200, lr=0.000269191, gnorm=0.165, clip=0, loss_scale=2, train_wall=87, gb_free=22.4, wall=51237 epoch 033: 1235 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=562354, ups=1.14, wpb=495404, bsz=17047.6, num_updates=55200, lr=0.000269191, gnorm=0.165, clip=0, loss_scale=2, train_wall=87, gb_free=22.4, wall=51237 epoch 033: 1235 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=562354, ups=1.14, wpb=495404, bsz=17047.6, num_updates=55200, lr=0.000269191, gnorm=0.165, clip=0, loss_scale=2, train_wall=87, gb_free=22.4, wall=51237 epoch 033: 1235 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=562354, ups=1.14, wpb=495404, bsz=17047.6, num_updates=55200, lr=0.000269191, gnorm=0.165, clip=0, loss_scale=2, train_wall=87, gb_free=22.4, wall=51237 epoch 033: 1235 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=562354, ups=1.14, wpb=495404, bsz=17047.6, num_updates=55200, lr=0.000269191, gnorm=0.165, clip=0, loss_scale=2, train_wall=87, gb_free=22.4, wall=51237 epoch 033: 1235 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=562354, ups=1.14, wpb=495404, bsz=17047.6, num_updates=55200, lr=0.000269191, gnorm=0.165, clip=0, loss_scale=2, train_wall=87, gb_free=22.4, wall=51237 epoch 033: 1235 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=562354, ups=1.14, wpb=495404, bsz=17047.6, num_updates=55200, lr=0.000269191, gnorm=0.165, clip=0, loss_scale=2, train_wall=87, gb_free=22.4, wall=51237 epoch 033: 1235 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=562354, ups=1.14, wpb=495404, bsz=17047.6, num_updates=55200, lr=0.000269191, gnorm=0.165, clip=0, loss_scale=2, train_wall=87, gb_free=22.4, wall=51237 epoch 033: 1235 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=562354, ups=1.14, wpb=495404, bsz=17047.6, num_updates=55200, lr=0.000269191, gnorm=0.165, clip=0, loss_scale=2, train_wall=87, gb_free=22.4, wall=51237 epoch 033: 1235 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=562354, ups=1.14, wpb=495404, bsz=17047.6, num_updates=55200, lr=0.000269191, gnorm=0.165, clip=0, loss_scale=2, train_wall=87, gb_free=22.4, wall=51237 epoch 033: 1235 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=562354, ups=1.14, wpb=495404, bsz=17047.6, num_updates=55200, lr=0.000269191, gnorm=0.165, clip=0, loss_scale=2, train_wall=87, gb_free=22.4, wall=51237 epoch 033: 1235 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=562354, ups=1.14, wpb=495404, bsz=17047.6, num_updates=55200, lr=0.000269191, gnorm=0.165, clip=0, loss_scale=2, train_wall=87, gb_free=22.4, wall=51237 epoch 033: 1235 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=562354, ups=1.14, wpb=495404, bsz=17047.6, num_updates=55200, lr=0.000269191, gnorm=0.165, clip=0, loss_scale=2, train_wall=87, gb_free=22.4, wall=51237 epoch 033: 1235 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=562354, ups=1.14, wpb=495404, bsz=17047.6, num_updates=55200, lr=0.000269191, gnorm=0.165, clip=0, loss_scale=2, train_wall=87, gb_free=22.4, wall=51237 epoch 033: 1235 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=562354, ups=1.14, wpb=495404, bsz=17047.6, num_updates=55200, lr=0.000269191, gnorm=0.165, clip=0, loss_scale=2, train_wall=87, gb_free=22.4, wall=51237 epoch 033: 1235 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=562354, ups=1.14, wpb=495404, bsz=17047.6, num_updates=55200, lr=0.000269191, gnorm=0.165, clip=0, loss_scale=2, train_wall=87, gb_free=22.4, wall=51237 epoch 033: 1235 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=562354, ups=1.14, wpb=495404, bsz=17047.6, num_updates=55200, lr=0.000269191, gnorm=0.165, clip=0, loss_scale=2, train_wall=87, gb_free=22.4, wall=51237 epoch 033: 1235 / 1689 loss=3.676, nll_loss=2.14, ppl=4.41, wps=562354, ups=1.14, wpb=495404, bsz=17047.6, num_updates=55200, lr=0.000269191, gnorm=0.165, clip=0, loss_scale=2, train_wall=87, gb_free=22.4, wall=51237 epoch 033: 1335 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=559964, ups=1.13, wpb=496263, bsz=16432.4, num_updates=55300, lr=0.000268947, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=51326 epoch 033: 1335 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=559964, ups=1.13, wpb=496263, bsz=16432.4, num_updates=55300, lr=0.000268947, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=51326 epoch 033: 1335 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=559964, ups=1.13, wpb=496263, bsz=16432.4, num_updates=55300, lr=0.000268947, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=51326 epoch 033: 1335 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=559964, ups=1.13, wpb=496263, bsz=16432.4, num_updates=55300, lr=0.000268947, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=51326 epoch 033: 1335 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=559964, ups=1.13, wpb=496263, bsz=16432.4, num_updates=55300, lr=0.000268947, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=51326 epoch 033: 1335 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=559964, ups=1.13, wpb=496263, bsz=16432.4, num_updates=55300, lr=0.000268947, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=51326 epoch 033: 1335 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=559964, ups=1.13, wpb=496263, bsz=16432.4, num_updates=55300, lr=0.000268947, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=51326 epoch 033: 1335 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=559964, ups=1.13, wpb=496263, bsz=16432.4, num_updates=55300, lr=0.000268947, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=51326 epoch 033: 1335 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=559964, ups=1.13, wpb=496263, bsz=16432.4, num_updates=55300, lr=0.000268947, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=51326 epoch 033: 1335 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=559964, ups=1.13, wpb=496263, bsz=16432.4, num_updates=55300, lr=0.000268947, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=51326 epoch 033: 1335 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=559964, ups=1.13, wpb=496263, bsz=16432.4, num_updates=55300, lr=0.000268947, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=51326 epoch 033: 1335 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=559964, ups=1.13, wpb=496263, bsz=16432.4, num_updates=55300, lr=0.000268947, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=51326 epoch 033: 1335 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=559964, ups=1.13, wpb=496263, bsz=16432.4, num_updates=55300, lr=0.000268947, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=51326 epoch 033: 1335 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=559964, ups=1.13, wpb=496263, bsz=16432.4, num_updates=55300, lr=0.000268947, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=51326 epoch 033: 1335 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=559964, ups=1.13, wpb=496263, bsz=16432.4, num_updates=55300, lr=0.000268947, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=51326 epoch 033: 1335 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=559964, ups=1.13, wpb=496263, bsz=16432.4, num_updates=55300, lr=0.000268947, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=51326 epoch 033: 1335 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=559964, ups=1.13, wpb=496263, bsz=16432.4, num_updates=55300, lr=0.000268947, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=51326 epoch 033: 1335 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=559964, ups=1.13, wpb=496263, bsz=16432.4, num_updates=55300, lr=0.000268947, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=51326 epoch 033: 1335 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=559964, ups=1.13, wpb=496263, bsz=16432.4, num_updates=55300, lr=0.000268947, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=51326 epoch 033: 1335 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=559964, ups=1.13, wpb=496263, bsz=16432.4, num_updates=55300, lr=0.000268947, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=51326 epoch 033: 1335 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=559964, ups=1.13, wpb=496263, bsz=16432.4, num_updates=55300, lr=0.000268947, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=51326 epoch 033: 1335 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=559964, ups=1.13, wpb=496263, bsz=16432.4, num_updates=55300, lr=0.000268947, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=51326 epoch 033: 1335 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=559964, ups=1.13, wpb=496263, bsz=16432.4, num_updates=55300, lr=0.000268947, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=51326 epoch 033: 1335 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=559964, ups=1.13, wpb=496263, bsz=16432.4, num_updates=55300, lr=0.000268947, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=51326 epoch 033: 1335 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=559964, ups=1.13, wpb=496263, bsz=16432.4, num_updates=55300, lr=0.000268947, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=51326 epoch 033: 1335 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=559964, ups=1.13, wpb=496263, bsz=16432.4, num_updates=55300, lr=0.000268947, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=51326 epoch 033: 1335 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=559964, ups=1.13, wpb=496263, bsz=16432.4, num_updates=55300, lr=0.000268947, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=51326 epoch 033: 1335 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=559964, ups=1.13, wpb=496263, bsz=16432.4, num_updates=55300, lr=0.000268947, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=51326 epoch 033: 1335 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=559964, ups=1.13, wpb=496263, bsz=16432.4, num_updates=55300, lr=0.000268947, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=51326 epoch 033: 1335 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=559964, ups=1.13, wpb=496263, bsz=16432.4, num_updates=55300, lr=0.000268947, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=51326 epoch 033: 1335 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=559964, ups=1.13, wpb=496263, bsz=16432.4, num_updates=55300, lr=0.000268947, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=51326 epoch 033: 1335 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=559964, ups=1.13, wpb=496263, bsz=16432.4, num_updates=55300, lr=0.000268947, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=51326 epoch 033: 1335 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=559964, ups=1.13, wpb=496263, bsz=16432.4, num_updates=55300, lr=0.000268947, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.5, wall=51326 epoch 033: 1435 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=554833, ups=1.12, wpb=496343, bsz=16769.8, num_updates=55400, lr=0.000268705, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=51415 epoch 033: 1435 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=554833, ups=1.12, wpb=496343, bsz=16769.8, num_updates=55400, lr=0.000268705, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=51415 epoch 033: 1435 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=554833, ups=1.12, wpb=496343, bsz=16769.8, num_updates=55400, lr=0.000268705, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=51415 epoch 033: 1435 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=554833, ups=1.12, wpb=496343, bsz=16769.8, num_updates=55400, lr=0.000268705, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=51415 epoch 033: 1435 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=554833, ups=1.12, wpb=496343, bsz=16769.8, num_updates=55400, lr=0.000268705, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=51415 epoch 033: 1435 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=554833, ups=1.12, wpb=496343, bsz=16769.8, num_updates=55400, lr=0.000268705, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=51415 epoch 033: 1435 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=554833, ups=1.12, wpb=496343, bsz=16769.8, num_updates=55400, lr=0.000268705, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=51415 epoch 033: 1435 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=554833, ups=1.12, wpb=496343, bsz=16769.8, num_updates=55400, lr=0.000268705, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=51415 epoch 033: 1435 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=554833, ups=1.12, wpb=496343, bsz=16769.8, num_updates=55400, lr=0.000268705, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=51415 epoch 033: 1435 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=554833, ups=1.12, wpb=496343, bsz=16769.8, num_updates=55400, lr=0.000268705, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=51415 epoch 033: 1435 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=554833, ups=1.12, wpb=496343, bsz=16769.8, num_updates=55400, lr=0.000268705, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=51415 epoch 033: 1435 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=554833, ups=1.12, wpb=496343, bsz=16769.8, num_updates=55400, lr=0.000268705, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=51415 epoch 033: 1435 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=554833, ups=1.12, wpb=496343, bsz=16769.8, num_updates=55400, lr=0.000268705, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=51415 epoch 033: 1435 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=554833, ups=1.12, wpb=496343, bsz=16769.8, num_updates=55400, lr=0.000268705, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=51415 epoch 033: 1435 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=554833, ups=1.12, wpb=496343, bsz=16769.8, num_updates=55400, lr=0.000268705, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=51415 epoch 033: 1435 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=554833, ups=1.12, wpb=496343, bsz=16769.8, num_updates=55400, lr=0.000268705, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=51415 epoch 033: 1435 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=554833, ups=1.12, wpb=496343, bsz=16769.8, num_updates=55400, lr=0.000268705, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=51415 epoch 033: 1435 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=554833, ups=1.12, wpb=496343, bsz=16769.8, num_updates=55400, lr=0.000268705, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=51415 epoch 033: 1435 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=554833, ups=1.12, wpb=496343, bsz=16769.8, num_updates=55400, lr=0.000268705, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=51415 epoch 033: 1435 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=554833, ups=1.12, wpb=496343, bsz=16769.8, num_updates=55400, lr=0.000268705, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=51415 epoch 033: 1435 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=554833, ups=1.12, wpb=496343, bsz=16769.8, num_updates=55400, lr=0.000268705, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=51415 epoch 033: 1435 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=554833, ups=1.12, wpb=496343, bsz=16769.8, num_updates=55400, lr=0.000268705, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=51415 epoch 033: 1435 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=554833, ups=1.12, wpb=496343, bsz=16769.8, num_updates=55400, lr=0.000268705, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=51415 epoch 033: 1435 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=554833, ups=1.12, wpb=496343, bsz=16769.8, num_updates=55400, lr=0.000268705, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=51415 epoch 033: 1435 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=554833, ups=1.12, wpb=496343, bsz=16769.8, num_updates=55400, lr=0.000268705, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=51415 epoch 033: 1435 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=554833, ups=1.12, wpb=496343, bsz=16769.8, num_updates=55400, lr=0.000268705, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=51415 epoch 033: 1435 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=554833, ups=1.12, wpb=496343, bsz=16769.8, num_updates=55400, lr=0.000268705, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=51415 epoch 033: 1435 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=554833, ups=1.12, wpb=496343, bsz=16769.8, num_updates=55400, lr=0.000268705, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=51415 epoch 033: 1435 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=554833, ups=1.12, wpb=496343, bsz=16769.8, num_updates=55400, lr=0.000268705, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=51415 epoch 033: 1435 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=554833, ups=1.12, wpb=496343, bsz=16769.8, num_updates=55400, lr=0.000268705, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=51415 epoch 033: 1435 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=554833, ups=1.12, wpb=496343, bsz=16769.8, num_updates=55400, lr=0.000268705, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=51415 epoch 033: 1435 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=554833, ups=1.12, wpb=496343, bsz=16769.8, num_updates=55400, lr=0.000268705, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=51415 epoch 033: 1435 / 1689 loss=3.674, nll_loss=2.137, ppl=4.4, wps=554833, ups=1.12, wpb=496343, bsz=16769.8, num_updates=55400, lr=0.000268705, gnorm=0.164, clip=0, loss_scale=4, train_wall=88, gb_free=21, wall=51415 epoch 033: 1536 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=545930, ups=1.1, wpb=495498, bsz=16534, num_updates=55500, lr=0.000268462, gnorm=0.168, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=51506 epoch 033: 1536 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=545930, ups=1.1, wpb=495498, bsz=16534, num_updates=55500, lr=0.000268462, gnorm=0.168, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=51506 epoch 033: 1536 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=545930, ups=1.1, wpb=495498, bsz=16534, num_updates=55500, lr=0.000268462, gnorm=0.168, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=51506 epoch 033: 1536 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=545930, ups=1.1, wpb=495498, bsz=16534, num_updates=55500, lr=0.000268462, gnorm=0.168, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=51506 epoch 033: 1536 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=545930, ups=1.1, wpb=495498, bsz=16534, num_updates=55500, lr=0.000268462, gnorm=0.168, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=51506 epoch 033: 1536 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=545930, ups=1.1, wpb=495498, bsz=16534, num_updates=55500, lr=0.000268462, gnorm=0.168, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=51506 epoch 033: 1536 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=545930, ups=1.1, wpb=495498, bsz=16534, num_updates=55500, lr=0.000268462, gnorm=0.168, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=51506 epoch 033: 1536 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=545930, ups=1.1, wpb=495498, bsz=16534, num_updates=55500, lr=0.000268462, gnorm=0.168, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=51506 epoch 033: 1536 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=545930, ups=1.1, wpb=495498, bsz=16534, num_updates=55500, lr=0.000268462, gnorm=0.168, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=51506 epoch 033: 1536 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=545930, ups=1.1, wpb=495498, bsz=16534, num_updates=55500, lr=0.000268462, gnorm=0.168, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=51506 epoch 033: 1536 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=545930, ups=1.1, wpb=495498, bsz=16534, num_updates=55500, lr=0.000268462, gnorm=0.168, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=51506 epoch 033: 1536 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=545930, ups=1.1, wpb=495498, bsz=16534, num_updates=55500, lr=0.000268462, gnorm=0.168, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=51506 epoch 033: 1536 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=545930, ups=1.1, wpb=495498, bsz=16534, num_updates=55500, lr=0.000268462, gnorm=0.168, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=51506 epoch 033: 1536 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=545930, ups=1.1, wpb=495498, bsz=16534, num_updates=55500, lr=0.000268462, gnorm=0.168, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=51506 epoch 033: 1536 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=545930, ups=1.1, wpb=495498, bsz=16534, num_updates=55500, lr=0.000268462, gnorm=0.168, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=51506 epoch 033: 1536 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=545930, ups=1.1, wpb=495498, bsz=16534, num_updates=55500, lr=0.000268462, gnorm=0.168, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=51506 epoch 033: 1536 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=545930, ups=1.1, wpb=495498, bsz=16534, num_updates=55500, lr=0.000268462, gnorm=0.168, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=51506 epoch 033: 1536 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=545930, ups=1.1, wpb=495498, bsz=16534, num_updates=55500, lr=0.000268462, gnorm=0.168, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=51506 epoch 033: 1536 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=545930, ups=1.1, wpb=495498, bsz=16534, num_updates=55500, lr=0.000268462, gnorm=0.168, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=51506 epoch 033: 1536 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=545930, ups=1.1, wpb=495498, bsz=16534, num_updates=55500, lr=0.000268462, gnorm=0.168, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=51506 epoch 033: 1536 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=545930, ups=1.1, wpb=495498, bsz=16534, num_updates=55500, lr=0.000268462, gnorm=0.168, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=51506 epoch 033: 1536 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=545930, ups=1.1, wpb=495498, bsz=16534, num_updates=55500, lr=0.000268462, gnorm=0.168, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=51506 epoch 033: 1536 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=545930, ups=1.1, wpb=495498, bsz=16534, num_updates=55500, lr=0.000268462, gnorm=0.168, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=51506 epoch 033: 1536 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=545930, ups=1.1, wpb=495498, bsz=16534, num_updates=55500, lr=0.000268462, gnorm=0.168, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=51506 epoch 033: 1536 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=545930, ups=1.1, wpb=495498, bsz=16534, num_updates=55500, lr=0.000268462, gnorm=0.168, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=51506 epoch 033: 1536 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=545930, ups=1.1, wpb=495498, bsz=16534, num_updates=55500, lr=0.000268462, gnorm=0.168, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=51506 epoch 033: 1536 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=545930, ups=1.1, wpb=495498, bsz=16534, num_updates=55500, lr=0.000268462, gnorm=0.168, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=51506 epoch 033: 1536 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=545930, ups=1.1, wpb=495498, bsz=16534, num_updates=55500, lr=0.000268462, gnorm=0.168, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=51506 epoch 033: 1536 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=545930, ups=1.1, wpb=495498, bsz=16534, num_updates=55500, lr=0.000268462, gnorm=0.168, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=51506 epoch 033: 1536 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=545930, ups=1.1, wpb=495498, bsz=16534, num_updates=55500, lr=0.000268462, gnorm=0.168, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=51506 epoch 033: 1536 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=545930, ups=1.1, wpb=495498, bsz=16534, num_updates=55500, lr=0.000268462, gnorm=0.168, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=51506 epoch 033: 1536 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=545930, ups=1.1, wpb=495498, bsz=16534, num_updates=55500, lr=0.000268462, gnorm=0.168, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=51506 epoch 033: 1536 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=545930, ups=1.1, wpb=495498, bsz=16534, num_updates=55500, lr=0.000268462, gnorm=0.168, clip=0, loss_scale=2, train_wall=90, gb_free=22.2, wall=51506 epoch 033: 1636 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=551863, ups=1.12, wpb=494940, bsz=16151.6, num_updates=55600, lr=0.000268221, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=51595 epoch 033: 1636 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=551863, ups=1.12, wpb=494940, bsz=16151.6, num_updates=55600, lr=0.000268221, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=51595 epoch 033: 1636 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=551863, ups=1.12, wpb=494940, bsz=16151.6, num_updates=55600, lr=0.000268221, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=51595 epoch 033: 1636 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=551863, ups=1.12, wpb=494940, bsz=16151.6, num_updates=55600, lr=0.000268221, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=51595 epoch 033: 1636 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=551863, ups=1.12, wpb=494940, bsz=16151.6, num_updates=55600, lr=0.000268221, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=51595 epoch 033: 1636 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=551863, ups=1.12, wpb=494940, bsz=16151.6, num_updates=55600, lr=0.000268221, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=51595 epoch 033: 1636 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=551863, ups=1.12, wpb=494940, bsz=16151.6, num_updates=55600, lr=0.000268221, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=51595 epoch 033: 1636 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=551863, ups=1.12, wpb=494940, bsz=16151.6, num_updates=55600, lr=0.000268221, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=51595 epoch 033: 1636 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=551863, ups=1.12, wpb=494940, bsz=16151.6, num_updates=55600, lr=0.000268221, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=51595 epoch 033: 1636 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=551863, ups=1.12, wpb=494940, bsz=16151.6, num_updates=55600, lr=0.000268221, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=51595 epoch 033: 1636 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=551863, ups=1.12, wpb=494940, bsz=16151.6, num_updates=55600, lr=0.000268221, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=51595 epoch 033: 1636 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=551863, ups=1.12, wpb=494940, bsz=16151.6, num_updates=55600, lr=0.000268221, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=51595 epoch 033: 1636 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=551863, ups=1.12, wpb=494940, bsz=16151.6, num_updates=55600, lr=0.000268221, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=51595 epoch 033: 1636 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=551863, ups=1.12, wpb=494940, bsz=16151.6, num_updates=55600, lr=0.000268221, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=51595 epoch 033: 1636 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=551863, ups=1.12, wpb=494940, bsz=16151.6, num_updates=55600, lr=0.000268221, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=51595 epoch 033: 1636 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=551863, ups=1.12, wpb=494940, bsz=16151.6, num_updates=55600, lr=0.000268221, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=51595 epoch 033: 1636 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=551863, ups=1.12, wpb=494940, bsz=16151.6, num_updates=55600, lr=0.000268221, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=51595 epoch 033: 1636 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=551863, ups=1.12, wpb=494940, bsz=16151.6, num_updates=55600, lr=0.000268221, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=51595 epoch 033: 1636 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=551863, ups=1.12, wpb=494940, bsz=16151.6, num_updates=55600, lr=0.000268221, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=51595 epoch 033: 1636 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=551863, ups=1.12, wpb=494940, bsz=16151.6, num_updates=55600, lr=0.000268221, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=51595 epoch 033: 1636 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=551863, ups=1.12, wpb=494940, bsz=16151.6, num_updates=55600, lr=0.000268221, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=51595 epoch 033: 1636 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=551863, ups=1.12, wpb=494940, bsz=16151.6, num_updates=55600, lr=0.000268221, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=51595 epoch 033: 1636 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=551863, ups=1.12, wpb=494940, bsz=16151.6, num_updates=55600, lr=0.000268221, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=51595 epoch 033: 1636 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=551863, ups=1.12, wpb=494940, bsz=16151.6, num_updates=55600, lr=0.000268221, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=51595 epoch 033: 1636 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=551863, ups=1.12, wpb=494940, bsz=16151.6, num_updates=55600, lr=0.000268221, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=51595 epoch 033: 1636 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=551863, ups=1.12, wpb=494940, bsz=16151.6, num_updates=55600, lr=0.000268221, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=51595 epoch 033: 1636 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=551863, ups=1.12, wpb=494940, bsz=16151.6, num_updates=55600, lr=0.000268221, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=51595 epoch 033: 1636 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=551863, ups=1.12, wpb=494940, bsz=16151.6, num_updates=55600, lr=0.000268221, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=51595 epoch 033: 1636 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=551863, ups=1.12, wpb=494940, bsz=16151.6, num_updates=55600, lr=0.000268221, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=51595 epoch 033: 1636 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=551863, ups=1.12, wpb=494940, bsz=16151.6, num_updates=55600, lr=0.000268221, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=51595 epoch 033: 1636 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=551863, ups=1.12, wpb=494940, bsz=16151.6, num_updates=55600, lr=0.000268221, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=51595 epoch 033: 1636 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=551863, ups=1.12, wpb=494940, bsz=16151.6, num_updates=55600, lr=0.000268221, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=51595 epoch 033: 1636 / 1689 loss=3.67, nll_loss=2.134, ppl=4.39, wps=551863, ups=1.12, wpb=494940, bsz=16151.6, num_updates=55600, lr=0.000268221, gnorm=0.166, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=51595 end of epoch 33 (average epoch stats below) epoch 033 | loss 3.664 | nll_loss 2.126 | ppl 4.36 | wps 514873 | ups 1.04 | wpb 495125 | bsz 16509.6 | num_updates 55653 | lr 0.000268093 | gnorm 0.166 | clip 0 | loss_scale 2 | train_wall 1526 | gb_free 23.9 | wall 51642 epoch 033 | loss 3.664 | nll_loss 2.126 | ppl 4.36 | wps 514873 | ups 1.04 | wpb 495125 | bsz 16509.6 | num_updates 55653 | lr 0.000268093 | gnorm 0.166 | clip 0 | loss_scale 2 | train_wall 1526 | gb_free 23.9 | wall 51642 epoch 033 | loss 3.664 | nll_loss 2.126 | ppl 4.36 | wps 514873 | ups 1.04 | wpb 495125 | bsz 16509.6 | num_updates 55653 | lr 0.000268093 | gnorm 0.166 | clip 0 | loss_scale 2 | train_wall 1526 | gb_free 23.9 | wall 51642 epoch 033 | loss 3.664 | nll_loss 2.126 | ppl 4.36 | wps 514873 | ups 1.04 | wpb 495125 | bsz 16509.6 | num_updates 55653 | lr 0.000268093 | gnorm 0.166 | clip 0 | loss_scale 2 | train_wall 1526 | gb_free 23.9 | wall 51642 epoch 033 | loss 3.664 | nll_loss 2.126 | ppl 4.36 | wps 514873 | ups 1.04 | wpb 495125 | bsz 16509.6 | num_updates 55653 | lr 0.000268093 | gnorm 0.166 | clip 0 | loss_scale 2 | train_wall 1526 | gb_free 23.9 | wall 51642 epoch 033 | loss 3.664 | nll_loss 2.126 | ppl 4.36 | wps 514873 | ups 1.04 | wpb 495125 | bsz 16509.6 | num_updates 55653 | lr 0.000268093 | gnorm 0.166 | clip 0 | loss_scale 2 | train_wall 1526 | gb_free 23.9 | wall 51642 epoch 033 | loss 3.664 | nll_loss 2.126 | ppl 4.36 | wps 514873 | ups 1.04 | wpb 495125 | bsz 16509.6 | num_updates 55653 | lr 0.000268093 | gnorm 0.166 | clip 0 | loss_scale 2 | train_wall 1526 | gb_free 23.9 | wall 51642 epoch 033 | loss 3.664 | nll_loss 2.126 | ppl 4.36 | wps 514873 | ups 1.04 | wpb 495125 | bsz 16509.6 | num_updates 55653 | lr 0.000268093 | gnorm 0.166 | clip 0 | loss_scale 2 | train_wall 1526 | gb_free 23.9 | wall 51642 epoch 033 | loss 3.664 | nll_loss 2.126 | ppl 4.36 | wps 514873 | ups 1.04 | wpb 495125 | bsz 16509.6 | num_updates 55653 | lr 0.000268093 | gnorm 0.166 | clip 0 | loss_scale 2 | train_wall 1526 | gb_free 23.9 | wall 51642 epoch 033 | loss 3.664 | nll_loss 2.126 | ppl 4.36 | wps 514873 | ups 1.04 | wpb 495125 | bsz 16509.6 | num_updates 55653 | lr 0.000268093 | gnorm 0.166 | clip 0 | loss_scale 2 | train_wall 1526 | gb_free 23.9 | wall 51642 epoch 033 | loss 3.664 | nll_loss 2.126 | ppl 4.36 | wps 514873 | ups 1.04 | wpb 495125 | bsz 16509.6 | num_updates 55653 | lr 0.000268093 | gnorm 0.166 | clip 0 | loss_scale 2 | train_wall 1526 | gb_free 23.9 | wall 51642 epoch 033 | loss 3.664 | nll_loss 2.126 | ppl 4.36 | wps 514873 | ups 1.04 | wpb 495125 | bsz 16509.6 | num_updates 55653 | lr 0.000268093 | gnorm 0.166 | clip 0 | loss_scale 2 | train_wall 1526 | gb_free 23.9 | wall 51642 epoch 033 | loss 3.664 | nll_loss 2.126 | ppl 4.36 | wps 514873 | ups 1.04 | wpb 495125 | bsz 16509.6 | num_updates 55653 | lr 0.000268093 | gnorm 0.166 | clip 0 | loss_scale 2 | train_wall 1526 | gb_free 23.9 | wall 51642 epoch 033 | loss 3.664 | nll_loss 2.126 | ppl 4.36 | wps 514873 | ups 1.04 | wpb 495125 | bsz 16509.6 | num_updates 55653 | lr 0.000268093 | gnorm 0.166 | clip 0 | loss_scale 2 | train_wall 1526 | gb_free 23.9 | wall 51642 epoch 033 | loss 3.664 | nll_loss 2.126 | ppl 4.36 | wps 514873 | ups 1.04 | wpb 495125 | bsz 16509.6 | num_updates 55653 | lr 0.000268093 | gnorm 0.166 | clip 0 | loss_scale 2 | train_wall 1526 | gb_free 23.9 | wall 51642 epoch 033 | loss 3.664 | nll_loss 2.126 | ppl 4.36 | wps 514873 | ups 1.04 | wpb 495125 | bsz 16509.6 | num_updates 55653 | lr 0.000268093 | gnorm 0.166 | clip 0 | loss_scale 2 | train_wall 1526 | gb_free 23.9 | wall 51642 epoch 033 | loss 3.664 | nll_loss 2.126 | ppl 4.36 | wps 514873 | ups 1.04 | wpb 495125 | bsz 16509.6 | num_updates 55653 | lr 0.000268093 | gnorm 0.166 | clip 0 | loss_scale 2 | train_wall 1526 | gb_free 23.9 | wall 51642 epoch 033 | loss 3.664 | nll_loss 2.126 | ppl 4.36 | wps 514873 | ups 1.04 | wpb 495125 | bsz 16509.6 | num_updates 55653 | lr 0.000268093 | gnorm 0.166 | clip 0 | loss_scale 2 | train_wall 1526 | gb_free 23.9 | wall 51642 epoch 033 | loss 3.664 | nll_loss 2.126 | ppl 4.36 | wps 514873 | ups 1.04 | wpb 495125 | bsz 16509.6 | num_updates 55653 | lr 0.000268093 | gnorm 0.166 | clip 0 | loss_scale 2 | train_wall 1526 | gb_free 23.9 | wall 51642 epoch 033 | loss 3.664 | nll_loss 2.126 | ppl 4.36 | wps 514873 | ups 1.04 | wpb 495125 | bsz 16509.6 | num_updates 55653 | lr 0.000268093 | gnorm 0.166 | clip 0 | loss_scale 2 | train_wall 1526 | gb_free 23.9 | wall 51642 epoch 033 | loss 3.664 | nll_loss 2.126 | ppl 4.36 | wps 514873 | ups 1.04 | wpb 495125 | bsz 16509.6 | num_updates 55653 | lr 0.000268093 | gnorm 0.166 | clip 0 | loss_scale 2 | train_wall 1526 | gb_free 23.9 | wall 51642 epoch 033 | loss 3.664 | nll_loss 2.126 | ppl 4.36 | wps 514873 | ups 1.04 | wpb 495125 | bsz 16509.6 | num_updates 55653 | lr 0.000268093 | gnorm 0.166 | clip 0 | loss_scale 2 | train_wall 1526 | gb_free 23.9 | wall 51642 epoch 033 | loss 3.664 | nll_loss 2.126 | ppl 4.36 | wps 514873 | ups 1.04 | wpb 495125 | bsz 16509.6 | num_updates 55653 | lr 0.000268093 | gnorm 0.166 | clip 0 | loss_scale 2 | train_wall 1526 | gb_free 23.9 | wall 51642 epoch 033 | loss 3.664 | nll_loss 2.126 | ppl 4.36 | wps 514873 | ups 1.04 | wpb 495125 | bsz 16509.6 | num_updates 55653 | lr 0.000268093 | gnorm 0.166 | clip 0 | loss_scale 2 | train_wall 1526 | gb_free 23.9 | wall 51642 epoch 033 | loss 3.664 | nll_loss 2.126 | ppl 4.36 | wps 514873 | ups 1.04 | wpb 495125 | bsz 16509.6 | num_updates 55653 | lr 0.000268093 | gnorm 0.166 | clip 0 | loss_scale 2 | train_wall 1526 | gb_free 23.9 | wall 51642 epoch 033 | loss 3.664 | nll_loss 2.126 | ppl 4.36 | wps 514873 | ups 1.04 | wpb 495125 | bsz 16509.6 | num_updates 55653 | lr 0.000268093 | gnorm 0.166 | clip 0 | loss_scale 2 | train_wall 1526 | gb_free 23.9 | wall 51642 epoch 033 | loss 3.664 | nll_loss 2.126 | ppl 4.36 | wps 514873 | ups 1.04 | wpb 495125 | bsz 16509.6 | num_updates 55653 | lr 0.000268093 | gnorm 0.166 | clip 0 | loss_scale 2 | train_wall 1526 | gb_free 23.9 | wall 51642 epoch 033 | loss 3.664 | nll_loss 2.126 | ppl 4.36 | wps 514873 | ups 1.04 | wpb 495125 | bsz 16509.6 | num_updates 55653 | lr 0.000268093 | gnorm 0.166 | clip 0 | loss_scale 2 | train_wall 1526 | gb_free 23.9 | wall 51642 epoch 033 | loss 3.664 | nll_loss 2.126 | ppl 4.36 | wps 514873 | ups 1.04 | wpb 495125 | bsz 16509.6 | num_updates 55653 | lr 0.000268093 | gnorm 0.166 | clip 0 | loss_scale 2 | train_wall 1526 | gb_free 23.9 | wall 51642 epoch 033 | loss 3.664 | nll_loss 2.126 | ppl 4.36 | wps 514873 | ups 1.04 | wpb 495125 | bsz 16509.6 | num_updates 55653 | lr 0.000268093 | gnorm 0.166 | clip 0 | loss_scale 2 | train_wall 1526 | gb_free 23.9 | wall 51642 epoch 033 | loss 3.664 | nll_loss 2.126 | ppl 4.36 | wps 514873 | ups 1.04 | wpb 495125 | bsz 16509.6 | num_updates 55653 | lr 0.000268093 | gnorm 0.166 | clip 0 | loss_scale 2 | train_wall 1526 | gb_free 23.9 | wall 51642 epoch 033 | loss 3.664 | nll_loss 2.126 | ppl 4.36 | wps 514873 | ups 1.04 | wpb 495125 | bsz 16509.6 | num_updates 55653 | lr 0.000268093 | gnorm 0.166 | clip 0 | loss_scale 2 | train_wall 1526 | gb_free 23.9 | wall 51642 epoch 033 | loss 3.664 | nll_loss 2.126 | ppl 4.36 | wps 514873 | ups 1.04 | wpb 495125 | bsz 16509.6 | num_updates 55653 | lr 0.000268093 | gnorm 0.166 | clip 0 | loss_scale 2 | train_wall 1526 | gb_free 23.9 | wall 51642 Start iterating over samples epoch 034: 47 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=549274, ups=1.12, wpb=491315, bsz=16158.5, num_updates=55700, lr=0.00026798, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51685 epoch 034: 47 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=549274, ups=1.12, wpb=491315, bsz=16158.5, num_updates=55700, lr=0.00026798, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51685 epoch 034: 47 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=549274, ups=1.12, wpb=491315, bsz=16158.5, num_updates=55700, lr=0.00026798, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51685 epoch 034: 47 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=549274, ups=1.12, wpb=491315, bsz=16158.5, num_updates=55700, lr=0.00026798, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51685 epoch 034: 47 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=549274, ups=1.12, wpb=491315, bsz=16158.5, num_updates=55700, lr=0.00026798, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51685 epoch 034: 47 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=549274, ups=1.12, wpb=491315, bsz=16158.5, num_updates=55700, lr=0.00026798, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51685 epoch 034: 47 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=549274, ups=1.12, wpb=491315, bsz=16158.5, num_updates=55700, lr=0.00026798, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51685 epoch 034: 47 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=549274, ups=1.12, wpb=491315, bsz=16158.5, num_updates=55700, lr=0.00026798, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51685 epoch 034: 47 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=549274, ups=1.12, wpb=491315, bsz=16158.5, num_updates=55700, lr=0.00026798, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51685 epoch 034: 47 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=549274, ups=1.12, wpb=491315, bsz=16158.5, num_updates=55700, lr=0.00026798, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51685 epoch 034: 47 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=549274, ups=1.12, wpb=491315, bsz=16158.5, num_updates=55700, lr=0.00026798, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51685 epoch 034: 47 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=549274, ups=1.12, wpb=491315, bsz=16158.5, num_updates=55700, lr=0.00026798, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51685 epoch 034: 47 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=549274, ups=1.12, wpb=491315, bsz=16158.5, num_updates=55700, lr=0.00026798, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51685 epoch 034: 47 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=549274, ups=1.12, wpb=491315, bsz=16158.5, num_updates=55700, lr=0.00026798, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51685 epoch 034: 47 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=549274, ups=1.12, wpb=491315, bsz=16158.5, num_updates=55700, lr=0.00026798, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51685 epoch 034: 47 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=549274, ups=1.12, wpb=491315, bsz=16158.5, num_updates=55700, lr=0.00026798, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51685 epoch 034: 47 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=549274, ups=1.12, wpb=491315, bsz=16158.5, num_updates=55700, lr=0.00026798, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51685 epoch 034: 47 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=549274, ups=1.12, wpb=491315, bsz=16158.5, num_updates=55700, lr=0.00026798, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51685 epoch 034: 47 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=549274, ups=1.12, wpb=491315, bsz=16158.5, num_updates=55700, lr=0.00026798, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51685 epoch 034: 47 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=549274, ups=1.12, wpb=491315, bsz=16158.5, num_updates=55700, lr=0.00026798, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51685 epoch 034: 47 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=549274, ups=1.12, wpb=491315, bsz=16158.5, num_updates=55700, lr=0.00026798, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51685 epoch 034: 47 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=549274, ups=1.12, wpb=491315, bsz=16158.5, num_updates=55700, lr=0.00026798, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51685 epoch 034: 47 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=549274, ups=1.12, wpb=491315, bsz=16158.5, num_updates=55700, lr=0.00026798, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51685 epoch 034: 47 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=549274, ups=1.12, wpb=491315, bsz=16158.5, num_updates=55700, lr=0.00026798, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51685 epoch 034: 47 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=549274, ups=1.12, wpb=491315, bsz=16158.5, num_updates=55700, lr=0.00026798, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51685 epoch 034: 47 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=549274, ups=1.12, wpb=491315, bsz=16158.5, num_updates=55700, lr=0.00026798, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51685 epoch 034: 47 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=549274, ups=1.12, wpb=491315, bsz=16158.5, num_updates=55700, lr=0.00026798, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51685 epoch 034: 47 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=549274, ups=1.12, wpb=491315, bsz=16158.5, num_updates=55700, lr=0.00026798, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51685 epoch 034: 47 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=549274, ups=1.12, wpb=491315, bsz=16158.5, num_updates=55700, lr=0.00026798, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51685 epoch 034: 47 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=549274, ups=1.12, wpb=491315, bsz=16158.5, num_updates=55700, lr=0.00026798, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51685 epoch 034: 47 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=549274, ups=1.12, wpb=491315, bsz=16158.5, num_updates=55700, lr=0.00026798, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51685 epoch 034: 47 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=549274, ups=1.12, wpb=491315, bsz=16158.5, num_updates=55700, lr=0.00026798, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51685 epoch 034: 47 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=549274, ups=1.12, wpb=491315, bsz=16158.5, num_updates=55700, lr=0.00026798, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51685 epoch 034: 47 / 1689 loss=3.663, nll_loss=2.125, ppl=4.36, wps=549274, ups=1.12, wpb=491315, bsz=16158.5, num_updates=55700, lr=0.00026798, gnorm=0.169, clip=0, loss_scale=2, train_wall=88, gb_free=22.1, wall=51685 epoch 034: 147 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=549521, ups=1.11, wpb=496324, bsz=17007.4, num_updates=55800, lr=0.00026774, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=51775 epoch 034: 147 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=549521, ups=1.11, wpb=496324, bsz=17007.4, num_updates=55800, lr=0.00026774, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=51775 epoch 034: 147 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=549521, ups=1.11, wpb=496324, bsz=17007.4, num_updates=55800, lr=0.00026774, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=51775 epoch 034: 147 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=549521, ups=1.11, wpb=496324, bsz=17007.4, num_updates=55800, lr=0.00026774, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=51775 epoch 034: 147 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=549521, ups=1.11, wpb=496324, bsz=17007.4, num_updates=55800, lr=0.00026774, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=51775 epoch 034: 147 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=549521, ups=1.11, wpb=496324, bsz=17007.4, num_updates=55800, lr=0.00026774, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=51775 epoch 034: 147 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=549521, ups=1.11, wpb=496324, bsz=17007.4, num_updates=55800, lr=0.00026774, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=51775 epoch 034: 147 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=549521, ups=1.11, wpb=496324, bsz=17007.4, num_updates=55800, lr=0.00026774, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=51775 epoch 034: 147 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=549521, ups=1.11, wpb=496324, bsz=17007.4, num_updates=55800, lr=0.00026774, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=51775 epoch 034: 147 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=549521, ups=1.11, wpb=496324, bsz=17007.4, num_updates=55800, lr=0.00026774, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=51775 epoch 034: 147 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=549521, ups=1.11, wpb=496324, bsz=17007.4, num_updates=55800, lr=0.00026774, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=51775 epoch 034: 147 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=549521, ups=1.11, wpb=496324, bsz=17007.4, num_updates=55800, lr=0.00026774, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=51775 epoch 034: 147 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=549521, ups=1.11, wpb=496324, bsz=17007.4, num_updates=55800, lr=0.00026774, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=51775 epoch 034: 147 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=549521, ups=1.11, wpb=496324, bsz=17007.4, num_updates=55800, lr=0.00026774, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=51775 epoch 034: 147 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=549521, ups=1.11, wpb=496324, bsz=17007.4, num_updates=55800, lr=0.00026774, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=51775 epoch 034: 147 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=549521, ups=1.11, wpb=496324, bsz=17007.4, num_updates=55800, lr=0.00026774, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=51775 epoch 034: 147 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=549521, ups=1.11, wpb=496324, bsz=17007.4, num_updates=55800, lr=0.00026774, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=51775 epoch 034: 147 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=549521, ups=1.11, wpb=496324, bsz=17007.4, num_updates=55800, lr=0.00026774, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=51775 epoch 034: 147 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=549521, ups=1.11, wpb=496324, bsz=17007.4, num_updates=55800, lr=0.00026774, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=51775 epoch 034: 147 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=549521, ups=1.11, wpb=496324, bsz=17007.4, num_updates=55800, lr=0.00026774, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=51775 epoch 034: 147 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=549521, ups=1.11, wpb=496324, bsz=17007.4, num_updates=55800, lr=0.00026774, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=51775 epoch 034: 147 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=549521, ups=1.11, wpb=496324, bsz=17007.4, num_updates=55800, lr=0.00026774, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=51775 epoch 034: 147 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=549521, ups=1.11, wpb=496324, bsz=17007.4, num_updates=55800, lr=0.00026774, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=51775 epoch 034: 147 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=549521, ups=1.11, wpb=496324, bsz=17007.4, num_updates=55800, lr=0.00026774, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=51775 epoch 034: 147 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=549521, ups=1.11, wpb=496324, bsz=17007.4, num_updates=55800, lr=0.00026774, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=51775 epoch 034: 147 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=549521, ups=1.11, wpb=496324, bsz=17007.4, num_updates=55800, lr=0.00026774, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=51775 epoch 034: 147 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=549521, ups=1.11, wpb=496324, bsz=17007.4, num_updates=55800, lr=0.00026774, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=51775 epoch 034: 147 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=549521, ups=1.11, wpb=496324, bsz=17007.4, num_updates=55800, lr=0.00026774, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=51775 epoch 034: 147 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=549521, ups=1.11, wpb=496324, bsz=17007.4, num_updates=55800, lr=0.00026774, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=51775 epoch 034: 147 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=549521, ups=1.11, wpb=496324, bsz=17007.4, num_updates=55800, lr=0.00026774, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=51775 epoch 034: 147 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=549521, ups=1.11, wpb=496324, bsz=17007.4, num_updates=55800, lr=0.00026774, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=51775 epoch 034: 147 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=549521, ups=1.11, wpb=496324, bsz=17007.4, num_updates=55800, lr=0.00026774, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=51775 epoch 034: 147 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=549521, ups=1.11, wpb=496324, bsz=17007.4, num_updates=55800, lr=0.00026774, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=51775 epoch 034: 147 / 1689 loss=3.648, nll_loss=2.109, ppl=4.31, wps=549521, ups=1.11, wpb=496324, bsz=17007.4, num_updates=55800, lr=0.00026774, gnorm=0.165, clip=0, loss_scale=2, train_wall=89, gb_free=21.8, wall=51775 epoch 034: 247 / 1689 loss=3.652, nll_loss=2.112, ppl=4.32, wps=554030, ups=1.12, wpb=496011, bsz=16021.1, num_updates=55900, lr=0.0002675, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=51865 epoch 034: 247 / 1689 loss=3.652, nll_loss=2.112, ppl=4.32, wps=554030, ups=1.12, wpb=496011, bsz=16021.1, num_updates=55900, lr=0.0002675, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=51865 epoch 034: 247 / 1689 loss=3.652, nll_loss=2.112, ppl=4.32, wps=554030, ups=1.12, wpb=496011, bsz=16021.1, num_updates=55900, lr=0.0002675, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=51865 epoch 034: 247 / 1689 loss=3.652, nll_loss=2.112, ppl=4.32, wps=554030, ups=1.12, wpb=496011, bsz=16021.1, num_updates=55900, lr=0.0002675, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=51865 epoch 034: 247 / 1689 loss=3.652, nll_loss=2.112, ppl=4.32, wps=554030, ups=1.12, wpb=496011, bsz=16021.1, num_updates=55900, lr=0.0002675, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=51865 epoch 034: 247 / 1689 loss=3.652, nll_loss=2.112, ppl=4.32, wps=554030, ups=1.12, wpb=496011, bsz=16021.1, num_updates=55900, lr=0.0002675, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=51865 epoch 034: 247 / 1689 loss=3.652, nll_loss=2.112, ppl=4.32, wps=554030, ups=1.12, wpb=496011, bsz=16021.1, num_updates=55900, lr=0.0002675, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=51865 epoch 034: 247 / 1689 loss=3.652, nll_loss=2.112, ppl=4.32, wps=554030, ups=1.12, wpb=496011, bsz=16021.1, num_updates=55900, lr=0.0002675, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=51865 epoch 034: 247 / 1689 loss=3.652, nll_loss=2.112, ppl=4.32, wps=554030, ups=1.12, wpb=496011, bsz=16021.1, num_updates=55900, lr=0.0002675, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=51865 epoch 034: 247 / 1689 loss=3.652, nll_loss=2.112, ppl=4.32, wps=554030, ups=1.12, wpb=496011, bsz=16021.1, num_updates=55900, lr=0.0002675, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=51865 epoch 034: 247 / 1689 loss=3.652, nll_loss=2.112, ppl=4.32, wps=554030, ups=1.12, wpb=496011, bsz=16021.1, num_updates=55900, lr=0.0002675, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=51865 epoch 034: 247 / 1689 loss=3.652, nll_loss=2.112, ppl=4.32, wps=554030, ups=1.12, wpb=496011, bsz=16021.1, num_updates=55900, lr=0.0002675, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=51865 epoch 034: 247 / 1689 loss=3.652, nll_loss=2.112, ppl=4.32, wps=554030, ups=1.12, wpb=496011, bsz=16021.1, num_updates=55900, lr=0.0002675, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=51865 epoch 034: 247 / 1689 loss=3.652, nll_loss=2.112, ppl=4.32, wps=554030, ups=1.12, wpb=496011, bsz=16021.1, num_updates=55900, lr=0.0002675, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=51865 epoch 034: 247 / 1689 loss=3.652, nll_loss=2.112, ppl=4.32, wps=554030, ups=1.12, wpb=496011, bsz=16021.1, num_updates=55900, lr=0.0002675, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=51865 epoch 034: 247 / 1689 loss=3.652, nll_loss=2.112, ppl=4.32, wps=554030, ups=1.12, wpb=496011, bsz=16021.1, num_updates=55900, lr=0.0002675, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=51865 epoch 034: 247 / 1689 loss=3.652, nll_loss=2.112, ppl=4.32, wps=554030, ups=1.12, wpb=496011, bsz=16021.1, num_updates=55900, lr=0.0002675, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=51865 epoch 034: 247 / 1689 loss=3.652, nll_loss=2.112, ppl=4.32, wps=554030, ups=1.12, wpb=496011, bsz=16021.1, num_updates=55900, lr=0.0002675, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=51865 epoch 034: 247 / 1689 loss=3.652, nll_loss=2.112, ppl=4.32, wps=554030, ups=1.12, wpb=496011, bsz=16021.1, num_updates=55900, lr=0.0002675, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=51865 epoch 034: 247 / 1689 loss=3.652, nll_loss=2.112, ppl=4.32, wps=554030, ups=1.12, wpb=496011, bsz=16021.1, num_updates=55900, lr=0.0002675, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=51865 epoch 034: 247 / 1689 loss=3.652, nll_loss=2.112, ppl=4.32, wps=554030, ups=1.12, wpb=496011, bsz=16021.1, num_updates=55900, lr=0.0002675, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=51865 epoch 034: 247 / 1689 loss=3.652, nll_loss=2.112, ppl=4.32, wps=554030, ups=1.12, wpb=496011, bsz=16021.1, num_updates=55900, lr=0.0002675, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=51865 epoch 034: 247 / 1689 loss=3.652, nll_loss=2.112, ppl=4.32, wps=554030, ups=1.12, wpb=496011, bsz=16021.1, num_updates=55900, lr=0.0002675, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=51865 epoch 034: 247 / 1689 loss=3.652, nll_loss=2.112, ppl=4.32, wps=554030, ups=1.12, wpb=496011, bsz=16021.1, num_updates=55900, lr=0.0002675, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=51865 epoch 034: 247 / 1689 loss=3.652, nll_loss=2.112, ppl=4.32, wps=554030, ups=1.12, wpb=496011, bsz=16021.1, num_updates=55900, lr=0.0002675, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=51865 epoch 034: 247 / 1689 loss=3.652, nll_loss=2.112, ppl=4.32, wps=554030, ups=1.12, wpb=496011, bsz=16021.1, num_updates=55900, lr=0.0002675, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=51865 epoch 034: 247 / 1689 loss=3.652, nll_loss=2.112, ppl=4.32, wps=554030, ups=1.12, wpb=496011, bsz=16021.1, num_updates=55900, lr=0.0002675, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=51865 epoch 034: 247 / 1689 loss=3.652, nll_loss=2.112, ppl=4.32, wps=554030, ups=1.12, wpb=496011, bsz=16021.1, num_updates=55900, lr=0.0002675, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=51865 epoch 034: 247 / 1689 loss=3.652, nll_loss=2.112, ppl=4.32, wps=554030, ups=1.12, wpb=496011, bsz=16021.1, num_updates=55900, lr=0.0002675, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=51865 epoch 034: 247 / 1689 loss=3.652, nll_loss=2.112, ppl=4.32, wps=554030, ups=1.12, wpb=496011, bsz=16021.1, num_updates=55900, lr=0.0002675, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=51865 epoch 034: 247 / 1689 loss=3.652, nll_loss=2.112, ppl=4.32, wps=554030, ups=1.12, wpb=496011, bsz=16021.1, num_updates=55900, lr=0.0002675, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=51865 epoch 034: 247 / 1689 loss=3.652, nll_loss=2.112, ppl=4.32, wps=554030, ups=1.12, wpb=496011, bsz=16021.1, num_updates=55900, lr=0.0002675, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=51865 epoch 034: 247 / 1689 loss=3.652, nll_loss=2.112, ppl=4.32, wps=554030, ups=1.12, wpb=496011, bsz=16021.1, num_updates=55900, lr=0.0002675, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=51865 epoch 034: 247 / 1689 loss=3.652, nll_loss=2.112, ppl=4.32, wps=554030, ups=1.12, wpb=496011, bsz=16021.1, num_updates=55900, lr=0.0002675, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.9, wall=51865 epoch 034: 347 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=550058, ups=1.11, wpb=495501, bsz=16304.2, num_updates=56000, lr=0.000267261, gnorm=0.155, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=51955 epoch 034: 347 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=550058, ups=1.11, wpb=495501, bsz=16304.2, num_updates=56000, lr=0.000267261, gnorm=0.155, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=51955 epoch 034: 347 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=550058, ups=1.11, wpb=495501, bsz=16304.2, num_updates=56000, lr=0.000267261, gnorm=0.155, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=51955 epoch 034: 347 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=550058, ups=1.11, wpb=495501, bsz=16304.2, num_updates=56000, lr=0.000267261, gnorm=0.155, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=51955 epoch 034: 347 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=550058, ups=1.11, wpb=495501, bsz=16304.2, num_updates=56000, lr=0.000267261, gnorm=0.155, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=51955 epoch 034: 347 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=550058, ups=1.11, wpb=495501, bsz=16304.2, num_updates=56000, lr=0.000267261, gnorm=0.155, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=51955 epoch 034: 347 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=550058, ups=1.11, wpb=495501, bsz=16304.2, num_updates=56000, lr=0.000267261, gnorm=0.155, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=51955 epoch 034: 347 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=550058, ups=1.11, wpb=495501, bsz=16304.2, num_updates=56000, lr=0.000267261, gnorm=0.155, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=51955 epoch 034: 347 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=550058, ups=1.11, wpb=495501, bsz=16304.2, num_updates=56000, lr=0.000267261, gnorm=0.155, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=51955 epoch 034: 347 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=550058, ups=1.11, wpb=495501, bsz=16304.2, num_updates=56000, lr=0.000267261, gnorm=0.155, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=51955 epoch 034: 347 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=550058, ups=1.11, wpb=495501, bsz=16304.2, num_updates=56000, lr=0.000267261, gnorm=0.155, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=51955 epoch 034: 347 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=550058, ups=1.11, wpb=495501, bsz=16304.2, num_updates=56000, lr=0.000267261, gnorm=0.155, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=51955 epoch 034: 347 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=550058, ups=1.11, wpb=495501, bsz=16304.2, num_updates=56000, lr=0.000267261, gnorm=0.155, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=51955 epoch 034: 347 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=550058, ups=1.11, wpb=495501, bsz=16304.2, num_updates=56000, lr=0.000267261, gnorm=0.155, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=51955 epoch 034: 347 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=550058, ups=1.11, wpb=495501, bsz=16304.2, num_updates=56000, lr=0.000267261, gnorm=0.155, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=51955 epoch 034: 347 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=550058, ups=1.11, wpb=495501, bsz=16304.2, num_updates=56000, lr=0.000267261, gnorm=0.155, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=51955 epoch 034: 347 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=550058, ups=1.11, wpb=495501, bsz=16304.2, num_updates=56000, lr=0.000267261, gnorm=0.155, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=51955 epoch 034: 347 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=550058, ups=1.11, wpb=495501, bsz=16304.2, num_updates=56000, lr=0.000267261, gnorm=0.155, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=51955 epoch 034: 347 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=550058, ups=1.11, wpb=495501, bsz=16304.2, num_updates=56000, lr=0.000267261, gnorm=0.155, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=51955 epoch 034: 347 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=550058, ups=1.11, wpb=495501, bsz=16304.2, num_updates=56000, lr=0.000267261, gnorm=0.155, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=51955 epoch 034: 347 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=550058, ups=1.11, wpb=495501, bsz=16304.2, num_updates=56000, lr=0.000267261, gnorm=0.155, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=51955 epoch 034: 347 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=550058, ups=1.11, wpb=495501, bsz=16304.2, num_updates=56000, lr=0.000267261, gnorm=0.155, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=51955 epoch 034: 347 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=550058, ups=1.11, wpb=495501, bsz=16304.2, num_updates=56000, lr=0.000267261, gnorm=0.155, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=51955 epoch 034: 347 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=550058, ups=1.11, wpb=495501, bsz=16304.2, num_updates=56000, lr=0.000267261, gnorm=0.155, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=51955 epoch 034: 347 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=550058, ups=1.11, wpb=495501, bsz=16304.2, num_updates=56000, lr=0.000267261, gnorm=0.155, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=51955 epoch 034: 347 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=550058, ups=1.11, wpb=495501, bsz=16304.2, num_updates=56000, lr=0.000267261, gnorm=0.155, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=51955 epoch 034: 347 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=550058, ups=1.11, wpb=495501, bsz=16304.2, num_updates=56000, lr=0.000267261, gnorm=0.155, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=51955 epoch 034: 347 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=550058, ups=1.11, wpb=495501, bsz=16304.2, num_updates=56000, lr=0.000267261, gnorm=0.155, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=51955 epoch 034: 347 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=550058, ups=1.11, wpb=495501, bsz=16304.2, num_updates=56000, lr=0.000267261, gnorm=0.155, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=51955 epoch 034: 347 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=550058, ups=1.11, wpb=495501, bsz=16304.2, num_updates=56000, lr=0.000267261, gnorm=0.155, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=51955 epoch 034: 347 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=550058, ups=1.11, wpb=495501, bsz=16304.2, num_updates=56000, lr=0.000267261, gnorm=0.155, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=51955 epoch 034: 347 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=550058, ups=1.11, wpb=495501, bsz=16304.2, num_updates=56000, lr=0.000267261, gnorm=0.155, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=51955 epoch 034: 347 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=550058, ups=1.11, wpb=495501, bsz=16304.2, num_updates=56000, lr=0.000267261, gnorm=0.155, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=51955 epoch 034: 347 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=550058, ups=1.11, wpb=495501, bsz=16304.2, num_updates=56000, lr=0.000267261, gnorm=0.155, clip=0, loss_scale=4, train_wall=88, gb_free=21.5, wall=51955 begin validation on "valid" subset epoch 034 | valid on 'valid' subset | loss 3.703 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.701 epoch 034 | valid on 'valid' subset | loss 3.703 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.701 epoch 034 | valid on 'valid' subset | loss 3.703 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.701 epoch 034 | valid on 'valid' subset | loss 3.703 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.701 epoch 034 | valid on 'valid' subset | loss 3.703 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.701 epoch 034 | valid on 'valid' subset | loss 3.703 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.701 epoch 034 | valid on 'valid' subset | loss 3.703 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.701 epoch 034 | valid on 'valid' subset | loss 3.703 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.701 epoch 034 | valid on 'valid' subset | loss 3.703 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.701 epoch 034 | valid on 'valid' subset | loss 3.703 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.701 epoch 034 | valid on 'valid' subset | loss 3.703 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.701 epoch 034 | valid on 'valid' subset | loss 3.703 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.701 epoch 034 | valid on 'valid' subset | loss 3.703 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.701 epoch 034 | valid on 'valid' subset | loss 3.703 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.701 epoch 034 | valid on 'valid' subset | loss 3.703 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.701 epoch 034 | valid on 'valid' subset | loss 3.703 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.701 epoch 034 | valid on 'valid' subset | loss 3.703 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.701 epoch 034 | valid on 'valid' subset | loss 3.703 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.701 epoch 034 | valid on 'valid' subset | loss 3.703 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.701 epoch 034 | valid on 'valid' subset | loss 3.703 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.701 epoch 034 | valid on 'valid' subset | loss 3.703 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.701 epoch 034 | valid on 'valid' subset | loss 3.703 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.701 epoch 034 | valid on 'valid' subset | loss 3.703 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.701 epoch 034 | valid on 'valid' subset | loss 3.703 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.701 epoch 034 | valid on 'valid' subset | loss 3.703 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.701 epoch 034 | valid on 'valid' subset | loss 3.703 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.701 epoch 034 | valid on 'valid' subset | loss 3.703 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.701 epoch 034 | valid on 'valid' subset | loss 3.703 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.701 epoch 034 | valid on 'valid' subset | loss 3.703 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.701 epoch 034 | valid on 'valid' subset | loss 3.703 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.701 epoch 034 | valid on 'valid' subset | loss 3.703 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.701 epoch 034 | valid on 'valid' subset | loss 3.703 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.701 epoch 034 | valid on 'valid' subset | loss 3.703 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.701 epoch 034 | valid on 'valid' subset | loss 3.703 | nll_loss 2.147 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 56000 | best_loss 3.701 epoch 034: 447 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=469353, ups=0.95, wpb=494592, bsz=16586.1, num_updates=56100, lr=0.000267023, gnorm=0.161, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=52060 epoch 034: 447 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=469353, ups=0.95, wpb=494592, bsz=16586.1, num_updates=56100, lr=0.000267023, gnorm=0.161, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=52060 epoch 034: 447 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=469353, ups=0.95, wpb=494592, bsz=16586.1, num_updates=56100, lr=0.000267023, gnorm=0.161, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=52060 epoch 034: 447 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=469353, ups=0.95, wpb=494592, bsz=16586.1, num_updates=56100, lr=0.000267023, gnorm=0.161, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=52060 epoch 034: 447 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=469353, ups=0.95, wpb=494592, bsz=16586.1, num_updates=56100, lr=0.000267023, gnorm=0.161, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=52060 epoch 034: 447 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=469353, ups=0.95, wpb=494592, bsz=16586.1, num_updates=56100, lr=0.000267023, gnorm=0.161, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=52060 epoch 034: 447 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=469353, ups=0.95, wpb=494592, bsz=16586.1, num_updates=56100, lr=0.000267023, gnorm=0.161, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=52060 epoch 034: 447 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=469353, ups=0.95, wpb=494592, bsz=16586.1, num_updates=56100, lr=0.000267023, gnorm=0.161, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=52060 epoch 034: 447 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=469353, ups=0.95, wpb=494592, bsz=16586.1, num_updates=56100, lr=0.000267023, gnorm=0.161, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=52060 epoch 034: 447 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=469353, ups=0.95, wpb=494592, bsz=16586.1, num_updates=56100, lr=0.000267023, gnorm=0.161, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=52060 epoch 034: 447 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=469353, ups=0.95, wpb=494592, bsz=16586.1, num_updates=56100, lr=0.000267023, gnorm=0.161, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=52060 epoch 034: 447 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=469353, ups=0.95, wpb=494592, bsz=16586.1, num_updates=56100, lr=0.000267023, gnorm=0.161, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=52060 epoch 034: 447 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=469353, ups=0.95, wpb=494592, bsz=16586.1, num_updates=56100, lr=0.000267023, gnorm=0.161, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=52060 epoch 034: 447 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=469353, ups=0.95, wpb=494592, bsz=16586.1, num_updates=56100, lr=0.000267023, gnorm=0.161, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=52060 epoch 034: 447 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=469353, ups=0.95, wpb=494592, bsz=16586.1, num_updates=56100, lr=0.000267023, gnorm=0.161, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=52060 epoch 034: 447 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=469353, ups=0.95, wpb=494592, bsz=16586.1, num_updates=56100, lr=0.000267023, gnorm=0.161, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=52060 epoch 034: 447 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=469353, ups=0.95, wpb=494592, bsz=16586.1, num_updates=56100, lr=0.000267023, gnorm=0.161, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=52060 epoch 034: 447 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=469353, ups=0.95, wpb=494592, bsz=16586.1, num_updates=56100, lr=0.000267023, gnorm=0.161, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=52060 epoch 034: 447 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=469353, ups=0.95, wpb=494592, bsz=16586.1, num_updates=56100, lr=0.000267023, gnorm=0.161, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=52060 epoch 034: 447 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=469353, ups=0.95, wpb=494592, bsz=16586.1, num_updates=56100, lr=0.000267023, gnorm=0.161, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=52060 epoch 034: 447 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=469353, ups=0.95, wpb=494592, bsz=16586.1, num_updates=56100, lr=0.000267023, gnorm=0.161, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=52060 epoch 034: 447 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=469353, ups=0.95, wpb=494592, bsz=16586.1, num_updates=56100, lr=0.000267023, gnorm=0.161, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=52060 epoch 034: 447 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=469353, ups=0.95, wpb=494592, bsz=16586.1, num_updates=56100, lr=0.000267023, gnorm=0.161, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=52060 epoch 034: 447 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=469353, ups=0.95, wpb=494592, bsz=16586.1, num_updates=56100, lr=0.000267023, gnorm=0.161, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=52060 epoch 034: 447 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=469353, ups=0.95, wpb=494592, bsz=16586.1, num_updates=56100, lr=0.000267023, gnorm=0.161, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=52060 epoch 034: 447 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=469353, ups=0.95, wpb=494592, bsz=16586.1, num_updates=56100, lr=0.000267023, gnorm=0.161, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=52060 epoch 034: 447 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=469353, ups=0.95, wpb=494592, bsz=16586.1, num_updates=56100, lr=0.000267023, gnorm=0.161, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=52060 epoch 034: 447 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=469353, ups=0.95, wpb=494592, bsz=16586.1, num_updates=56100, lr=0.000267023, gnorm=0.161, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=52060 epoch 034: 447 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=469353, ups=0.95, wpb=494592, bsz=16586.1, num_updates=56100, lr=0.000267023, gnorm=0.161, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=52060 epoch 034: 447 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=469353, ups=0.95, wpb=494592, bsz=16586.1, num_updates=56100, lr=0.000267023, gnorm=0.161, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=52060 epoch 034: 447 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=469353, ups=0.95, wpb=494592, bsz=16586.1, num_updates=56100, lr=0.000267023, gnorm=0.161, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=52060 epoch 034: 447 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=469353, ups=0.95, wpb=494592, bsz=16586.1, num_updates=56100, lr=0.000267023, gnorm=0.161, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=52060 epoch 034: 447 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=469353, ups=0.95, wpb=494592, bsz=16586.1, num_updates=56100, lr=0.000267023, gnorm=0.161, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=52060 epoch 034: 447 / 1689 loss=3.659, nll_loss=2.12, ppl=4.35, wps=469353, ups=0.95, wpb=494592, bsz=16586.1, num_updates=56100, lr=0.000267023, gnorm=0.161, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=52060 epoch 034: 547 / 1689 loss=3.656, nll_loss=2.117, ppl=4.34, wps=548541, ups=1.11, wpb=495882, bsz=16478.1, num_updates=56200, lr=0.000266785, gnorm=0.166, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=52151 epoch 034: 547 / 1689 loss=3.656, nll_loss=2.117, ppl=4.34, wps=548541, ups=1.11, wpb=495882, bsz=16478.1, num_updates=56200, lr=0.000266785, gnorm=0.166, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=52151 epoch 034: 547 / 1689 loss=3.656, nll_loss=2.117, ppl=4.34, wps=548541, ups=1.11, wpb=495882, bsz=16478.1, num_updates=56200, lr=0.000266785, gnorm=0.166, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=52151 epoch 034: 547 / 1689 loss=3.656, nll_loss=2.117, ppl=4.34, wps=548541, ups=1.11, wpb=495882, bsz=16478.1, num_updates=56200, lr=0.000266785, gnorm=0.166, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=52151 epoch 034: 547 / 1689 loss=3.656, nll_loss=2.117, ppl=4.34, wps=548541, ups=1.11, wpb=495882, bsz=16478.1, num_updates=56200, lr=0.000266785, gnorm=0.166, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=52151 epoch 034: 547 / 1689 loss=3.656, nll_loss=2.117, ppl=4.34, wps=548541, ups=1.11, wpb=495882, bsz=16478.1, num_updates=56200, lr=0.000266785, gnorm=0.166, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=52151 epoch 034: 547 / 1689 loss=3.656, nll_loss=2.117, ppl=4.34, wps=548541, ups=1.11, wpb=495882, bsz=16478.1, num_updates=56200, lr=0.000266785, gnorm=0.166, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=52151 epoch 034: 547 / 1689 loss=3.656, nll_loss=2.117, ppl=4.34, wps=548541, ups=1.11, wpb=495882, bsz=16478.1, num_updates=56200, lr=0.000266785, gnorm=0.166, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=52151 epoch 034: 547 / 1689 loss=3.656, nll_loss=2.117, ppl=4.34, wps=548541, ups=1.11, wpb=495882, bsz=16478.1, num_updates=56200, lr=0.000266785, gnorm=0.166, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=52151 epoch 034: 547 / 1689 loss=3.656, nll_loss=2.117, ppl=4.34, wps=548541, ups=1.11, wpb=495882, bsz=16478.1, num_updates=56200, lr=0.000266785, gnorm=0.166, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=52151 epoch 034: 547 / 1689 loss=3.656, nll_loss=2.117, ppl=4.34, wps=548541, ups=1.11, wpb=495882, bsz=16478.1, num_updates=56200, lr=0.000266785, gnorm=0.166, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=52151 epoch 034: 547 / 1689 loss=3.656, nll_loss=2.117, ppl=4.34, wps=548541, ups=1.11, wpb=495882, bsz=16478.1, num_updates=56200, lr=0.000266785, gnorm=0.166, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=52151 epoch 034: 547 / 1689 loss=3.656, nll_loss=2.117, ppl=4.34, wps=548541, ups=1.11, wpb=495882, bsz=16478.1, num_updates=56200, lr=0.000266785, gnorm=0.166, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=52151 epoch 034: 547 / 1689 loss=3.656, nll_loss=2.117, ppl=4.34, wps=548541, ups=1.11, wpb=495882, bsz=16478.1, num_updates=56200, lr=0.000266785, gnorm=0.166, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=52151 epoch 034: 547 / 1689 loss=3.656, nll_loss=2.117, ppl=4.34, wps=548541, ups=1.11, wpb=495882, bsz=16478.1, num_updates=56200, lr=0.000266785, gnorm=0.166, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=52151 epoch 034: 547 / 1689 loss=3.656, nll_loss=2.117, ppl=4.34, wps=548541, ups=1.11, wpb=495882, bsz=16478.1, num_updates=56200, lr=0.000266785, gnorm=0.166, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=52151 epoch 034: 547 / 1689 loss=3.656, nll_loss=2.117, ppl=4.34, wps=548541, ups=1.11, wpb=495882, bsz=16478.1, num_updates=56200, lr=0.000266785, gnorm=0.166, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=52151 epoch 034: 547 / 1689 loss=3.656, nll_loss=2.117, ppl=4.34, wps=548541, ups=1.11, wpb=495882, bsz=16478.1, num_updates=56200, lr=0.000266785, gnorm=0.166, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=52151 epoch 034: 547 / 1689 loss=3.656, nll_loss=2.117, ppl=4.34, wps=548541, ups=1.11, wpb=495882, bsz=16478.1, num_updates=56200, lr=0.000266785, gnorm=0.166, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=52151 epoch 034: 547 / 1689 loss=3.656, nll_loss=2.117, ppl=4.34, wps=548541, ups=1.11, wpb=495882, bsz=16478.1, num_updates=56200, lr=0.000266785, gnorm=0.166, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=52151 epoch 034: 547 / 1689 loss=3.656, nll_loss=2.117, ppl=4.34, wps=548541, ups=1.11, wpb=495882, bsz=16478.1, num_updates=56200, lr=0.000266785, gnorm=0.166, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=52151 epoch 034: 547 / 1689 loss=3.656, nll_loss=2.117, ppl=4.34, wps=548541, ups=1.11, wpb=495882, bsz=16478.1, num_updates=56200, lr=0.000266785, gnorm=0.166, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=52151 epoch 034: 547 / 1689 loss=3.656, nll_loss=2.117, ppl=4.34, wps=548541, ups=1.11, wpb=495882, bsz=16478.1, num_updates=56200, lr=0.000266785, gnorm=0.166, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=52151 epoch 034: 547 / 1689 loss=3.656, nll_loss=2.117, ppl=4.34, wps=548541, ups=1.11, wpb=495882, bsz=16478.1, num_updates=56200, lr=0.000266785, gnorm=0.166, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=52151 epoch 034: 547 / 1689 loss=3.656, nll_loss=2.117, ppl=4.34, wps=548541, ups=1.11, wpb=495882, bsz=16478.1, num_updates=56200, lr=0.000266785, gnorm=0.166, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=52151 epoch 034: 547 / 1689 loss=3.656, nll_loss=2.117, ppl=4.34, wps=548541, ups=1.11, wpb=495882, bsz=16478.1, num_updates=56200, lr=0.000266785, gnorm=0.166, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=52151 epoch 034: 547 / 1689 loss=3.656, nll_loss=2.117, ppl=4.34, wps=548541, ups=1.11, wpb=495882, bsz=16478.1, num_updates=56200, lr=0.000266785, gnorm=0.166, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=52151 epoch 034: 547 / 1689 loss=3.656, nll_loss=2.117, ppl=4.34, wps=548541, ups=1.11, wpb=495882, bsz=16478.1, num_updates=56200, lr=0.000266785, gnorm=0.166, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=52151 epoch 034: 547 / 1689 loss=3.656, nll_loss=2.117, ppl=4.34, wps=548541, ups=1.11, wpb=495882, bsz=16478.1, num_updates=56200, lr=0.000266785, gnorm=0.166, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=52151 epoch 034: 547 / 1689 loss=3.656, nll_loss=2.117, ppl=4.34, wps=548541, ups=1.11, wpb=495882, bsz=16478.1, num_updates=56200, lr=0.000266785, gnorm=0.166, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=52151 epoch 034: 547 / 1689 loss=3.656, nll_loss=2.117, ppl=4.34, wps=548541, ups=1.11, wpb=495882, bsz=16478.1, num_updates=56200, lr=0.000266785, gnorm=0.166, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=52151 epoch 034: 547 / 1689 loss=3.656, nll_loss=2.117, ppl=4.34, wps=548541, ups=1.11, wpb=495882, bsz=16478.1, num_updates=56200, lr=0.000266785, gnorm=0.166, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=52151 epoch 034: 547 / 1689 loss=3.656, nll_loss=2.117, ppl=4.34, wps=548541, ups=1.11, wpb=495882, bsz=16478.1, num_updates=56200, lr=0.000266785, gnorm=0.166, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=52151 epoch 034: 547 / 1689 loss=3.656, nll_loss=2.117, ppl=4.34, wps=548541, ups=1.11, wpb=495882, bsz=16478.1, num_updates=56200, lr=0.000266785, gnorm=0.166, clip=0, loss_scale=4, train_wall=89, gb_free=21.9, wall=52151 epoch 034: 647 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=551556, ups=1.11, wpb=494882, bsz=16657, num_updates=56300, lr=0.000266548, gnorm=0.18, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=52240 epoch 034: 647 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=551556, ups=1.11, wpb=494882, bsz=16657, num_updates=56300, lr=0.000266548, gnorm=0.18, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=52240 epoch 034: 647 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=551556, ups=1.11, wpb=494882, bsz=16657, num_updates=56300, lr=0.000266548, gnorm=0.18, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=52240 epoch 034: 647 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=551556, ups=1.11, wpb=494882, bsz=16657, num_updates=56300, lr=0.000266548, gnorm=0.18, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=52240 epoch 034: 647 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=551556, ups=1.11, wpb=494882, bsz=16657, num_updates=56300, lr=0.000266548, gnorm=0.18, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=52240 epoch 034: 647 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=551556, ups=1.11, wpb=494882, bsz=16657, num_updates=56300, lr=0.000266548, gnorm=0.18, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=52240 epoch 034: 647 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=551556, ups=1.11, wpb=494882, bsz=16657, num_updates=56300, lr=0.000266548, gnorm=0.18, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=52240 epoch 034: 647 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=551556, ups=1.11, wpb=494882, bsz=16657, num_updates=56300, lr=0.000266548, gnorm=0.18, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=52240 epoch 034: 647 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=551556, ups=1.11, wpb=494882, bsz=16657, num_updates=56300, lr=0.000266548, gnorm=0.18, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=52240 epoch 034: 647 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=551556, ups=1.11, wpb=494882, bsz=16657, num_updates=56300, lr=0.000266548, gnorm=0.18, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=52240 epoch 034: 647 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=551556, ups=1.11, wpb=494882, bsz=16657, num_updates=56300, lr=0.000266548, gnorm=0.18, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=52240 epoch 034: 647 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=551556, ups=1.11, wpb=494882, bsz=16657, num_updates=56300, lr=0.000266548, gnorm=0.18, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=52240 epoch 034: 647 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=551556, ups=1.11, wpb=494882, bsz=16657, num_updates=56300, lr=0.000266548, gnorm=0.18, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=52240 epoch 034: 647 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=551556, ups=1.11, wpb=494882, bsz=16657, num_updates=56300, lr=0.000266548, gnorm=0.18, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=52240 epoch 034: 647 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=551556, ups=1.11, wpb=494882, bsz=16657, num_updates=56300, lr=0.000266548, gnorm=0.18, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=52240 epoch 034: 647 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=551556, ups=1.11, wpb=494882, bsz=16657, num_updates=56300, lr=0.000266548, gnorm=0.18, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=52240 epoch 034: 647 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=551556, ups=1.11, wpb=494882, bsz=16657, num_updates=56300, lr=0.000266548, gnorm=0.18, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=52240 epoch 034: 647 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=551556, ups=1.11, wpb=494882, bsz=16657, num_updates=56300, lr=0.000266548, gnorm=0.18, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=52240 epoch 034: 647 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=551556, ups=1.11, wpb=494882, bsz=16657, num_updates=56300, lr=0.000266548, gnorm=0.18, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=52240 epoch 034: 647 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=551556, ups=1.11, wpb=494882, bsz=16657, num_updates=56300, lr=0.000266548, gnorm=0.18, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=52240 epoch 034: 647 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=551556, ups=1.11, wpb=494882, bsz=16657, num_updates=56300, lr=0.000266548, gnorm=0.18, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=52240 epoch 034: 647 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=551556, ups=1.11, wpb=494882, bsz=16657, num_updates=56300, lr=0.000266548, gnorm=0.18, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=52240 epoch 034: 647 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=551556, ups=1.11, wpb=494882, bsz=16657, num_updates=56300, lr=0.000266548, gnorm=0.18, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=52240 epoch 034: 647 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=551556, ups=1.11, wpb=494882, bsz=16657, num_updates=56300, lr=0.000266548, gnorm=0.18, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=52240 epoch 034: 647 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=551556, ups=1.11, wpb=494882, bsz=16657, num_updates=56300, lr=0.000266548, gnorm=0.18, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=52240 epoch 034: 647 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=551556, ups=1.11, wpb=494882, bsz=16657, num_updates=56300, lr=0.000266548, gnorm=0.18, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=52240 epoch 034: 647 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=551556, ups=1.11, wpb=494882, bsz=16657, num_updates=56300, lr=0.000266548, gnorm=0.18, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=52240 epoch 034: 647 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=551556, ups=1.11, wpb=494882, bsz=16657, num_updates=56300, lr=0.000266548, gnorm=0.18, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=52240 epoch 034: 647 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=551556, ups=1.11, wpb=494882, bsz=16657, num_updates=56300, lr=0.000266548, gnorm=0.18, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=52240 epoch 034: 647 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=551556, ups=1.11, wpb=494882, bsz=16657, num_updates=56300, lr=0.000266548, gnorm=0.18, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=52240 epoch 034: 647 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=551556, ups=1.11, wpb=494882, bsz=16657, num_updates=56300, lr=0.000266548, gnorm=0.18, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=52240 epoch 034: 647 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=551556, ups=1.11, wpb=494882, bsz=16657, num_updates=56300, lr=0.000266548, gnorm=0.18, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=52240 epoch 034: 647 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=551556, ups=1.11, wpb=494882, bsz=16657, num_updates=56300, lr=0.000266548, gnorm=0.18, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=52240 epoch 034: 647 / 1689 loss=3.667, nll_loss=2.13, ppl=4.38, wps=551556, ups=1.11, wpb=494882, bsz=16657, num_updates=56300, lr=0.000266548, gnorm=0.18, clip=0, loss_scale=4, train_wall=88, gb_free=22.4, wall=52240 epoch 034: 747 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=548944, ups=1.11, wpb=496261, bsz=16720.6, num_updates=56400, lr=0.000266312, gnorm=0.16, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=52331 epoch 034: 747 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=548944, ups=1.11, wpb=496261, bsz=16720.6, num_updates=56400, lr=0.000266312, gnorm=0.16, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=52331 epoch 034: 747 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=548944, ups=1.11, wpb=496261, bsz=16720.6, num_updates=56400, lr=0.000266312, gnorm=0.16, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=52331 epoch 034: 747 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=548944, ups=1.11, wpb=496261, bsz=16720.6, num_updates=56400, lr=0.000266312, gnorm=0.16, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=52331 epoch 034: 747 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=548944, ups=1.11, wpb=496261, bsz=16720.6, num_updates=56400, lr=0.000266312, gnorm=0.16, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=52331 epoch 034: 747 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=548944, ups=1.11, wpb=496261, bsz=16720.6, num_updates=56400, lr=0.000266312, gnorm=0.16, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=52331 epoch 034: 747 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=548944, ups=1.11, wpb=496261, bsz=16720.6, num_updates=56400, lr=0.000266312, gnorm=0.16, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=52331 epoch 034: 747 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=548944, ups=1.11, wpb=496261, bsz=16720.6, num_updates=56400, lr=0.000266312, gnorm=0.16, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=52331 epoch 034: 747 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=548944, ups=1.11, wpb=496261, bsz=16720.6, num_updates=56400, lr=0.000266312, gnorm=0.16, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=52331 epoch 034: 747 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=548944, ups=1.11, wpb=496261, bsz=16720.6, num_updates=56400, lr=0.000266312, gnorm=0.16, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=52331 epoch 034: 747 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=548944, ups=1.11, wpb=496261, bsz=16720.6, num_updates=56400, lr=0.000266312, gnorm=0.16, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=52331 epoch 034: 747 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=548944, ups=1.11, wpb=496261, bsz=16720.6, num_updates=56400, lr=0.000266312, gnorm=0.16, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=52331 epoch 034: 747 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=548944, ups=1.11, wpb=496261, bsz=16720.6, num_updates=56400, lr=0.000266312, gnorm=0.16, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=52331 epoch 034: 747 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=548944, ups=1.11, wpb=496261, bsz=16720.6, num_updates=56400, lr=0.000266312, gnorm=0.16, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=52331 epoch 034: 747 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=548944, ups=1.11, wpb=496261, bsz=16720.6, num_updates=56400, lr=0.000266312, gnorm=0.16, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=52331 epoch 034: 747 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=548944, ups=1.11, wpb=496261, bsz=16720.6, num_updates=56400, lr=0.000266312, gnorm=0.16, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=52331 epoch 034: 747 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=548944, ups=1.11, wpb=496261, bsz=16720.6, num_updates=56400, lr=0.000266312, gnorm=0.16, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=52331 epoch 034: 747 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=548944, ups=1.11, wpb=496261, bsz=16720.6, num_updates=56400, lr=0.000266312, gnorm=0.16, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=52331 epoch 034: 747 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=548944, ups=1.11, wpb=496261, bsz=16720.6, num_updates=56400, lr=0.000266312, gnorm=0.16, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=52331 epoch 034: 747 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=548944, ups=1.11, wpb=496261, bsz=16720.6, num_updates=56400, lr=0.000266312, gnorm=0.16, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=52331 epoch 034: 747 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=548944, ups=1.11, wpb=496261, bsz=16720.6, num_updates=56400, lr=0.000266312, gnorm=0.16, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=52331 epoch 034: 747 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=548944, ups=1.11, wpb=496261, bsz=16720.6, num_updates=56400, lr=0.000266312, gnorm=0.16, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=52331 epoch 034: 747 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=548944, ups=1.11, wpb=496261, bsz=16720.6, num_updates=56400, lr=0.000266312, gnorm=0.16, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=52331 epoch 034: 747 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=548944, ups=1.11, wpb=496261, bsz=16720.6, num_updates=56400, lr=0.000266312, gnorm=0.16, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=52331 epoch 034: 747 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=548944, ups=1.11, wpb=496261, bsz=16720.6, num_updates=56400, lr=0.000266312, gnorm=0.16, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=52331 epoch 034: 747 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=548944, ups=1.11, wpb=496261, bsz=16720.6, num_updates=56400, lr=0.000266312, gnorm=0.16, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=52331 epoch 034: 747 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=548944, ups=1.11, wpb=496261, bsz=16720.6, num_updates=56400, lr=0.000266312, gnorm=0.16, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=52331 epoch 034: 747 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=548944, ups=1.11, wpb=496261, bsz=16720.6, num_updates=56400, lr=0.000266312, gnorm=0.16, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=52331 epoch 034: 747 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=548944, ups=1.11, wpb=496261, bsz=16720.6, num_updates=56400, lr=0.000266312, gnorm=0.16, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=52331 epoch 034: 747 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=548944, ups=1.11, wpb=496261, bsz=16720.6, num_updates=56400, lr=0.000266312, gnorm=0.16, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=52331 epoch 034: 747 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=548944, ups=1.11, wpb=496261, bsz=16720.6, num_updates=56400, lr=0.000266312, gnorm=0.16, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=52331 epoch 034: 747 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=548944, ups=1.11, wpb=496261, bsz=16720.6, num_updates=56400, lr=0.000266312, gnorm=0.16, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=52331 epoch 034: 747 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=548944, ups=1.11, wpb=496261, bsz=16720.6, num_updates=56400, lr=0.000266312, gnorm=0.16, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=52331 epoch 034: 747 / 1689 loss=3.658, nll_loss=2.12, ppl=4.35, wps=548944, ups=1.11, wpb=496261, bsz=16720.6, num_updates=56400, lr=0.000266312, gnorm=0.16, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=52331 epoch 034: 847 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=547394, ups=1.11, wpb=494884, bsz=16400.3, num_updates=56500, lr=0.000266076, gnorm=0.16, clip=0, loss_scale=8, train_wall=89, gb_free=21.1, wall=52421 epoch 034: 847 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=547394, ups=1.11, wpb=494884, bsz=16400.3, num_updates=56500, lr=0.000266076, gnorm=0.16, clip=0, loss_scale=8, train_wall=89, gb_free=21.1, wall=52421 epoch 034: 847 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=547394, ups=1.11, wpb=494884, bsz=16400.3, num_updates=56500, lr=0.000266076, gnorm=0.16, clip=0, loss_scale=8, train_wall=89, gb_free=21.1, wall=52421 epoch 034: 847 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=547394, ups=1.11, wpb=494884, bsz=16400.3, num_updates=56500, lr=0.000266076, gnorm=0.16, clip=0, loss_scale=8, train_wall=89, gb_free=21.1, wall=52421 epoch 034: 847 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=547394, ups=1.11, wpb=494884, bsz=16400.3, num_updates=56500, lr=0.000266076, gnorm=0.16, clip=0, loss_scale=8, train_wall=89, gb_free=21.1, wall=52421 epoch 034: 847 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=547394, ups=1.11, wpb=494884, bsz=16400.3, num_updates=56500, lr=0.000266076, gnorm=0.16, clip=0, loss_scale=8, train_wall=89, gb_free=21.1, wall=52421 epoch 034: 847 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=547394, ups=1.11, wpb=494884, bsz=16400.3, num_updates=56500, lr=0.000266076, gnorm=0.16, clip=0, loss_scale=8, train_wall=89, gb_free=21.1, wall=52421 epoch 034: 847 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=547394, ups=1.11, wpb=494884, bsz=16400.3, num_updates=56500, lr=0.000266076, gnorm=0.16, clip=0, loss_scale=8, train_wall=89, gb_free=21.1, wall=52421 epoch 034: 847 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=547394, ups=1.11, wpb=494884, bsz=16400.3, num_updates=56500, lr=0.000266076, gnorm=0.16, clip=0, loss_scale=8, train_wall=89, gb_free=21.1, wall=52421 epoch 034: 847 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=547394, ups=1.11, wpb=494884, bsz=16400.3, num_updates=56500, lr=0.000266076, gnorm=0.16, clip=0, loss_scale=8, train_wall=89, gb_free=21.1, wall=52421 epoch 034: 847 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=547394, ups=1.11, wpb=494884, bsz=16400.3, num_updates=56500, lr=0.000266076, gnorm=0.16, clip=0, loss_scale=8, train_wall=89, gb_free=21.1, wall=52421 epoch 034: 847 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=547394, ups=1.11, wpb=494884, bsz=16400.3, num_updates=56500, lr=0.000266076, gnorm=0.16, clip=0, loss_scale=8, train_wall=89, gb_free=21.1, wall=52421 epoch 034: 847 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=547394, ups=1.11, wpb=494884, bsz=16400.3, num_updates=56500, lr=0.000266076, gnorm=0.16, clip=0, loss_scale=8, train_wall=89, gb_free=21.1, wall=52421 epoch 034: 847 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=547394, ups=1.11, wpb=494884, bsz=16400.3, num_updates=56500, lr=0.000266076, gnorm=0.16, clip=0, loss_scale=8, train_wall=89, gb_free=21.1, wall=52421 epoch 034: 847 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=547394, ups=1.11, wpb=494884, bsz=16400.3, num_updates=56500, lr=0.000266076, gnorm=0.16, clip=0, loss_scale=8, train_wall=89, gb_free=21.1, wall=52421 epoch 034: 847 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=547394, ups=1.11, wpb=494884, bsz=16400.3, num_updates=56500, lr=0.000266076, gnorm=0.16, clip=0, loss_scale=8, train_wall=89, gb_free=21.1, wall=52421 epoch 034: 847 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=547394, ups=1.11, wpb=494884, bsz=16400.3, num_updates=56500, lr=0.000266076, gnorm=0.16, clip=0, loss_scale=8, train_wall=89, gb_free=21.1, wall=52421 epoch 034: 847 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=547394, ups=1.11, wpb=494884, bsz=16400.3, num_updates=56500, lr=0.000266076, gnorm=0.16, clip=0, loss_scale=8, train_wall=89, gb_free=21.1, wall=52421 epoch 034: 847 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=547394, ups=1.11, wpb=494884, bsz=16400.3, num_updates=56500, lr=0.000266076, gnorm=0.16, clip=0, loss_scale=8, train_wall=89, gb_free=21.1, wall=52421 epoch 034: 847 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=547394, ups=1.11, wpb=494884, bsz=16400.3, num_updates=56500, lr=0.000266076, gnorm=0.16, clip=0, loss_scale=8, train_wall=89, gb_free=21.1, wall=52421 epoch 034: 847 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=547394, ups=1.11, wpb=494884, bsz=16400.3, num_updates=56500, lr=0.000266076, gnorm=0.16, clip=0, loss_scale=8, train_wall=89, gb_free=21.1, wall=52421 epoch 034: 847 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=547394, ups=1.11, wpb=494884, bsz=16400.3, num_updates=56500, lr=0.000266076, gnorm=0.16, clip=0, loss_scale=8, train_wall=89, gb_free=21.1, wall=52421 epoch 034: 847 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=547394, ups=1.11, wpb=494884, bsz=16400.3, num_updates=56500, lr=0.000266076, gnorm=0.16, clip=0, loss_scale=8, train_wall=89, gb_free=21.1, wall=52421 epoch 034: 847 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=547394, ups=1.11, wpb=494884, bsz=16400.3, num_updates=56500, lr=0.000266076, gnorm=0.16, clip=0, loss_scale=8, train_wall=89, gb_free=21.1, wall=52421 epoch 034: 847 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=547394, ups=1.11, wpb=494884, bsz=16400.3, num_updates=56500, lr=0.000266076, gnorm=0.16, clip=0, loss_scale=8, train_wall=89, gb_free=21.1, wall=52421 epoch 034: 847 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=547394, ups=1.11, wpb=494884, bsz=16400.3, num_updates=56500, lr=0.000266076, gnorm=0.16, clip=0, loss_scale=8, train_wall=89, gb_free=21.1, wall=52421 epoch 034: 847 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=547394, ups=1.11, wpb=494884, bsz=16400.3, num_updates=56500, lr=0.000266076, gnorm=0.16, clip=0, loss_scale=8, train_wall=89, gb_free=21.1, wall=52421 epoch 034: 847 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=547394, ups=1.11, wpb=494884, bsz=16400.3, num_updates=56500, lr=0.000266076, gnorm=0.16, clip=0, loss_scale=8, train_wall=89, gb_free=21.1, wall=52421 epoch 034: 847 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=547394, ups=1.11, wpb=494884, bsz=16400.3, num_updates=56500, lr=0.000266076, gnorm=0.16, clip=0, loss_scale=8, train_wall=89, gb_free=21.1, wall=52421 epoch 034: 847 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=547394, ups=1.11, wpb=494884, bsz=16400.3, num_updates=56500, lr=0.000266076, gnorm=0.16, clip=0, loss_scale=8, train_wall=89, gb_free=21.1, wall=52421 epoch 034: 847 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=547394, ups=1.11, wpb=494884, bsz=16400.3, num_updates=56500, lr=0.000266076, gnorm=0.16, clip=0, loss_scale=8, train_wall=89, gb_free=21.1, wall=52421 epoch 034: 847 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=547394, ups=1.11, wpb=494884, bsz=16400.3, num_updates=56500, lr=0.000266076, gnorm=0.16, clip=0, loss_scale=8, train_wall=89, gb_free=21.1, wall=52421 epoch 034: 847 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=547394, ups=1.11, wpb=494884, bsz=16400.3, num_updates=56500, lr=0.000266076, gnorm=0.16, clip=0, loss_scale=8, train_wall=89, gb_free=21.1, wall=52421 epoch 034: 847 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=547394, ups=1.11, wpb=494884, bsz=16400.3, num_updates=56500, lr=0.000266076, gnorm=0.16, clip=0, loss_scale=8, train_wall=89, gb_free=21.1, wall=52421 epoch 034: 948 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=538531, ups=1.09, wpb=493782, bsz=16655.9, num_updates=56600, lr=0.000265841, gnorm=0.16, clip=0, loss_scale=4, train_wall=90, gb_free=21.2, wall=52513 epoch 034: 948 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=538531, ups=1.09, wpb=493782, bsz=16655.9, num_updates=56600, lr=0.000265841, gnorm=0.16, clip=0, loss_scale=4, train_wall=90, gb_free=21.2, wall=52513 epoch 034: 948 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=538531, ups=1.09, wpb=493782, bsz=16655.9, num_updates=56600, lr=0.000265841, gnorm=0.16, clip=0, loss_scale=4, train_wall=90, gb_free=21.2, wall=52513 epoch 034: 948 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=538531, ups=1.09, wpb=493782, bsz=16655.9, num_updates=56600, lr=0.000265841, gnorm=0.16, clip=0, loss_scale=4, train_wall=90, gb_free=21.2, wall=52513 epoch 034: 948 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=538531, ups=1.09, wpb=493782, bsz=16655.9, num_updates=56600, lr=0.000265841, gnorm=0.16, clip=0, loss_scale=4, train_wall=90, gb_free=21.2, wall=52513 epoch 034: 948 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=538531, ups=1.09, wpb=493782, bsz=16655.9, num_updates=56600, lr=0.000265841, gnorm=0.16, clip=0, loss_scale=4, train_wall=90, gb_free=21.2, wall=52513 epoch 034: 948 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=538531, ups=1.09, wpb=493782, bsz=16655.9, num_updates=56600, lr=0.000265841, gnorm=0.16, clip=0, loss_scale=4, train_wall=90, gb_free=21.2, wall=52513 epoch 034: 948 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=538531, ups=1.09, wpb=493782, bsz=16655.9, num_updates=56600, lr=0.000265841, gnorm=0.16, clip=0, loss_scale=4, train_wall=90, gb_free=21.2, wall=52513 epoch 034: 948 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=538531, ups=1.09, wpb=493782, bsz=16655.9, num_updates=56600, lr=0.000265841, gnorm=0.16, clip=0, loss_scale=4, train_wall=90, gb_free=21.2, wall=52513 epoch 034: 948 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=538531, ups=1.09, wpb=493782, bsz=16655.9, num_updates=56600, lr=0.000265841, gnorm=0.16, clip=0, loss_scale=4, train_wall=90, gb_free=21.2, wall=52513 epoch 034: 948 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=538531, ups=1.09, wpb=493782, bsz=16655.9, num_updates=56600, lr=0.000265841, gnorm=0.16, clip=0, loss_scale=4, train_wall=90, gb_free=21.2, wall=52513 epoch 034: 948 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=538531, ups=1.09, wpb=493782, bsz=16655.9, num_updates=56600, lr=0.000265841, gnorm=0.16, clip=0, loss_scale=4, train_wall=90, gb_free=21.2, wall=52513 epoch 034: 948 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=538531, ups=1.09, wpb=493782, bsz=16655.9, num_updates=56600, lr=0.000265841, gnorm=0.16, clip=0, loss_scale=4, train_wall=90, gb_free=21.2, wall=52513 epoch 034: 948 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=538531, ups=1.09, wpb=493782, bsz=16655.9, num_updates=56600, lr=0.000265841, gnorm=0.16, clip=0, loss_scale=4, train_wall=90, gb_free=21.2, wall=52513 epoch 034: 948 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=538531, ups=1.09, wpb=493782, bsz=16655.9, num_updates=56600, lr=0.000265841, gnorm=0.16, clip=0, loss_scale=4, train_wall=90, gb_free=21.2, wall=52513 epoch 034: 948 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=538531, ups=1.09, wpb=493782, bsz=16655.9, num_updates=56600, lr=0.000265841, gnorm=0.16, clip=0, loss_scale=4, train_wall=90, gb_free=21.2, wall=52513 epoch 034: 948 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=538531, ups=1.09, wpb=493782, bsz=16655.9, num_updates=56600, lr=0.000265841, gnorm=0.16, clip=0, loss_scale=4, train_wall=90, gb_free=21.2, wall=52513 epoch 034: 948 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=538531, ups=1.09, wpb=493782, bsz=16655.9, num_updates=56600, lr=0.000265841, gnorm=0.16, clip=0, loss_scale=4, train_wall=90, gb_free=21.2, wall=52513 epoch 034: 948 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=538531, ups=1.09, wpb=493782, bsz=16655.9, num_updates=56600, lr=0.000265841, gnorm=0.16, clip=0, loss_scale=4, train_wall=90, gb_free=21.2, wall=52513 epoch 034: 948 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=538531, ups=1.09, wpb=493782, bsz=16655.9, num_updates=56600, lr=0.000265841, gnorm=0.16, clip=0, loss_scale=4, train_wall=90, gb_free=21.2, wall=52513 epoch 034: 948 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=538531, ups=1.09, wpb=493782, bsz=16655.9, num_updates=56600, lr=0.000265841, gnorm=0.16, clip=0, loss_scale=4, train_wall=90, gb_free=21.2, wall=52513 epoch 034: 948 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=538531, ups=1.09, wpb=493782, bsz=16655.9, num_updates=56600, lr=0.000265841, gnorm=0.16, clip=0, loss_scale=4, train_wall=90, gb_free=21.2, wall=52513 epoch 034: 948 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=538531, ups=1.09, wpb=493782, bsz=16655.9, num_updates=56600, lr=0.000265841, gnorm=0.16, clip=0, loss_scale=4, train_wall=90, gb_free=21.2, wall=52513 epoch 034: 948 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=538531, ups=1.09, wpb=493782, bsz=16655.9, num_updates=56600, lr=0.000265841, gnorm=0.16, clip=0, loss_scale=4, train_wall=90, gb_free=21.2, wall=52513 epoch 034: 948 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=538531, ups=1.09, wpb=493782, bsz=16655.9, num_updates=56600, lr=0.000265841, gnorm=0.16, clip=0, loss_scale=4, train_wall=90, gb_free=21.2, wall=52513 epoch 034: 948 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=538531, ups=1.09, wpb=493782, bsz=16655.9, num_updates=56600, lr=0.000265841, gnorm=0.16, clip=0, loss_scale=4, train_wall=90, gb_free=21.2, wall=52513 epoch 034: 948 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=538531, ups=1.09, wpb=493782, bsz=16655.9, num_updates=56600, lr=0.000265841, gnorm=0.16, clip=0, loss_scale=4, train_wall=90, gb_free=21.2, wall=52513 epoch 034: 948 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=538531, ups=1.09, wpb=493782, bsz=16655.9, num_updates=56600, lr=0.000265841, gnorm=0.16, clip=0, loss_scale=4, train_wall=90, gb_free=21.2, wall=52513 epoch 034: 948 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=538531, ups=1.09, wpb=493782, bsz=16655.9, num_updates=56600, lr=0.000265841, gnorm=0.16, clip=0, loss_scale=4, train_wall=90, gb_free=21.2, wall=52513 epoch 034: 948 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=538531, ups=1.09, wpb=493782, bsz=16655.9, num_updates=56600, lr=0.000265841, gnorm=0.16, clip=0, loss_scale=4, train_wall=90, gb_free=21.2, wall=52513 epoch 034: 948 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=538531, ups=1.09, wpb=493782, bsz=16655.9, num_updates=56600, lr=0.000265841, gnorm=0.16, clip=0, loss_scale=4, train_wall=90, gb_free=21.2, wall=52513 epoch 034: 948 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=538531, ups=1.09, wpb=493782, bsz=16655.9, num_updates=56600, lr=0.000265841, gnorm=0.16, clip=0, loss_scale=4, train_wall=90, gb_free=21.2, wall=52513 epoch 034: 948 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=538531, ups=1.09, wpb=493782, bsz=16655.9, num_updates=56600, lr=0.000265841, gnorm=0.16, clip=0, loss_scale=4, train_wall=90, gb_free=21.2, wall=52513 epoch 034: 948 / 1689 loss=3.661, nll_loss=2.122, ppl=4.35, wps=538531, ups=1.09, wpb=493782, bsz=16655.9, num_updates=56600, lr=0.000265841, gnorm=0.16, clip=0, loss_scale=4, train_wall=90, gb_free=21.2, wall=52513 epoch 034: 1049 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=540825, ups=1.09, wpb=495080, bsz=16123.3, num_updates=56700, lr=0.000265606, gnorm=0.169, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=52604 epoch 034: 1049 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=540825, ups=1.09, wpb=495080, bsz=16123.3, num_updates=56700, lr=0.000265606, gnorm=0.169, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=52604 epoch 034: 1049 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=540825, ups=1.09, wpb=495080, bsz=16123.3, num_updates=56700, lr=0.000265606, gnorm=0.169, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=52604 epoch 034: 1049 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=540825, ups=1.09, wpb=495080, bsz=16123.3, num_updates=56700, lr=0.000265606, gnorm=0.169, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=52604 epoch 034: 1049 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=540825, ups=1.09, wpb=495080, bsz=16123.3, num_updates=56700, lr=0.000265606, gnorm=0.169, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=52604 epoch 034: 1049 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=540825, ups=1.09, wpb=495080, bsz=16123.3, num_updates=56700, lr=0.000265606, gnorm=0.169, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=52604 epoch 034: 1049 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=540825, ups=1.09, wpb=495080, bsz=16123.3, num_updates=56700, lr=0.000265606, gnorm=0.169, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=52604 epoch 034: 1049 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=540825, ups=1.09, wpb=495080, bsz=16123.3, num_updates=56700, lr=0.000265606, gnorm=0.169, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=52604 epoch 034: 1049 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=540825, ups=1.09, wpb=495080, bsz=16123.3, num_updates=56700, lr=0.000265606, gnorm=0.169, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=52604 epoch 034: 1049 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=540825, ups=1.09, wpb=495080, bsz=16123.3, num_updates=56700, lr=0.000265606, gnorm=0.169, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=52604 epoch 034: 1049 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=540825, ups=1.09, wpb=495080, bsz=16123.3, num_updates=56700, lr=0.000265606, gnorm=0.169, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=52604 epoch 034: 1049 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=540825, ups=1.09, wpb=495080, bsz=16123.3, num_updates=56700, lr=0.000265606, gnorm=0.169, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=52604 epoch 034: 1049 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=540825, ups=1.09, wpb=495080, bsz=16123.3, num_updates=56700, lr=0.000265606, gnorm=0.169, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=52604 epoch 034: 1049 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=540825, ups=1.09, wpb=495080, bsz=16123.3, num_updates=56700, lr=0.000265606, gnorm=0.169, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=52604 epoch 034: 1049 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=540825, ups=1.09, wpb=495080, bsz=16123.3, num_updates=56700, lr=0.000265606, gnorm=0.169, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=52604 epoch 034: 1049 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=540825, ups=1.09, wpb=495080, bsz=16123.3, num_updates=56700, lr=0.000265606, gnorm=0.169, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=52604 epoch 034: 1049 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=540825, ups=1.09, wpb=495080, bsz=16123.3, num_updates=56700, lr=0.000265606, gnorm=0.169, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=52604 epoch 034: 1049 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=540825, ups=1.09, wpb=495080, bsz=16123.3, num_updates=56700, lr=0.000265606, gnorm=0.169, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=52604 epoch 034: 1049 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=540825, ups=1.09, wpb=495080, bsz=16123.3, num_updates=56700, lr=0.000265606, gnorm=0.169, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=52604 epoch 034: 1049 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=540825, ups=1.09, wpb=495080, bsz=16123.3, num_updates=56700, lr=0.000265606, gnorm=0.169, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=52604 epoch 034: 1049 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=540825, ups=1.09, wpb=495080, bsz=16123.3, num_updates=56700, lr=0.000265606, gnorm=0.169, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=52604 epoch 034: 1049 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=540825, ups=1.09, wpb=495080, bsz=16123.3, num_updates=56700, lr=0.000265606, gnorm=0.169, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=52604 epoch 034: 1049 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=540825, ups=1.09, wpb=495080, bsz=16123.3, num_updates=56700, lr=0.000265606, gnorm=0.169, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=52604 epoch 034: 1049 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=540825, ups=1.09, wpb=495080, bsz=16123.3, num_updates=56700, lr=0.000265606, gnorm=0.169, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=52604 epoch 034: 1049 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=540825, ups=1.09, wpb=495080, bsz=16123.3, num_updates=56700, lr=0.000265606, gnorm=0.169, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=52604 epoch 034: 1049 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=540825, ups=1.09, wpb=495080, bsz=16123.3, num_updates=56700, lr=0.000265606, gnorm=0.169, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=52604 epoch 034: 1049 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=540825, ups=1.09, wpb=495080, bsz=16123.3, num_updates=56700, lr=0.000265606, gnorm=0.169, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=52604 epoch 034: 1049 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=540825, ups=1.09, wpb=495080, bsz=16123.3, num_updates=56700, lr=0.000265606, gnorm=0.169, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=52604 epoch 034: 1049 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=540825, ups=1.09, wpb=495080, bsz=16123.3, num_updates=56700, lr=0.000265606, gnorm=0.169, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=52604 epoch 034: 1049 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=540825, ups=1.09, wpb=495080, bsz=16123.3, num_updates=56700, lr=0.000265606, gnorm=0.169, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=52604 epoch 034: 1049 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=540825, ups=1.09, wpb=495080, bsz=16123.3, num_updates=56700, lr=0.000265606, gnorm=0.169, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=52604 epoch 034: 1049 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=540825, ups=1.09, wpb=495080, bsz=16123.3, num_updates=56700, lr=0.000265606, gnorm=0.169, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=52604 epoch 034: 1049 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=540825, ups=1.09, wpb=495080, bsz=16123.3, num_updates=56700, lr=0.000265606, gnorm=0.169, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=52604 epoch 034: 1049 / 1689 loss=3.669, nll_loss=2.132, ppl=4.38, wps=540825, ups=1.09, wpb=495080, bsz=16123.3, num_updates=56700, lr=0.000265606, gnorm=0.169, clip=0, loss_scale=2, train_wall=90, gb_free=22.1, wall=52604 epoch 034: 1149 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=551176, ups=1.11, wpb=495329, bsz=16469.5, num_updates=56800, lr=0.000265372, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=52694 epoch 034: 1149 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=551176, ups=1.11, wpb=495329, bsz=16469.5, num_updates=56800, lr=0.000265372, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=52694 epoch 034: 1149 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=551176, ups=1.11, wpb=495329, bsz=16469.5, num_updates=56800, lr=0.000265372, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=52694 epoch 034: 1149 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=551176, ups=1.11, wpb=495329, bsz=16469.5, num_updates=56800, lr=0.000265372, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=52694 epoch 034: 1149 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=551176, ups=1.11, wpb=495329, bsz=16469.5, num_updates=56800, lr=0.000265372, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=52694 epoch 034: 1149 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=551176, ups=1.11, wpb=495329, bsz=16469.5, num_updates=56800, lr=0.000265372, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=52694 epoch 034: 1149 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=551176, ups=1.11, wpb=495329, bsz=16469.5, num_updates=56800, lr=0.000265372, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=52694 epoch 034: 1149 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=551176, ups=1.11, wpb=495329, bsz=16469.5, num_updates=56800, lr=0.000265372, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=52694 epoch 034: 1149 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=551176, ups=1.11, wpb=495329, bsz=16469.5, num_updates=56800, lr=0.000265372, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=52694 epoch 034: 1149 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=551176, ups=1.11, wpb=495329, bsz=16469.5, num_updates=56800, lr=0.000265372, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=52694 epoch 034: 1149 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=551176, ups=1.11, wpb=495329, bsz=16469.5, num_updates=56800, lr=0.000265372, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=52694 epoch 034: 1149 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=551176, ups=1.11, wpb=495329, bsz=16469.5, num_updates=56800, lr=0.000265372, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=52694 epoch 034: 1149 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=551176, ups=1.11, wpb=495329, bsz=16469.5, num_updates=56800, lr=0.000265372, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=52694 epoch 034: 1149 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=551176, ups=1.11, wpb=495329, bsz=16469.5, num_updates=56800, lr=0.000265372, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=52694 epoch 034: 1149 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=551176, ups=1.11, wpb=495329, bsz=16469.5, num_updates=56800, lr=0.000265372, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=52694 epoch 034: 1149 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=551176, ups=1.11, wpb=495329, bsz=16469.5, num_updates=56800, lr=0.000265372, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=52694 epoch 034: 1149 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=551176, ups=1.11, wpb=495329, bsz=16469.5, num_updates=56800, lr=0.000265372, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=52694 epoch 034: 1149 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=551176, ups=1.11, wpb=495329, bsz=16469.5, num_updates=56800, lr=0.000265372, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=52694 epoch 034: 1149 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=551176, ups=1.11, wpb=495329, bsz=16469.5, num_updates=56800, lr=0.000265372, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=52694 epoch 034: 1149 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=551176, ups=1.11, wpb=495329, bsz=16469.5, num_updates=56800, lr=0.000265372, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=52694 epoch 034: 1149 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=551176, ups=1.11, wpb=495329, bsz=16469.5, num_updates=56800, lr=0.000265372, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=52694 epoch 034: 1149 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=551176, ups=1.11, wpb=495329, bsz=16469.5, num_updates=56800, lr=0.000265372, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=52694 epoch 034: 1149 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=551176, ups=1.11, wpb=495329, bsz=16469.5, num_updates=56800, lr=0.000265372, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=52694 epoch 034: 1149 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=551176, ups=1.11, wpb=495329, bsz=16469.5, num_updates=56800, lr=0.000265372, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=52694 epoch 034: 1149 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=551176, ups=1.11, wpb=495329, bsz=16469.5, num_updates=56800, lr=0.000265372, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=52694 epoch 034: 1149 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=551176, ups=1.11, wpb=495329, bsz=16469.5, num_updates=56800, lr=0.000265372, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=52694 epoch 034: 1149 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=551176, ups=1.11, wpb=495329, bsz=16469.5, num_updates=56800, lr=0.000265372, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=52694 epoch 034: 1149 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=551176, ups=1.11, wpb=495329, bsz=16469.5, num_updates=56800, lr=0.000265372, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=52694 epoch 034: 1149 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=551176, ups=1.11, wpb=495329, bsz=16469.5, num_updates=56800, lr=0.000265372, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=52694 epoch 034: 1149 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=551176, ups=1.11, wpb=495329, bsz=16469.5, num_updates=56800, lr=0.000265372, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=52694 epoch 034: 1149 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=551176, ups=1.11, wpb=495329, bsz=16469.5, num_updates=56800, lr=0.000265372, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=52694 epoch 034: 1149 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=551176, ups=1.11, wpb=495329, bsz=16469.5, num_updates=56800, lr=0.000265372, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=52694 epoch 034: 1149 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=551176, ups=1.11, wpb=495329, bsz=16469.5, num_updates=56800, lr=0.000265372, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=52694 epoch 034: 1149 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=551176, ups=1.11, wpb=495329, bsz=16469.5, num_updates=56800, lr=0.000265372, gnorm=0.167, clip=0, loss_scale=2, train_wall=88, gb_free=21.8, wall=52694 epoch 034: 1249 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=549234, ups=1.11, wpb=495328, bsz=16623.9, num_updates=56900, lr=0.000265139, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=52784 epoch 034: 1249 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=549234, ups=1.11, wpb=495328, bsz=16623.9, num_updates=56900, lr=0.000265139, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=52784 epoch 034: 1249 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=549234, ups=1.11, wpb=495328, bsz=16623.9, num_updates=56900, lr=0.000265139, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=52784 epoch 034: 1249 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=549234, ups=1.11, wpb=495328, bsz=16623.9, num_updates=56900, lr=0.000265139, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=52784 epoch 034: 1249 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=549234, ups=1.11, wpb=495328, bsz=16623.9, num_updates=56900, lr=0.000265139, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=52784 epoch 034: 1249 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=549234, ups=1.11, wpb=495328, bsz=16623.9, num_updates=56900, lr=0.000265139, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=52784 epoch 034: 1249 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=549234, ups=1.11, wpb=495328, bsz=16623.9, num_updates=56900, lr=0.000265139, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=52784 epoch 034: 1249 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=549234, ups=1.11, wpb=495328, bsz=16623.9, num_updates=56900, lr=0.000265139, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=52784 epoch 034: 1249 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=549234, ups=1.11, wpb=495328, bsz=16623.9, num_updates=56900, lr=0.000265139, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=52784 epoch 034: 1249 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=549234, ups=1.11, wpb=495328, bsz=16623.9, num_updates=56900, lr=0.000265139, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=52784 epoch 034: 1249 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=549234, ups=1.11, wpb=495328, bsz=16623.9, num_updates=56900, lr=0.000265139, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=52784 epoch 034: 1249 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=549234, ups=1.11, wpb=495328, bsz=16623.9, num_updates=56900, lr=0.000265139, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=52784 epoch 034: 1249 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=549234, ups=1.11, wpb=495328, bsz=16623.9, num_updates=56900, lr=0.000265139, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=52784 epoch 034: 1249 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=549234, ups=1.11, wpb=495328, bsz=16623.9, num_updates=56900, lr=0.000265139, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=52784 epoch 034: 1249 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=549234, ups=1.11, wpb=495328, bsz=16623.9, num_updates=56900, lr=0.000265139, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=52784 epoch 034: 1249 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=549234, ups=1.11, wpb=495328, bsz=16623.9, num_updates=56900, lr=0.000265139, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=52784 epoch 034: 1249 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=549234, ups=1.11, wpb=495328, bsz=16623.9, num_updates=56900, lr=0.000265139, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=52784 epoch 034: 1249 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=549234, ups=1.11, wpb=495328, bsz=16623.9, num_updates=56900, lr=0.000265139, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=52784 epoch 034: 1249 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=549234, ups=1.11, wpb=495328, bsz=16623.9, num_updates=56900, lr=0.000265139, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=52784 epoch 034: 1249 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=549234, ups=1.11, wpb=495328, bsz=16623.9, num_updates=56900, lr=0.000265139, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=52784 epoch 034: 1249 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=549234, ups=1.11, wpb=495328, bsz=16623.9, num_updates=56900, lr=0.000265139, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=52784 epoch 034: 1249 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=549234, ups=1.11, wpb=495328, bsz=16623.9, num_updates=56900, lr=0.000265139, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=52784 epoch 034: 1249 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=549234, ups=1.11, wpb=495328, bsz=16623.9, num_updates=56900, lr=0.000265139, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=52784 epoch 034: 1249 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=549234, ups=1.11, wpb=495328, bsz=16623.9, num_updates=56900, lr=0.000265139, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=52784 epoch 034: 1249 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=549234, ups=1.11, wpb=495328, bsz=16623.9, num_updates=56900, lr=0.000265139, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=52784 epoch 034: 1249 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=549234, ups=1.11, wpb=495328, bsz=16623.9, num_updates=56900, lr=0.000265139, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=52784 epoch 034: 1249 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=549234, ups=1.11, wpb=495328, bsz=16623.9, num_updates=56900, lr=0.000265139, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=52784 epoch 034: 1249 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=549234, ups=1.11, wpb=495328, bsz=16623.9, num_updates=56900, lr=0.000265139, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=52784 epoch 034: 1249 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=549234, ups=1.11, wpb=495328, bsz=16623.9, num_updates=56900, lr=0.000265139, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=52784 epoch 034: 1249 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=549234, ups=1.11, wpb=495328, bsz=16623.9, num_updates=56900, lr=0.000265139, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=52784 epoch 034: 1249 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=549234, ups=1.11, wpb=495328, bsz=16623.9, num_updates=56900, lr=0.000265139, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=52784 epoch 034: 1249 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=549234, ups=1.11, wpb=495328, bsz=16623.9, num_updates=56900, lr=0.000265139, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=52784 epoch 034: 1249 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=549234, ups=1.11, wpb=495328, bsz=16623.9, num_updates=56900, lr=0.000265139, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=52784 epoch 034: 1249 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=549234, ups=1.11, wpb=495328, bsz=16623.9, num_updates=56900, lr=0.000265139, gnorm=0.158, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=52784 epoch 034: 1349 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=550595, ups=1.11, wpb=496475, bsz=16754.2, num_updates=57000, lr=0.000264906, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=52875 epoch 034: 1349 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=550595, ups=1.11, wpb=496475, bsz=16754.2, num_updates=57000, lr=0.000264906, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=52875 epoch 034: 1349 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=550595, ups=1.11, wpb=496475, bsz=16754.2, num_updates=57000, lr=0.000264906, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=52875 epoch 034: 1349 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=550595, ups=1.11, wpb=496475, bsz=16754.2, num_updates=57000, lr=0.000264906, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=52875 epoch 034: 1349 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=550595, ups=1.11, wpb=496475, bsz=16754.2, num_updates=57000, lr=0.000264906, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=52875 epoch 034: 1349 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=550595, ups=1.11, wpb=496475, bsz=16754.2, num_updates=57000, lr=0.000264906, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=52875 epoch 034: 1349 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=550595, ups=1.11, wpb=496475, bsz=16754.2, num_updates=57000, lr=0.000264906, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=52875 epoch 034: 1349 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=550595, ups=1.11, wpb=496475, bsz=16754.2, num_updates=57000, lr=0.000264906, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=52875 epoch 034: 1349 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=550595, ups=1.11, wpb=496475, bsz=16754.2, num_updates=57000, lr=0.000264906, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=52875 epoch 034: 1349 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=550595, ups=1.11, wpb=496475, bsz=16754.2, num_updates=57000, lr=0.000264906, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=52875 epoch 034: 1349 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=550595, ups=1.11, wpb=496475, bsz=16754.2, num_updates=57000, lr=0.000264906, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=52875 epoch 034: 1349 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=550595, ups=1.11, wpb=496475, bsz=16754.2, num_updates=57000, lr=0.000264906, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=52875 epoch 034: 1349 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=550595, ups=1.11, wpb=496475, bsz=16754.2, num_updates=57000, lr=0.000264906, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=52875 epoch 034: 1349 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=550595, ups=1.11, wpb=496475, bsz=16754.2, num_updates=57000, lr=0.000264906, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=52875 epoch 034: 1349 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=550595, ups=1.11, wpb=496475, bsz=16754.2, num_updates=57000, lr=0.000264906, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=52875 epoch 034: 1349 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=550595, ups=1.11, wpb=496475, bsz=16754.2, num_updates=57000, lr=0.000264906, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=52875 epoch 034: 1349 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=550595, ups=1.11, wpb=496475, bsz=16754.2, num_updates=57000, lr=0.000264906, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=52875 epoch 034: 1349 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=550595, ups=1.11, wpb=496475, bsz=16754.2, num_updates=57000, lr=0.000264906, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=52875 epoch 034: 1349 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=550595, ups=1.11, wpb=496475, bsz=16754.2, num_updates=57000, lr=0.000264906, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=52875 epoch 034: 1349 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=550595, ups=1.11, wpb=496475, bsz=16754.2, num_updates=57000, lr=0.000264906, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=52875 epoch 034: 1349 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=550595, ups=1.11, wpb=496475, bsz=16754.2, num_updates=57000, lr=0.000264906, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=52875 epoch 034: 1349 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=550595, ups=1.11, wpb=496475, bsz=16754.2, num_updates=57000, lr=0.000264906, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=52875 epoch 034: 1349 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=550595, ups=1.11, wpb=496475, bsz=16754.2, num_updates=57000, lr=0.000264906, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=52875 epoch 034: 1349 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=550595, ups=1.11, wpb=496475, bsz=16754.2, num_updates=57000, lr=0.000264906, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=52875 epoch 034: 1349 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=550595, ups=1.11, wpb=496475, bsz=16754.2, num_updates=57000, lr=0.000264906, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=52875 epoch 034: 1349 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=550595, ups=1.11, wpb=496475, bsz=16754.2, num_updates=57000, lr=0.000264906, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=52875 epoch 034: 1349 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=550595, ups=1.11, wpb=496475, bsz=16754.2, num_updates=57000, lr=0.000264906, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=52875 epoch 034: 1349 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=550595, ups=1.11, wpb=496475, bsz=16754.2, num_updates=57000, lr=0.000264906, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=52875 epoch 034: 1349 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=550595, ups=1.11, wpb=496475, bsz=16754.2, num_updates=57000, lr=0.000264906, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=52875 epoch 034: 1349 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=550595, ups=1.11, wpb=496475, bsz=16754.2, num_updates=57000, lr=0.000264906, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=52875 epoch 034: 1349 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=550595, ups=1.11, wpb=496475, bsz=16754.2, num_updates=57000, lr=0.000264906, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=52875 epoch 034: 1349 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=550595, ups=1.11, wpb=496475, bsz=16754.2, num_updates=57000, lr=0.000264906, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=52875 epoch 034: 1349 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=550595, ups=1.11, wpb=496475, bsz=16754.2, num_updates=57000, lr=0.000264906, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=52875 epoch 034: 1349 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=550595, ups=1.11, wpb=496475, bsz=16754.2, num_updates=57000, lr=0.000264906, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.4, wall=52875 begin validation on "valid" subset epoch 034 | valid on 'valid' subset | loss 3.701 | nll_loss 2.14 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.701 epoch 034 | valid on 'valid' subset | loss 3.701 | nll_loss 2.14 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.701 epoch 034 | valid on 'valid' subset | loss 3.701 | nll_loss 2.14 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.701 epoch 034 | valid on 'valid' subset | loss 3.701 | nll_loss 2.14 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.701 epoch 034 | valid on 'valid' subset | loss 3.701 | nll_loss 2.14 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.701 epoch 034 | valid on 'valid' subset | loss 3.701 | nll_loss 2.14 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.701 epoch 034 | valid on 'valid' subset | loss 3.701 | nll_loss 2.14 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.701 epoch 034 | valid on 'valid' subset | loss 3.701 | nll_loss 2.14 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.701 epoch 034 | valid on 'valid' subset | loss 3.701 | nll_loss 2.14 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.701 epoch 034 | valid on 'valid' subset | loss 3.701 | nll_loss 2.14 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.701 epoch 034 | valid on 'valid' subset | loss 3.701 | nll_loss 2.14 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.701 epoch 034 | valid on 'valid' subset | loss 3.701 | nll_loss 2.14 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.701 epoch 034 | valid on 'valid' subset | loss 3.701 | nll_loss 2.14 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.701 epoch 034 | valid on 'valid' subset | loss 3.701 | nll_loss 2.14 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.701 epoch 034 | valid on 'valid' subset | loss 3.701 | nll_loss 2.14 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.701 epoch 034 | valid on 'valid' subset | loss 3.701 | nll_loss 2.14 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.701 epoch 034 | valid on 'valid' subset | loss 3.701 | nll_loss 2.14 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.701 epoch 034 | valid on 'valid' subset | loss 3.701 | nll_loss 2.14 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.701 epoch 034 | valid on 'valid' subset | loss 3.701 | nll_loss 2.14 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.701 epoch 034 | valid on 'valid' subset | loss 3.701 | nll_loss 2.14 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.701 epoch 034 | valid on 'valid' subset | loss 3.701 | nll_loss 2.14 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.701 epoch 034 | valid on 'valid' subset | loss 3.701 | nll_loss 2.14 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.701 epoch 034 | valid on 'valid' subset | loss 3.701 | nll_loss 2.14 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.701 epoch 034 | valid on 'valid' subset | loss 3.701 | nll_loss 2.14 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.701 epoch 034 | valid on 'valid' subset | loss 3.701 | nll_loss 2.14 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.701 epoch 034 | valid on 'valid' subset | loss 3.701 | nll_loss 2.14 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.701 epoch 034 | valid on 'valid' subset | loss 3.701 | nll_loss 2.14 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.701 epoch 034 | valid on 'valid' subset | loss 3.701 | nll_loss 2.14 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.701 epoch 034 | valid on 'valid' subset | loss 3.701 | nll_loss 2.14 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.701 epoch 034 | valid on 'valid' subset | loss 3.701 | nll_loss 2.14 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.701 epoch 034 | valid on 'valid' subset | loss 3.701 | nll_loss 2.14 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.701 epoch 034 | valid on 'valid' subset | loss 3.701 | nll_loss 2.14 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.701 epoch 034 | valid on 'valid' subset | loss 3.701 | nll_loss 2.14 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.701 epoch 034 | valid on 'valid' subset | loss 3.701 | nll_loss 2.14 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 57000 | best_loss 3.701 epoch 034: 1449 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=451335, ups=0.91, wpb=496073, bsz=16457, num_updates=57100, lr=0.000264674, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=52985 epoch 034: 1449 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=451335, ups=0.91, wpb=496073, bsz=16457, num_updates=57100, lr=0.000264674, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=52985 epoch 034: 1449 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=451335, ups=0.91, wpb=496073, bsz=16457, num_updates=57100, lr=0.000264674, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=52985 epoch 034: 1449 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=451335, ups=0.91, wpb=496073, bsz=16457, num_updates=57100, lr=0.000264674, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=52985 epoch 034: 1449 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=451335, ups=0.91, wpb=496073, bsz=16457, num_updates=57100, lr=0.000264674, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=52985 epoch 034: 1449 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=451335, ups=0.91, wpb=496073, bsz=16457, num_updates=57100, lr=0.000264674, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=52985 epoch 034: 1449 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=451335, ups=0.91, wpb=496073, bsz=16457, num_updates=57100, lr=0.000264674, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=52985 epoch 034: 1449 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=451335, ups=0.91, wpb=496073, bsz=16457, num_updates=57100, lr=0.000264674, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=52985 epoch 034: 1449 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=451335, ups=0.91, wpb=496073, bsz=16457, num_updates=57100, lr=0.000264674, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=52985 epoch 034: 1449 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=451335, ups=0.91, wpb=496073, bsz=16457, num_updates=57100, lr=0.000264674, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=52985 epoch 034: 1449 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=451335, ups=0.91, wpb=496073, bsz=16457, num_updates=57100, lr=0.000264674, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=52985 epoch 034: 1449 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=451335, ups=0.91, wpb=496073, bsz=16457, num_updates=57100, lr=0.000264674, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=52985 epoch 034: 1449 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=451335, ups=0.91, wpb=496073, bsz=16457, num_updates=57100, lr=0.000264674, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=52985 epoch 034: 1449 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=451335, ups=0.91, wpb=496073, bsz=16457, num_updates=57100, lr=0.000264674, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=52985 epoch 034: 1449 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=451335, ups=0.91, wpb=496073, bsz=16457, num_updates=57100, lr=0.000264674, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=52985 epoch 034: 1449 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=451335, ups=0.91, wpb=496073, bsz=16457, num_updates=57100, lr=0.000264674, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=52985 epoch 034: 1449 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=451335, ups=0.91, wpb=496073, bsz=16457, num_updates=57100, lr=0.000264674, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=52985 epoch 034: 1449 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=451335, ups=0.91, wpb=496073, bsz=16457, num_updates=57100, lr=0.000264674, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=52985 epoch 034: 1449 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=451335, ups=0.91, wpb=496073, bsz=16457, num_updates=57100, lr=0.000264674, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=52985 epoch 034: 1449 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=451335, ups=0.91, wpb=496073, bsz=16457, num_updates=57100, lr=0.000264674, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=52985 epoch 034: 1449 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=451335, ups=0.91, wpb=496073, bsz=16457, num_updates=57100, lr=0.000264674, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=52985 epoch 034: 1449 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=451335, ups=0.91, wpb=496073, bsz=16457, num_updates=57100, lr=0.000264674, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=52985 epoch 034: 1449 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=451335, ups=0.91, wpb=496073, bsz=16457, num_updates=57100, lr=0.000264674, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=52985 epoch 034: 1449 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=451335, ups=0.91, wpb=496073, bsz=16457, num_updates=57100, lr=0.000264674, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=52985 epoch 034: 1449 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=451335, ups=0.91, wpb=496073, bsz=16457, num_updates=57100, lr=0.000264674, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=52985 epoch 034: 1449 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=451335, ups=0.91, wpb=496073, bsz=16457, num_updates=57100, lr=0.000264674, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=52985 epoch 034: 1449 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=451335, ups=0.91, wpb=496073, bsz=16457, num_updates=57100, lr=0.000264674, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=52985 epoch 034: 1449 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=451335, ups=0.91, wpb=496073, bsz=16457, num_updates=57100, lr=0.000264674, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=52985 epoch 034: 1449 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=451335, ups=0.91, wpb=496073, bsz=16457, num_updates=57100, lr=0.000264674, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=52985 epoch 034: 1449 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=451335, ups=0.91, wpb=496073, bsz=16457, num_updates=57100, lr=0.000264674, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=52985 epoch 034: 1449 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=451335, ups=0.91, wpb=496073, bsz=16457, num_updates=57100, lr=0.000264674, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=52985 epoch 034: 1449 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=451335, ups=0.91, wpb=496073, bsz=16457, num_updates=57100, lr=0.000264674, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=52985 epoch 034: 1449 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=451335, ups=0.91, wpb=496073, bsz=16457, num_updates=57100, lr=0.000264674, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=52985 epoch 034: 1449 / 1689 loss=3.673, nll_loss=2.136, ppl=4.4, wps=451335, ups=0.91, wpb=496073, bsz=16457, num_updates=57100, lr=0.000264674, gnorm=0.175, clip=0, loss_scale=2, train_wall=88, gb_free=21.2, wall=52985 epoch 034: 1550 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=540791, ups=1.1, wpb=493547, bsz=16506.6, num_updates=57200, lr=0.000264443, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=53076 epoch 034: 1550 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=540791, ups=1.1, wpb=493547, bsz=16506.6, num_updates=57200, lr=0.000264443, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=53076 epoch 034: 1550 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=540791, ups=1.1, wpb=493547, bsz=16506.6, num_updates=57200, lr=0.000264443, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=53076 epoch 034: 1550 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=540791, ups=1.1, wpb=493547, bsz=16506.6, num_updates=57200, lr=0.000264443, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=53076 epoch 034: 1550 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=540791, ups=1.1, wpb=493547, bsz=16506.6, num_updates=57200, lr=0.000264443, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=53076 epoch 034: 1550 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=540791, ups=1.1, wpb=493547, bsz=16506.6, num_updates=57200, lr=0.000264443, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=53076 epoch 034: 1550 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=540791, ups=1.1, wpb=493547, bsz=16506.6, num_updates=57200, lr=0.000264443, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=53076 epoch 034: 1550 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=540791, ups=1.1, wpb=493547, bsz=16506.6, num_updates=57200, lr=0.000264443, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=53076 epoch 034: 1550 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=540791, ups=1.1, wpb=493547, bsz=16506.6, num_updates=57200, lr=0.000264443, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=53076 epoch 034: 1550 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=540791, ups=1.1, wpb=493547, bsz=16506.6, num_updates=57200, lr=0.000264443, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=53076 epoch 034: 1550 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=540791, ups=1.1, wpb=493547, bsz=16506.6, num_updates=57200, lr=0.000264443, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=53076 epoch 034: 1550 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=540791, ups=1.1, wpb=493547, bsz=16506.6, num_updates=57200, lr=0.000264443, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=53076 epoch 034: 1550 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=540791, ups=1.1, wpb=493547, bsz=16506.6, num_updates=57200, lr=0.000264443, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=53076 epoch 034: 1550 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=540791, ups=1.1, wpb=493547, bsz=16506.6, num_updates=57200, lr=0.000264443, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=53076 epoch 034: 1550 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=540791, ups=1.1, wpb=493547, bsz=16506.6, num_updates=57200, lr=0.000264443, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=53076 epoch 034: 1550 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=540791, ups=1.1, wpb=493547, bsz=16506.6, num_updates=57200, lr=0.000264443, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=53076 epoch 034: 1550 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=540791, ups=1.1, wpb=493547, bsz=16506.6, num_updates=57200, lr=0.000264443, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=53076 epoch 034: 1550 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=540791, ups=1.1, wpb=493547, bsz=16506.6, num_updates=57200, lr=0.000264443, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=53076 epoch 034: 1550 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=540791, ups=1.1, wpb=493547, bsz=16506.6, num_updates=57200, lr=0.000264443, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=53076 epoch 034: 1550 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=540791, ups=1.1, wpb=493547, bsz=16506.6, num_updates=57200, lr=0.000264443, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=53076 epoch 034: 1550 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=540791, ups=1.1, wpb=493547, bsz=16506.6, num_updates=57200, lr=0.000264443, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=53076 epoch 034: 1550 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=540791, ups=1.1, wpb=493547, bsz=16506.6, num_updates=57200, lr=0.000264443, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=53076 epoch 034: 1550 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=540791, ups=1.1, wpb=493547, bsz=16506.6, num_updates=57200, lr=0.000264443, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=53076 epoch 034: 1550 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=540791, ups=1.1, wpb=493547, bsz=16506.6, num_updates=57200, lr=0.000264443, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=53076 epoch 034: 1550 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=540791, ups=1.1, wpb=493547, bsz=16506.6, num_updates=57200, lr=0.000264443, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=53076 epoch 034: 1550 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=540791, ups=1.1, wpb=493547, bsz=16506.6, num_updates=57200, lr=0.000264443, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=53076 epoch 034: 1550 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=540791, ups=1.1, wpb=493547, bsz=16506.6, num_updates=57200, lr=0.000264443, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=53076 epoch 034: 1550 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=540791, ups=1.1, wpb=493547, bsz=16506.6, num_updates=57200, lr=0.000264443, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=53076 epoch 034: 1550 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=540791, ups=1.1, wpb=493547, bsz=16506.6, num_updates=57200, lr=0.000264443, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=53076 epoch 034: 1550 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=540791, ups=1.1, wpb=493547, bsz=16506.6, num_updates=57200, lr=0.000264443, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=53076 epoch 034: 1550 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=540791, ups=1.1, wpb=493547, bsz=16506.6, num_updates=57200, lr=0.000264443, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=53076 epoch 034: 1550 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=540791, ups=1.1, wpb=493547, bsz=16506.6, num_updates=57200, lr=0.000264443, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=53076 epoch 034: 1550 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=540791, ups=1.1, wpb=493547, bsz=16506.6, num_updates=57200, lr=0.000264443, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=53076 epoch 034: 1550 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=540791, ups=1.1, wpb=493547, bsz=16506.6, num_updates=57200, lr=0.000264443, gnorm=0.166, clip=0, loss_scale=2, train_wall=90, gb_free=21.6, wall=53076 epoch 034: 1650 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549917, ups=1.11, wpb=495833, bsz=16381.2, num_updates=57300, lr=0.000264212, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=20.7, wall=53166 epoch 034: 1650 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549917, ups=1.11, wpb=495833, bsz=16381.2, num_updates=57300, lr=0.000264212, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=20.7, wall=53166 epoch 034: 1650 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549917, ups=1.11, wpb=495833, bsz=16381.2, num_updates=57300, lr=0.000264212, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=20.7, wall=53166 epoch 034: 1650 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549917, ups=1.11, wpb=495833, bsz=16381.2, num_updates=57300, lr=0.000264212, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=20.7, wall=53166 epoch 034: 1650 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549917, ups=1.11, wpb=495833, bsz=16381.2, num_updates=57300, lr=0.000264212, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=20.7, wall=53166 epoch 034: 1650 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549917, ups=1.11, wpb=495833, bsz=16381.2, num_updates=57300, lr=0.000264212, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=20.7, wall=53166 epoch 034: 1650 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549917, ups=1.11, wpb=495833, bsz=16381.2, num_updates=57300, lr=0.000264212, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=20.7, wall=53166 epoch 034: 1650 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549917, ups=1.11, wpb=495833, bsz=16381.2, num_updates=57300, lr=0.000264212, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=20.7, wall=53166 epoch 034: 1650 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549917, ups=1.11, wpb=495833, bsz=16381.2, num_updates=57300, lr=0.000264212, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=20.7, wall=53166 epoch 034: 1650 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549917, ups=1.11, wpb=495833, bsz=16381.2, num_updates=57300, lr=0.000264212, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=20.7, wall=53166 epoch 034: 1650 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549917, ups=1.11, wpb=495833, bsz=16381.2, num_updates=57300, lr=0.000264212, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=20.7, wall=53166 epoch 034: 1650 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549917, ups=1.11, wpb=495833, bsz=16381.2, num_updates=57300, lr=0.000264212, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=20.7, wall=53166 epoch 034: 1650 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549917, ups=1.11, wpb=495833, bsz=16381.2, num_updates=57300, lr=0.000264212, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=20.7, wall=53166 epoch 034: 1650 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549917, ups=1.11, wpb=495833, bsz=16381.2, num_updates=57300, lr=0.000264212, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=20.7, wall=53166 epoch 034: 1650 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549917, ups=1.11, wpb=495833, bsz=16381.2, num_updates=57300, lr=0.000264212, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=20.7, wall=53166 epoch 034: 1650 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549917, ups=1.11, wpb=495833, bsz=16381.2, num_updates=57300, lr=0.000264212, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=20.7, wall=53166 epoch 034: 1650 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549917, ups=1.11, wpb=495833, bsz=16381.2, num_updates=57300, lr=0.000264212, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=20.7, wall=53166 epoch 034: 1650 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549917, ups=1.11, wpb=495833, bsz=16381.2, num_updates=57300, lr=0.000264212, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=20.7, wall=53166 epoch 034: 1650 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549917, ups=1.11, wpb=495833, bsz=16381.2, num_updates=57300, lr=0.000264212, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=20.7, wall=53166 epoch 034: 1650 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549917, ups=1.11, wpb=495833, bsz=16381.2, num_updates=57300, lr=0.000264212, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=20.7, wall=53166 epoch 034: 1650 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549917, ups=1.11, wpb=495833, bsz=16381.2, num_updates=57300, lr=0.000264212, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=20.7, wall=53166 epoch 034: 1650 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549917, ups=1.11, wpb=495833, bsz=16381.2, num_updates=57300, lr=0.000264212, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=20.7, wall=53166 epoch 034: 1650 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549917, ups=1.11, wpb=495833, bsz=16381.2, num_updates=57300, lr=0.000264212, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=20.7, wall=53166 epoch 034: 1650 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549917, ups=1.11, wpb=495833, bsz=16381.2, num_updates=57300, lr=0.000264212, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=20.7, wall=53166 epoch 034: 1650 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549917, ups=1.11, wpb=495833, bsz=16381.2, num_updates=57300, lr=0.000264212, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=20.7, wall=53166 epoch 034: 1650 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549917, ups=1.11, wpb=495833, bsz=16381.2, num_updates=57300, lr=0.000264212, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=20.7, wall=53166 epoch 034: 1650 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549917, ups=1.11, wpb=495833, bsz=16381.2, num_updates=57300, lr=0.000264212, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=20.7, wall=53166 epoch 034: 1650 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549917, ups=1.11, wpb=495833, bsz=16381.2, num_updates=57300, lr=0.000264212, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=20.7, wall=53166 epoch 034: 1650 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549917, ups=1.11, wpb=495833, bsz=16381.2, num_updates=57300, lr=0.000264212, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=20.7, wall=53166 epoch 034: 1650 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549917, ups=1.11, wpb=495833, bsz=16381.2, num_updates=57300, lr=0.000264212, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=20.7, wall=53166 epoch 034: 1650 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549917, ups=1.11, wpb=495833, bsz=16381.2, num_updates=57300, lr=0.000264212, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=20.7, wall=53166 epoch 034: 1650 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549917, ups=1.11, wpb=495833, bsz=16381.2, num_updates=57300, lr=0.000264212, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=20.7, wall=53166 epoch 034: 1650 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549917, ups=1.11, wpb=495833, bsz=16381.2, num_updates=57300, lr=0.000264212, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=20.7, wall=53166 epoch 034: 1650 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549917, ups=1.11, wpb=495833, bsz=16381.2, num_updates=57300, lr=0.000264212, gnorm=0.159, clip=0, loss_scale=2, train_wall=89, gb_free=20.7, wall=53166 end of epoch 34 (average epoch stats below) epoch 034 | loss 3.661 | nll_loss 2.123 | ppl 4.35 | wps 535630 | ups 1.08 | wpb 495116 | bsz 16506.1 | num_updates 57339 | lr 0.000264122 | gnorm 0.165 | clip 0 | loss_scale 2 | train_wall 1497 | gb_free 22.1 | wall 53200 epoch 034 | loss 3.661 | nll_loss 2.123 | ppl 4.35 | wps 535630 | ups 1.08 | wpb 495116 | bsz 16506.1 | num_updates 57339 | lr 0.000264122 | gnorm 0.165 | clip 0 | loss_scale 2 | train_wall 1497 | gb_free 22.1 | wall 53200 epoch 034 | loss 3.661 | nll_loss 2.123 | ppl 4.35 | wps 535630 | ups 1.08 | wpb 495116 | bsz 16506.1 | num_updates 57339 | lr 0.000264122 | gnorm 0.165 | clip 0 | loss_scale 2 | train_wall 1497 | gb_free 22.1 | wall 53200 epoch 034 | loss 3.661 | nll_loss 2.123 | ppl 4.35 | wps 535630 | ups 1.08 | wpb 495116 | bsz 16506.1 | num_updates 57339 | lr 0.000264122 | gnorm 0.165 | clip 0 | loss_scale 2 | train_wall 1497 | gb_free 22.1 | wall 53200 epoch 034 | loss 3.661 | nll_loss 2.123 | ppl 4.35 | wps 535630 | ups 1.08 | wpb 495116 | bsz 16506.1 | num_updates 57339 | lr 0.000264122 | gnorm 0.165 | clip 0 | loss_scale 2 | train_wall 1497 | gb_free 22.1 | wall 53200 epoch 034 | loss 3.661 | nll_loss 2.123 | ppl 4.35 | wps 535630 | ups 1.08 | wpb 495116 | bsz 16506.1 | num_updates 57339 | lr 0.000264122 | gnorm 0.165 | clip 0 | loss_scale 2 | train_wall 1497 | gb_free 22.1 | wall 53200 epoch 034 | loss 3.661 | nll_loss 2.123 | ppl 4.35 | wps 535630 | ups 1.08 | wpb 495116 | bsz 16506.1 | num_updates 57339 | lr 0.000264122 | gnorm 0.165 | clip 0 | loss_scale 2 | train_wall 1497 | gb_free 22.1 | wall 53200 epoch 034 | loss 3.661 | nll_loss 2.123 | ppl 4.35 | wps 535630 | ups 1.08 | wpb 495116 | bsz 16506.1 | num_updates 57339 | lr 0.000264122 | gnorm 0.165 | clip 0 | loss_scale 2 | train_wall 1497 | gb_free 22.1 | wall 53200 epoch 034 | loss 3.661 | nll_loss 2.123 | ppl 4.35 | wps 535630 | ups 1.08 | wpb 495116 | bsz 16506.1 | num_updates 57339 | lr 0.000264122 | gnorm 0.165 | clip 0 | loss_scale 2 | train_wall 1497 | gb_free 22.1 | wall 53200 epoch 034 | loss 3.661 | nll_loss 2.123 | ppl 4.35 | wps 535630 | ups 1.08 | wpb 495116 | bsz 16506.1 | num_updates 57339 | lr 0.000264122 | gnorm 0.165 | clip 0 | loss_scale 2 | train_wall 1497 | gb_free 22.1 | wall 53200 epoch 034 | loss 3.661 | nll_loss 2.123 | ppl 4.35 | wps 535630 | ups 1.08 | wpb 495116 | bsz 16506.1 | num_updates 57339 | lr 0.000264122 | gnorm 0.165 | clip 0 | loss_scale 2 | train_wall 1497 | gb_free 22.1 | wall 53200 epoch 034 | loss 3.661 | nll_loss 2.123 | ppl 4.35 | wps 535630 | ups 1.08 | wpb 495116 | bsz 16506.1 | num_updates 57339 | lr 0.000264122 | gnorm 0.165 | clip 0 | loss_scale 2 | train_wall 1497 | gb_free 22.1 | wall 53200 epoch 034 | loss 3.661 | nll_loss 2.123 | ppl 4.35 | wps 535630 | ups 1.08 | wpb 495116 | bsz 16506.1 | num_updates 57339 | lr 0.000264122 | gnorm 0.165 | clip 0 | loss_scale 2 | train_wall 1497 | gb_free 22.1 | wall 53200 epoch 034 | loss 3.661 | nll_loss 2.123 | ppl 4.35 | wps 535630 | ups 1.08 | wpb 495116 | bsz 16506.1 | num_updates 57339 | lr 0.000264122 | gnorm 0.165 | clip 0 | loss_scale 2 | train_wall 1497 | gb_free 22.1 | wall 53200 epoch 034 | loss 3.661 | nll_loss 2.123 | ppl 4.35 | wps 535630 | ups 1.08 | wpb 495116 | bsz 16506.1 | num_updates 57339 | lr 0.000264122 | gnorm 0.165 | clip 0 | loss_scale 2 | train_wall 1497 | gb_free 22.1 | wall 53200 epoch 034 | loss 3.661 | nll_loss 2.123 | ppl 4.35 | wps 535630 | ups 1.08 | wpb 495116 | bsz 16506.1 | num_updates 57339 | lr 0.000264122 | gnorm 0.165 | clip 0 | loss_scale 2 | train_wall 1497 | gb_free 22.1 | wall 53200 epoch 034 | loss 3.661 | nll_loss 2.123 | ppl 4.35 | wps 535630 | ups 1.08 | wpb 495116 | bsz 16506.1 | num_updates 57339 | lr 0.000264122 | gnorm 0.165 | clip 0 | loss_scale 2 | train_wall 1497 | gb_free 22.1 | wall 53200 epoch 034 | loss 3.661 | nll_loss 2.123 | ppl 4.35 | wps 535630 | ups 1.08 | wpb 495116 | bsz 16506.1 | num_updates 57339 | lr 0.000264122 | gnorm 0.165 | clip 0 | loss_scale 2 | train_wall 1497 | gb_free 22.1 | wall 53200 epoch 034 | loss 3.661 | nll_loss 2.123 | ppl 4.35 | wps 535630 | ups 1.08 | wpb 495116 | bsz 16506.1 | num_updates 57339 | lr 0.000264122 | gnorm 0.165 | clip 0 | loss_scale 2 | train_wall 1497 | gb_free 22.1 | wall 53200 epoch 034 | loss 3.661 | nll_loss 2.123 | ppl 4.35 | wps 535630 | ups 1.08 | wpb 495116 | bsz 16506.1 | num_updates 57339 | lr 0.000264122 | gnorm 0.165 | clip 0 | loss_scale 2 | train_wall 1497 | gb_free 22.1 | wall 53200 epoch 034 | loss 3.661 | nll_loss 2.123 | ppl 4.35 | wps 535630 | ups 1.08 | wpb 495116 | bsz 16506.1 | num_updates 57339 | lr 0.000264122 | gnorm 0.165 | clip 0 | loss_scale 2 | train_wall 1497 | gb_free 22.1 | wall 53200 epoch 034 | loss 3.661 | nll_loss 2.123 | ppl 4.35 | wps 535630 | ups 1.08 | wpb 495116 | bsz 16506.1 | num_updates 57339 | lr 0.000264122 | gnorm 0.165 | clip 0 | loss_scale 2 | train_wall 1497 | gb_free 22.1 | wall 53200 epoch 034 | loss 3.661 | nll_loss 2.123 | ppl 4.35 | wps 535630 | ups 1.08 | wpb 495116 | bsz 16506.1 | num_updates 57339 | lr 0.000264122 | gnorm 0.165 | clip 0 | loss_scale 2 | train_wall 1497 | gb_free 22.1 | wall 53200 epoch 034 | loss 3.661 | nll_loss 2.123 | ppl 4.35 | wps 535630 | ups 1.08 | wpb 495116 | bsz 16506.1 | num_updates 57339 | lr 0.000264122 | gnorm 0.165 | clip 0 | loss_scale 2 | train_wall 1497 | gb_free 22.1 | wall 53200 epoch 034 | loss 3.661 | nll_loss 2.123 | ppl 4.35 | wps 535630 | ups 1.08 | wpb 495116 | bsz 16506.1 | num_updates 57339 | lr 0.000264122 | gnorm 0.165 | clip 0 | loss_scale 2 | train_wall 1497 | gb_free 22.1 | wall 53200 epoch 034 | loss 3.661 | nll_loss 2.123 | ppl 4.35 | wps 535630 | ups 1.08 | wpb 495116 | bsz 16506.1 | num_updates 57339 | lr 0.000264122 | gnorm 0.165 | clip 0 | loss_scale 2 | train_wall 1497 | gb_free 22.1 | wall 53200 epoch 034 | loss 3.661 | nll_loss 2.123 | ppl 4.35 | wps 535630 | ups 1.08 | wpb 495116 | bsz 16506.1 | num_updates 57339 | lr 0.000264122 | gnorm 0.165 | clip 0 | loss_scale 2 | train_wall 1497 | gb_free 22.1 | wall 53200 epoch 034 | loss 3.661 | nll_loss 2.123 | ppl 4.35 | wps 535630 | ups 1.08 | wpb 495116 | bsz 16506.1 | num_updates 57339 | lr 0.000264122 | gnorm 0.165 | clip 0 | loss_scale 2 | train_wall 1497 | gb_free 22.1 | wall 53200 epoch 034 | loss 3.661 | nll_loss 2.123 | ppl 4.35 | wps 535630 | ups 1.08 | wpb 495116 | bsz 16506.1 | num_updates 57339 | lr 0.000264122 | gnorm 0.165 | clip 0 | loss_scale 2 | train_wall 1497 | gb_free 22.1 | wall 53200 epoch 034 | loss 3.661 | nll_loss 2.123 | ppl 4.35 | wps 535630 | ups 1.08 | wpb 495116 | bsz 16506.1 | num_updates 57339 | lr 0.000264122 | gnorm 0.165 | clip 0 | loss_scale 2 | train_wall 1497 | gb_free 22.1 | wall 53200 epoch 034 | loss 3.661 | nll_loss 2.123 | ppl 4.35 | wps 535630 | ups 1.08 | wpb 495116 | bsz 16506.1 | num_updates 57339 | lr 0.000264122 | gnorm 0.165 | clip 0 | loss_scale 2 | train_wall 1497 | gb_free 22.1 | wall 53200 epoch 034 | loss 3.661 | nll_loss 2.123 | ppl 4.35 | wps 535630 | ups 1.08 | wpb 495116 | bsz 16506.1 | num_updates 57339 | lr 0.000264122 | gnorm 0.165 | clip 0 | loss_scale 2 | train_wall 1497 | gb_free 22.1 | wall 53200 epoch 034 | loss 3.661 | nll_loss 2.123 | ppl 4.35 | wps 535630 | ups 1.08 | wpb 495116 | bsz 16506.1 | num_updates 57339 | lr 0.000264122 | gnorm 0.165 | clip 0 | loss_scale 2 | train_wall 1497 | gb_free 22.1 | wall 53200 epoch 034 | loss 3.661 | nll_loss 2.123 | ppl 4.35 | wps 535630 | ups 1.08 | wpb 495116 | bsz 16506.1 | num_updates 57339 | lr 0.000264122 | gnorm 0.165 | clip 0 | loss_scale 2 | train_wall 1497 | gb_free 22.1 | wall 53200 Start iterating over samples epoch 035: 61 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=543418, ups=1.1, wpb=491928, bsz=16352.6, num_updates=57400, lr=0.000263982, gnorm=0.162, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=53256 epoch 035: 61 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=543418, ups=1.1, wpb=491928, bsz=16352.6, num_updates=57400, lr=0.000263982, gnorm=0.162, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=53256 epoch 035: 61 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=543418, ups=1.1, wpb=491928, bsz=16352.6, num_updates=57400, lr=0.000263982, gnorm=0.162, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=53256 epoch 035: 61 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=543418, ups=1.1, wpb=491928, bsz=16352.6, num_updates=57400, lr=0.000263982, gnorm=0.162, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=53256 epoch 035: 61 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=543418, ups=1.1, wpb=491928, bsz=16352.6, num_updates=57400, lr=0.000263982, gnorm=0.162, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=53256 epoch 035: 61 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=543418, ups=1.1, wpb=491928, bsz=16352.6, num_updates=57400, lr=0.000263982, gnorm=0.162, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=53256 epoch 035: 61 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=543418, ups=1.1, wpb=491928, bsz=16352.6, num_updates=57400, lr=0.000263982, gnorm=0.162, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=53256 epoch 035: 61 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=543418, ups=1.1, wpb=491928, bsz=16352.6, num_updates=57400, lr=0.000263982, gnorm=0.162, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=53256 epoch 035: 61 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=543418, ups=1.1, wpb=491928, bsz=16352.6, num_updates=57400, lr=0.000263982, gnorm=0.162, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=53256 epoch 035: 61 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=543418, ups=1.1, wpb=491928, bsz=16352.6, num_updates=57400, lr=0.000263982, gnorm=0.162, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=53256 epoch 035: 61 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=543418, ups=1.1, wpb=491928, bsz=16352.6, num_updates=57400, lr=0.000263982, gnorm=0.162, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=53256 epoch 035: 61 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=543418, ups=1.1, wpb=491928, bsz=16352.6, num_updates=57400, lr=0.000263982, gnorm=0.162, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=53256 epoch 035: 61 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=543418, ups=1.1, wpb=491928, bsz=16352.6, num_updates=57400, lr=0.000263982, gnorm=0.162, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=53256 epoch 035: 61 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=543418, ups=1.1, wpb=491928, bsz=16352.6, num_updates=57400, lr=0.000263982, gnorm=0.162, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=53256 epoch 035: 61 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=543418, ups=1.1, wpb=491928, bsz=16352.6, num_updates=57400, lr=0.000263982, gnorm=0.162, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=53256 epoch 035: 61 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=543418, ups=1.1, wpb=491928, bsz=16352.6, num_updates=57400, lr=0.000263982, gnorm=0.162, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=53256 epoch 035: 61 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=543418, ups=1.1, wpb=491928, bsz=16352.6, num_updates=57400, lr=0.000263982, gnorm=0.162, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=53256 epoch 035: 61 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=543418, ups=1.1, wpb=491928, bsz=16352.6, num_updates=57400, lr=0.000263982, gnorm=0.162, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=53256 epoch 035: 61 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=543418, ups=1.1, wpb=491928, bsz=16352.6, num_updates=57400, lr=0.000263982, gnorm=0.162, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=53256 epoch 035: 61 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=543418, ups=1.1, wpb=491928, bsz=16352.6, num_updates=57400, lr=0.000263982, gnorm=0.162, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=53256 epoch 035: 61 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=543418, ups=1.1, wpb=491928, bsz=16352.6, num_updates=57400, lr=0.000263982, gnorm=0.162, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=53256 epoch 035: 61 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=543418, ups=1.1, wpb=491928, bsz=16352.6, num_updates=57400, lr=0.000263982, gnorm=0.162, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=53256 epoch 035: 61 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=543418, ups=1.1, wpb=491928, bsz=16352.6, num_updates=57400, lr=0.000263982, gnorm=0.162, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=53256 epoch 035: 61 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=543418, ups=1.1, wpb=491928, bsz=16352.6, num_updates=57400, lr=0.000263982, gnorm=0.162, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=53256 epoch 035: 61 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=543418, ups=1.1, wpb=491928, bsz=16352.6, num_updates=57400, lr=0.000263982, gnorm=0.162, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=53256 epoch 035: 61 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=543418, ups=1.1, wpb=491928, bsz=16352.6, num_updates=57400, lr=0.000263982, gnorm=0.162, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=53256 epoch 035: 61 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=543418, ups=1.1, wpb=491928, bsz=16352.6, num_updates=57400, lr=0.000263982, gnorm=0.162, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=53256 epoch 035: 61 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=543418, ups=1.1, wpb=491928, bsz=16352.6, num_updates=57400, lr=0.000263982, gnorm=0.162, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=53256 epoch 035: 61 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=543418, ups=1.1, wpb=491928, bsz=16352.6, num_updates=57400, lr=0.000263982, gnorm=0.162, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=53256 epoch 035: 61 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=543418, ups=1.1, wpb=491928, bsz=16352.6, num_updates=57400, lr=0.000263982, gnorm=0.162, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=53256 epoch 035: 61 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=543418, ups=1.1, wpb=491928, bsz=16352.6, num_updates=57400, lr=0.000263982, gnorm=0.162, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=53256 epoch 035: 61 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=543418, ups=1.1, wpb=491928, bsz=16352.6, num_updates=57400, lr=0.000263982, gnorm=0.162, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=53256 epoch 035: 61 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=543418, ups=1.1, wpb=491928, bsz=16352.6, num_updates=57400, lr=0.000263982, gnorm=0.162, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=53256 epoch 035: 61 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=543418, ups=1.1, wpb=491928, bsz=16352.6, num_updates=57400, lr=0.000263982, gnorm=0.162, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=53256 epoch 035: 61 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=543418, ups=1.1, wpb=491928, bsz=16352.6, num_updates=57400, lr=0.000263982, gnorm=0.162, clip=0, loss_scale=2, train_wall=87, gb_free=21.4, wall=53256 epoch 035: 161 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=547784, ups=1.11, wpb=493902, bsz=16331.8, num_updates=57500, lr=0.000263752, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=53347 epoch 035: 161 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=547784, ups=1.11, wpb=493902, bsz=16331.8, num_updates=57500, lr=0.000263752, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=53347 epoch 035: 161 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=547784, ups=1.11, wpb=493902, bsz=16331.8, num_updates=57500, lr=0.000263752, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=53347 epoch 035: 161 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=547784, ups=1.11, wpb=493902, bsz=16331.8, num_updates=57500, lr=0.000263752, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=53347 epoch 035: 161 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=547784, ups=1.11, wpb=493902, bsz=16331.8, num_updates=57500, lr=0.000263752, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=53347 epoch 035: 161 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=547784, ups=1.11, wpb=493902, bsz=16331.8, num_updates=57500, lr=0.000263752, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=53347 epoch 035: 161 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=547784, ups=1.11, wpb=493902, bsz=16331.8, num_updates=57500, lr=0.000263752, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=53347 epoch 035: 161 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=547784, ups=1.11, wpb=493902, bsz=16331.8, num_updates=57500, lr=0.000263752, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=53347 epoch 035: 161 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=547784, ups=1.11, wpb=493902, bsz=16331.8, num_updates=57500, lr=0.000263752, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=53347 epoch 035: 161 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=547784, ups=1.11, wpb=493902, bsz=16331.8, num_updates=57500, lr=0.000263752, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=53347 epoch 035: 161 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=547784, ups=1.11, wpb=493902, bsz=16331.8, num_updates=57500, lr=0.000263752, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=53347 epoch 035: 161 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=547784, ups=1.11, wpb=493902, bsz=16331.8, num_updates=57500, lr=0.000263752, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=53347 epoch 035: 161 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=547784, ups=1.11, wpb=493902, bsz=16331.8, num_updates=57500, lr=0.000263752, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=53347 epoch 035: 161 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=547784, ups=1.11, wpb=493902, bsz=16331.8, num_updates=57500, lr=0.000263752, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=53347 epoch 035: 161 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=547784, ups=1.11, wpb=493902, bsz=16331.8, num_updates=57500, lr=0.000263752, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=53347 epoch 035: 161 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=547784, ups=1.11, wpb=493902, bsz=16331.8, num_updates=57500, lr=0.000263752, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=53347 epoch 035: 161 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=547784, ups=1.11, wpb=493902, bsz=16331.8, num_updates=57500, lr=0.000263752, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=53347 epoch 035: 161 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=547784, ups=1.11, wpb=493902, bsz=16331.8, num_updates=57500, lr=0.000263752, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=53347 epoch 035: 161 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=547784, ups=1.11, wpb=493902, bsz=16331.8, num_updates=57500, lr=0.000263752, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=53347 epoch 035: 161 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=547784, ups=1.11, wpb=493902, bsz=16331.8, num_updates=57500, lr=0.000263752, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=53347 epoch 035: 161 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=547784, ups=1.11, wpb=493902, bsz=16331.8, num_updates=57500, lr=0.000263752, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=53347 epoch 035: 161 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=547784, ups=1.11, wpb=493902, bsz=16331.8, num_updates=57500, lr=0.000263752, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=53347 epoch 035: 161 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=547784, ups=1.11, wpb=493902, bsz=16331.8, num_updates=57500, lr=0.000263752, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=53347 epoch 035: 161 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=547784, ups=1.11, wpb=493902, bsz=16331.8, num_updates=57500, lr=0.000263752, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=53347 epoch 035: 161 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=547784, ups=1.11, wpb=493902, bsz=16331.8, num_updates=57500, lr=0.000263752, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=53347 epoch 035: 161 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=547784, ups=1.11, wpb=493902, bsz=16331.8, num_updates=57500, lr=0.000263752, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=53347 epoch 035: 161 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=547784, ups=1.11, wpb=493902, bsz=16331.8, num_updates=57500, lr=0.000263752, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=53347 epoch 035: 161 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=547784, ups=1.11, wpb=493902, bsz=16331.8, num_updates=57500, lr=0.000263752, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=53347 epoch 035: 161 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=547784, ups=1.11, wpb=493902, bsz=16331.8, num_updates=57500, lr=0.000263752, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=53347 epoch 035: 161 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=547784, ups=1.11, wpb=493902, bsz=16331.8, num_updates=57500, lr=0.000263752, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=53347 epoch 035: 161 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=547784, ups=1.11, wpb=493902, bsz=16331.8, num_updates=57500, lr=0.000263752, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=53347 epoch 035: 161 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=547784, ups=1.11, wpb=493902, bsz=16331.8, num_updates=57500, lr=0.000263752, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=53347 epoch 035: 161 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=547784, ups=1.11, wpb=493902, bsz=16331.8, num_updates=57500, lr=0.000263752, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=53347 epoch 035: 161 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=547784, ups=1.11, wpb=493902, bsz=16331.8, num_updates=57500, lr=0.000263752, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=53347 epoch 035: 161 / 1689 loss=3.648, nll_loss=2.108, ppl=4.31, wps=547784, ups=1.11, wpb=493902, bsz=16331.8, num_updates=57500, lr=0.000263752, gnorm=0.161, clip=0, loss_scale=2, train_wall=89, gb_free=21.6, wall=53347 epoch 035: 261 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=551752, ups=1.11, wpb=496763, bsz=16548.1, num_updates=57600, lr=0.000263523, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=53437 epoch 035: 261 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=551752, ups=1.11, wpb=496763, bsz=16548.1, num_updates=57600, lr=0.000263523, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=53437 epoch 035: 261 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=551752, ups=1.11, wpb=496763, bsz=16548.1, num_updates=57600, lr=0.000263523, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=53437 epoch 035: 261 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=551752, ups=1.11, wpb=496763, bsz=16548.1, num_updates=57600, lr=0.000263523, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=53437 epoch 035: 261 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=551752, ups=1.11, wpb=496763, bsz=16548.1, num_updates=57600, lr=0.000263523, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=53437 epoch 035: 261 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=551752, ups=1.11, wpb=496763, bsz=16548.1, num_updates=57600, lr=0.000263523, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=53437 epoch 035: 261 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=551752, ups=1.11, wpb=496763, bsz=16548.1, num_updates=57600, lr=0.000263523, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=53437 epoch 035: 261 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=551752, ups=1.11, wpb=496763, bsz=16548.1, num_updates=57600, lr=0.000263523, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=53437 epoch 035: 261 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=551752, ups=1.11, wpb=496763, bsz=16548.1, num_updates=57600, lr=0.000263523, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=53437 epoch 035: 261 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=551752, ups=1.11, wpb=496763, bsz=16548.1, num_updates=57600, lr=0.000263523, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=53437 epoch 035: 261 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=551752, ups=1.11, wpb=496763, bsz=16548.1, num_updates=57600, lr=0.000263523, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=53437 epoch 035: 261 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=551752, ups=1.11, wpb=496763, bsz=16548.1, num_updates=57600, lr=0.000263523, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=53437 epoch 035: 261 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=551752, ups=1.11, wpb=496763, bsz=16548.1, num_updates=57600, lr=0.000263523, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=53437 epoch 035: 261 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=551752, ups=1.11, wpb=496763, bsz=16548.1, num_updates=57600, lr=0.000263523, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=53437 epoch 035: 261 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=551752, ups=1.11, wpb=496763, bsz=16548.1, num_updates=57600, lr=0.000263523, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=53437 epoch 035: 261 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=551752, ups=1.11, wpb=496763, bsz=16548.1, num_updates=57600, lr=0.000263523, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=53437 epoch 035: 261 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=551752, ups=1.11, wpb=496763, bsz=16548.1, num_updates=57600, lr=0.000263523, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=53437 epoch 035: 261 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=551752, ups=1.11, wpb=496763, bsz=16548.1, num_updates=57600, lr=0.000263523, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=53437 epoch 035: 261 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=551752, ups=1.11, wpb=496763, bsz=16548.1, num_updates=57600, lr=0.000263523, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=53437 epoch 035: 261 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=551752, ups=1.11, wpb=496763, bsz=16548.1, num_updates=57600, lr=0.000263523, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=53437 epoch 035: 261 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=551752, ups=1.11, wpb=496763, bsz=16548.1, num_updates=57600, lr=0.000263523, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=53437 epoch 035: 261 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=551752, ups=1.11, wpb=496763, bsz=16548.1, num_updates=57600, lr=0.000263523, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=53437 epoch 035: 261 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=551752, ups=1.11, wpb=496763, bsz=16548.1, num_updates=57600, lr=0.000263523, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=53437 epoch 035: 261 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=551752, ups=1.11, wpb=496763, bsz=16548.1, num_updates=57600, lr=0.000263523, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=53437 epoch 035: 261 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=551752, ups=1.11, wpb=496763, bsz=16548.1, num_updates=57600, lr=0.000263523, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=53437 epoch 035: 261 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=551752, ups=1.11, wpb=496763, bsz=16548.1, num_updates=57600, lr=0.000263523, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=53437 epoch 035: 261 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=551752, ups=1.11, wpb=496763, bsz=16548.1, num_updates=57600, lr=0.000263523, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=53437 epoch 035: 261 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=551752, ups=1.11, wpb=496763, bsz=16548.1, num_updates=57600, lr=0.000263523, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=53437 epoch 035: 261 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=551752, ups=1.11, wpb=496763, bsz=16548.1, num_updates=57600, lr=0.000263523, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=53437 epoch 035: 261 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=551752, ups=1.11, wpb=496763, bsz=16548.1, num_updates=57600, lr=0.000263523, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=53437 epoch 035: 261 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=551752, ups=1.11, wpb=496763, bsz=16548.1, num_updates=57600, lr=0.000263523, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=53437 epoch 035: 261 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=551752, ups=1.11, wpb=496763, bsz=16548.1, num_updates=57600, lr=0.000263523, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=53437 epoch 035: 261 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=551752, ups=1.11, wpb=496763, bsz=16548.1, num_updates=57600, lr=0.000263523, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=53437 epoch 035: 261 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=551752, ups=1.11, wpb=496763, bsz=16548.1, num_updates=57600, lr=0.000263523, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=53437 epoch 035: 261 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=551752, ups=1.11, wpb=496763, bsz=16548.1, num_updates=57600, lr=0.000263523, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=53437 epoch 035: 361 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=544244, ups=1.1, wpb=496076, bsz=16509.7, num_updates=57700, lr=0.000263295, gnorm=0.17, clip=0, loss_scale=4, train_wall=90, gb_free=21.7, wall=53528 epoch 035: 361 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=544244, ups=1.1, wpb=496076, bsz=16509.7, num_updates=57700, lr=0.000263295, gnorm=0.17, clip=0, loss_scale=4, train_wall=90, gb_free=21.7, wall=53528 epoch 035: 361 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=544244, ups=1.1, wpb=496076, bsz=16509.7, num_updates=57700, lr=0.000263295, gnorm=0.17, clip=0, loss_scale=4, train_wall=90, gb_free=21.7, wall=53528 epoch 035: 361 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=544244, ups=1.1, wpb=496076, bsz=16509.7, num_updates=57700, lr=0.000263295, gnorm=0.17, clip=0, loss_scale=4, train_wall=90, gb_free=21.7, wall=53528 epoch 035: 361 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=544244, ups=1.1, wpb=496076, bsz=16509.7, num_updates=57700, lr=0.000263295, gnorm=0.17, clip=0, loss_scale=4, train_wall=90, gb_free=21.7, wall=53528 epoch 035: 361 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=544244, ups=1.1, wpb=496076, bsz=16509.7, num_updates=57700, lr=0.000263295, gnorm=0.17, clip=0, loss_scale=4, train_wall=90, gb_free=21.7, wall=53528 epoch 035: 361 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=544244, ups=1.1, wpb=496076, bsz=16509.7, num_updates=57700, lr=0.000263295, gnorm=0.17, clip=0, loss_scale=4, train_wall=90, gb_free=21.7, wall=53528 epoch 035: 361 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=544244, ups=1.1, wpb=496076, bsz=16509.7, num_updates=57700, lr=0.000263295, gnorm=0.17, clip=0, loss_scale=4, train_wall=90, gb_free=21.7, wall=53528 epoch 035: 361 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=544244, ups=1.1, wpb=496076, bsz=16509.7, num_updates=57700, lr=0.000263295, gnorm=0.17, clip=0, loss_scale=4, train_wall=90, gb_free=21.7, wall=53528 epoch 035: 361 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=544244, ups=1.1, wpb=496076, bsz=16509.7, num_updates=57700, lr=0.000263295, gnorm=0.17, clip=0, loss_scale=4, train_wall=90, gb_free=21.7, wall=53528 epoch 035: 361 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=544244, ups=1.1, wpb=496076, bsz=16509.7, num_updates=57700, lr=0.000263295, gnorm=0.17, clip=0, loss_scale=4, train_wall=90, gb_free=21.7, wall=53528 epoch 035: 361 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=544244, ups=1.1, wpb=496076, bsz=16509.7, num_updates=57700, lr=0.000263295, gnorm=0.17, clip=0, loss_scale=4, train_wall=90, gb_free=21.7, wall=53528 epoch 035: 361 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=544244, ups=1.1, wpb=496076, bsz=16509.7, num_updates=57700, lr=0.000263295, gnorm=0.17, clip=0, loss_scale=4, train_wall=90, gb_free=21.7, wall=53528 epoch 035: 361 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=544244, ups=1.1, wpb=496076, bsz=16509.7, num_updates=57700, lr=0.000263295, gnorm=0.17, clip=0, loss_scale=4, train_wall=90, gb_free=21.7, wall=53528 epoch 035: 361 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=544244, ups=1.1, wpb=496076, bsz=16509.7, num_updates=57700, lr=0.000263295, gnorm=0.17, clip=0, loss_scale=4, train_wall=90, gb_free=21.7, wall=53528 epoch 035: 361 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=544244, ups=1.1, wpb=496076, bsz=16509.7, num_updates=57700, lr=0.000263295, gnorm=0.17, clip=0, loss_scale=4, train_wall=90, gb_free=21.7, wall=53528 epoch 035: 361 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=544244, ups=1.1, wpb=496076, bsz=16509.7, num_updates=57700, lr=0.000263295, gnorm=0.17, clip=0, loss_scale=4, train_wall=90, gb_free=21.7, wall=53528 epoch 035: 361 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=544244, ups=1.1, wpb=496076, bsz=16509.7, num_updates=57700, lr=0.000263295, gnorm=0.17, clip=0, loss_scale=4, train_wall=90, gb_free=21.7, wall=53528 epoch 035: 361 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=544244, ups=1.1, wpb=496076, bsz=16509.7, num_updates=57700, lr=0.000263295, gnorm=0.17, clip=0, loss_scale=4, train_wall=90, gb_free=21.7, wall=53528 epoch 035: 361 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=544244, ups=1.1, wpb=496076, bsz=16509.7, num_updates=57700, lr=0.000263295, gnorm=0.17, clip=0, loss_scale=4, train_wall=90, gb_free=21.7, wall=53528 epoch 035: 361 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=544244, ups=1.1, wpb=496076, bsz=16509.7, num_updates=57700, lr=0.000263295, gnorm=0.17, clip=0, loss_scale=4, train_wall=90, gb_free=21.7, wall=53528 epoch 035: 361 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=544244, ups=1.1, wpb=496076, bsz=16509.7, num_updates=57700, lr=0.000263295, gnorm=0.17, clip=0, loss_scale=4, train_wall=90, gb_free=21.7, wall=53528 epoch 035: 361 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=544244, ups=1.1, wpb=496076, bsz=16509.7, num_updates=57700, lr=0.000263295, gnorm=0.17, clip=0, loss_scale=4, train_wall=90, gb_free=21.7, wall=53528 epoch 035: 361 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=544244, ups=1.1, wpb=496076, bsz=16509.7, num_updates=57700, lr=0.000263295, gnorm=0.17, clip=0, loss_scale=4, train_wall=90, gb_free=21.7, wall=53528 epoch 035: 361 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=544244, ups=1.1, wpb=496076, bsz=16509.7, num_updates=57700, lr=0.000263295, gnorm=0.17, clip=0, loss_scale=4, train_wall=90, gb_free=21.7, wall=53528 epoch 035: 361 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=544244, ups=1.1, wpb=496076, bsz=16509.7, num_updates=57700, lr=0.000263295, gnorm=0.17, clip=0, loss_scale=4, train_wall=90, gb_free=21.7, wall=53528 epoch 035: 361 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=544244, ups=1.1, wpb=496076, bsz=16509.7, num_updates=57700, lr=0.000263295, gnorm=0.17, clip=0, loss_scale=4, train_wall=90, gb_free=21.7, wall=53528 epoch 035: 361 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=544244, ups=1.1, wpb=496076, bsz=16509.7, num_updates=57700, lr=0.000263295, gnorm=0.17, clip=0, loss_scale=4, train_wall=90, gb_free=21.7, wall=53528 epoch 035: 361 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=544244, ups=1.1, wpb=496076, bsz=16509.7, num_updates=57700, lr=0.000263295, gnorm=0.17, clip=0, loss_scale=4, train_wall=90, gb_free=21.7, wall=53528 epoch 035: 361 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=544244, ups=1.1, wpb=496076, bsz=16509.7, num_updates=57700, lr=0.000263295, gnorm=0.17, clip=0, loss_scale=4, train_wall=90, gb_free=21.7, wall=53528 epoch 035: 361 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=544244, ups=1.1, wpb=496076, bsz=16509.7, num_updates=57700, lr=0.000263295, gnorm=0.17, clip=0, loss_scale=4, train_wall=90, gb_free=21.7, wall=53528 epoch 035: 361 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=544244, ups=1.1, wpb=496076, bsz=16509.7, num_updates=57700, lr=0.000263295, gnorm=0.17, clip=0, loss_scale=4, train_wall=90, gb_free=21.7, wall=53528 epoch 035: 361 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=544244, ups=1.1, wpb=496076, bsz=16509.7, num_updates=57700, lr=0.000263295, gnorm=0.17, clip=0, loss_scale=4, train_wall=90, gb_free=21.7, wall=53528 epoch 035: 361 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=544244, ups=1.1, wpb=496076, bsz=16509.7, num_updates=57700, lr=0.000263295, gnorm=0.17, clip=0, loss_scale=4, train_wall=90, gb_free=21.7, wall=53528 epoch 035: 361 / 1689 loss=3.647, nll_loss=2.107, ppl=4.31, wps=544244, ups=1.1, wpb=496076, bsz=16509.7, num_updates=57700, lr=0.000263295, gnorm=0.17, clip=0, loss_scale=4, train_wall=90, gb_free=21.7, wall=53528 epoch 035: 461 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549743, ups=1.11, wpb=494277, bsz=16480.1, num_updates=57800, lr=0.000263067, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=53618 epoch 035: 461 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549743, ups=1.11, wpb=494277, bsz=16480.1, num_updates=57800, lr=0.000263067, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=53618 epoch 035: 461 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549743, ups=1.11, wpb=494277, bsz=16480.1, num_updates=57800, lr=0.000263067, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=53618 epoch 035: 461 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549743, ups=1.11, wpb=494277, bsz=16480.1, num_updates=57800, lr=0.000263067, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=53618 epoch 035: 461 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549743, ups=1.11, wpb=494277, bsz=16480.1, num_updates=57800, lr=0.000263067, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=53618 epoch 035: 461 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549743, ups=1.11, wpb=494277, bsz=16480.1, num_updates=57800, lr=0.000263067, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=53618 epoch 035: 461 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549743, ups=1.11, wpb=494277, bsz=16480.1, num_updates=57800, lr=0.000263067, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=53618 epoch 035: 461 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549743, ups=1.11, wpb=494277, bsz=16480.1, num_updates=57800, lr=0.000263067, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=53618 epoch 035: 461 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549743, ups=1.11, wpb=494277, bsz=16480.1, num_updates=57800, lr=0.000263067, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=53618 epoch 035: 461 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549743, ups=1.11, wpb=494277, bsz=16480.1, num_updates=57800, lr=0.000263067, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=53618 epoch 035: 461 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549743, ups=1.11, wpb=494277, bsz=16480.1, num_updates=57800, lr=0.000263067, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=53618 epoch 035: 461 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549743, ups=1.11, wpb=494277, bsz=16480.1, num_updates=57800, lr=0.000263067, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=53618 epoch 035: 461 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549743, ups=1.11, wpb=494277, bsz=16480.1, num_updates=57800, lr=0.000263067, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=53618 epoch 035: 461 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549743, ups=1.11, wpb=494277, bsz=16480.1, num_updates=57800, lr=0.000263067, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=53618 epoch 035: 461 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549743, ups=1.11, wpb=494277, bsz=16480.1, num_updates=57800, lr=0.000263067, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=53618 epoch 035: 461 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549743, ups=1.11, wpb=494277, bsz=16480.1, num_updates=57800, lr=0.000263067, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=53618 epoch 035: 461 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549743, ups=1.11, wpb=494277, bsz=16480.1, num_updates=57800, lr=0.000263067, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=53618 epoch 035: 461 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549743, ups=1.11, wpb=494277, bsz=16480.1, num_updates=57800, lr=0.000263067, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=53618 epoch 035: 461 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549743, ups=1.11, wpb=494277, bsz=16480.1, num_updates=57800, lr=0.000263067, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=53618 epoch 035: 461 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549743, ups=1.11, wpb=494277, bsz=16480.1, num_updates=57800, lr=0.000263067, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=53618 epoch 035: 461 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549743, ups=1.11, wpb=494277, bsz=16480.1, num_updates=57800, lr=0.000263067, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=53618 epoch 035: 461 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549743, ups=1.11, wpb=494277, bsz=16480.1, num_updates=57800, lr=0.000263067, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=53618 epoch 035: 461 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549743, ups=1.11, wpb=494277, bsz=16480.1, num_updates=57800, lr=0.000263067, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=53618 epoch 035: 461 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549743, ups=1.11, wpb=494277, bsz=16480.1, num_updates=57800, lr=0.000263067, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=53618 epoch 035: 461 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549743, ups=1.11, wpb=494277, bsz=16480.1, num_updates=57800, lr=0.000263067, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=53618 epoch 035: 461 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549743, ups=1.11, wpb=494277, bsz=16480.1, num_updates=57800, lr=0.000263067, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=53618 epoch 035: 461 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549743, ups=1.11, wpb=494277, bsz=16480.1, num_updates=57800, lr=0.000263067, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=53618 epoch 035: 461 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549743, ups=1.11, wpb=494277, bsz=16480.1, num_updates=57800, lr=0.000263067, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=53618 epoch 035: 461 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549743, ups=1.11, wpb=494277, bsz=16480.1, num_updates=57800, lr=0.000263067, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=53618 epoch 035: 461 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549743, ups=1.11, wpb=494277, bsz=16480.1, num_updates=57800, lr=0.000263067, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=53618 epoch 035: 461 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549743, ups=1.11, wpb=494277, bsz=16480.1, num_updates=57800, lr=0.000263067, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=53618 epoch 035: 461 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549743, ups=1.11, wpb=494277, bsz=16480.1, num_updates=57800, lr=0.000263067, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=53618 epoch 035: 461 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549743, ups=1.11, wpb=494277, bsz=16480.1, num_updates=57800, lr=0.000263067, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=53618 epoch 035: 461 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549743, ups=1.11, wpb=494277, bsz=16480.1, num_updates=57800, lr=0.000263067, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=53618 epoch 035: 461 / 1689 loss=3.661, nll_loss=2.123, ppl=4.36, wps=549743, ups=1.11, wpb=494277, bsz=16480.1, num_updates=57800, lr=0.000263067, gnorm=0.17, clip=0, loss_scale=4, train_wall=88, gb_free=21.7, wall=53618 epoch 035: 562 / 1689 loss=3.658, nll_loss=2.119, ppl=4.35, wps=547271, ups=1.1, wpb=495331, bsz=16275.5, num_updates=57900, lr=0.00026284, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=53708 epoch 035: 562 / 1689 loss=3.658, nll_loss=2.119, ppl=4.35, wps=547271, ups=1.1, wpb=495331, bsz=16275.5, num_updates=57900, lr=0.00026284, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=53708 epoch 035: 562 / 1689 loss=3.658, nll_loss=2.119, ppl=4.35, wps=547271, ups=1.1, wpb=495331, bsz=16275.5, num_updates=57900, lr=0.00026284, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=53708 epoch 035: 562 / 1689 loss=3.658, nll_loss=2.119, ppl=4.35, wps=547271, ups=1.1, wpb=495331, bsz=16275.5, num_updates=57900, lr=0.00026284, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=53708 epoch 035: 562 / 1689 loss=3.658, nll_loss=2.119, ppl=4.35, wps=547271, ups=1.1, wpb=495331, bsz=16275.5, num_updates=57900, lr=0.00026284, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=53708 epoch 035: 562 / 1689 loss=3.658, nll_loss=2.119, ppl=4.35, wps=547271, ups=1.1, wpb=495331, bsz=16275.5, num_updates=57900, lr=0.00026284, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=53708 epoch 035: 562 / 1689 loss=3.658, nll_loss=2.119, ppl=4.35, wps=547271, ups=1.1, wpb=495331, bsz=16275.5, num_updates=57900, lr=0.00026284, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=53708 epoch 035: 562 / 1689 loss=3.658, nll_loss=2.119, ppl=4.35, wps=547271, ups=1.1, wpb=495331, bsz=16275.5, num_updates=57900, lr=0.00026284, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=53708 epoch 035: 562 / 1689 loss=3.658, nll_loss=2.119, ppl=4.35, wps=547271, ups=1.1, wpb=495331, bsz=16275.5, num_updates=57900, lr=0.00026284, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=53708 epoch 035: 562 / 1689 loss=3.658, nll_loss=2.119, ppl=4.35, wps=547271, ups=1.1, wpb=495331, bsz=16275.5, num_updates=57900, lr=0.00026284, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=53708 epoch 035: 562 / 1689 loss=3.658, nll_loss=2.119, ppl=4.35, wps=547271, ups=1.1, wpb=495331, bsz=16275.5, num_updates=57900, lr=0.00026284, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=53708 epoch 035: 562 / 1689 loss=3.658, nll_loss=2.119, ppl=4.35, wps=547271, ups=1.1, wpb=495331, bsz=16275.5, num_updates=57900, lr=0.00026284, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=53708 epoch 035: 562 / 1689 loss=3.658, nll_loss=2.119, ppl=4.35, wps=547271, ups=1.1, wpb=495331, bsz=16275.5, num_updates=57900, lr=0.00026284, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=53708 epoch 035: 562 / 1689 loss=3.658, nll_loss=2.119, ppl=4.35, wps=547271, ups=1.1, wpb=495331, bsz=16275.5, num_updates=57900, lr=0.00026284, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=53708 epoch 035: 562 / 1689 loss=3.658, nll_loss=2.119, ppl=4.35, wps=547271, ups=1.1, wpb=495331, bsz=16275.5, num_updates=57900, lr=0.00026284, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=53708 epoch 035: 562 / 1689 loss=3.658, nll_loss=2.119, ppl=4.35, wps=547271, ups=1.1, wpb=495331, bsz=16275.5, num_updates=57900, lr=0.00026284, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=53708 epoch 035: 562 / 1689 loss=3.658, nll_loss=2.119, ppl=4.35, wps=547271, ups=1.1, wpb=495331, bsz=16275.5, num_updates=57900, lr=0.00026284, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=53708 epoch 035: 562 / 1689 loss=3.658, nll_loss=2.119, ppl=4.35, wps=547271, ups=1.1, wpb=495331, bsz=16275.5, num_updates=57900, lr=0.00026284, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=53708 epoch 035: 562 / 1689 loss=3.658, nll_loss=2.119, ppl=4.35, wps=547271, ups=1.1, wpb=495331, bsz=16275.5, num_updates=57900, lr=0.00026284, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=53708 epoch 035: 562 / 1689 loss=3.658, nll_loss=2.119, ppl=4.35, wps=547271, ups=1.1, wpb=495331, bsz=16275.5, num_updates=57900, lr=0.00026284, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=53708 epoch 035: 562 / 1689 loss=3.658, nll_loss=2.119, ppl=4.35, wps=547271, ups=1.1, wpb=495331, bsz=16275.5, num_updates=57900, lr=0.00026284, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=53708 epoch 035: 562 / 1689 loss=3.658, nll_loss=2.119, ppl=4.35, wps=547271, ups=1.1, wpb=495331, bsz=16275.5, num_updates=57900, lr=0.00026284, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=53708 epoch 035: 562 / 1689 loss=3.658, nll_loss=2.119, ppl=4.35, wps=547271, ups=1.1, wpb=495331, bsz=16275.5, num_updates=57900, lr=0.00026284, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=53708 epoch 035: 562 / 1689 loss=3.658, nll_loss=2.119, ppl=4.35, wps=547271, ups=1.1, wpb=495331, bsz=16275.5, num_updates=57900, lr=0.00026284, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=53708 epoch 035: 562 / 1689 loss=3.658, nll_loss=2.119, ppl=4.35, wps=547271, ups=1.1, wpb=495331, bsz=16275.5, num_updates=57900, lr=0.00026284, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=53708 epoch 035: 562 / 1689 loss=3.658, nll_loss=2.119, ppl=4.35, wps=547271, ups=1.1, wpb=495331, bsz=16275.5, num_updates=57900, lr=0.00026284, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=53708 epoch 035: 562 / 1689 loss=3.658, nll_loss=2.119, ppl=4.35, wps=547271, ups=1.1, wpb=495331, bsz=16275.5, num_updates=57900, lr=0.00026284, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=53708 epoch 035: 562 / 1689 loss=3.658, nll_loss=2.119, ppl=4.35, wps=547271, ups=1.1, wpb=495331, bsz=16275.5, num_updates=57900, lr=0.00026284, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=53708 epoch 035: 562 / 1689 loss=3.658, nll_loss=2.119, ppl=4.35, wps=547271, ups=1.1, wpb=495331, bsz=16275.5, num_updates=57900, lr=0.00026284, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=53708 epoch 035: 562 / 1689 loss=3.658, nll_loss=2.119, ppl=4.35, wps=547271, ups=1.1, wpb=495331, bsz=16275.5, num_updates=57900, lr=0.00026284, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=53708 epoch 035: 562 / 1689 loss=3.658, nll_loss=2.119, ppl=4.35, wps=547271, ups=1.1, wpb=495331, bsz=16275.5, num_updates=57900, lr=0.00026284, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=53708 epoch 035: 562 / 1689 loss=3.658, nll_loss=2.119, ppl=4.35, wps=547271, ups=1.1, wpb=495331, bsz=16275.5, num_updates=57900, lr=0.00026284, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=53708 epoch 035: 562 / 1689 loss=3.658, nll_loss=2.119, ppl=4.35, wps=547271, ups=1.1, wpb=495331, bsz=16275.5, num_updates=57900, lr=0.00026284, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=53708 epoch 035: 562 / 1689 loss=3.658, nll_loss=2.119, ppl=4.35, wps=547271, ups=1.1, wpb=495331, bsz=16275.5, num_updates=57900, lr=0.00026284, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=53708 epoch 035: 562 / 1689 loss=3.658, nll_loss=2.119, ppl=4.35, wps=547271, ups=1.1, wpb=495331, bsz=16275.5, num_updates=57900, lr=0.00026284, gnorm=0.169, clip=0, loss_scale=2, train_wall=89, gb_free=21, wall=53708 epoch 035: 662 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=552903, ups=1.12, wpb=494976, bsz=16308.3, num_updates=58000, lr=0.000262613, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=53798 epoch 035: 662 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=552903, ups=1.12, wpb=494976, bsz=16308.3, num_updates=58000, lr=0.000262613, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=53798 epoch 035: 662 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=552903, ups=1.12, wpb=494976, bsz=16308.3, num_updates=58000, lr=0.000262613, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=53798 epoch 035: 662 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=552903, ups=1.12, wpb=494976, bsz=16308.3, num_updates=58000, lr=0.000262613, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=53798 epoch 035: 662 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=552903, ups=1.12, wpb=494976, bsz=16308.3, num_updates=58000, lr=0.000262613, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=53798 epoch 035: 662 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=552903, ups=1.12, wpb=494976, bsz=16308.3, num_updates=58000, lr=0.000262613, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=53798 epoch 035: 662 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=552903, ups=1.12, wpb=494976, bsz=16308.3, num_updates=58000, lr=0.000262613, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=53798 epoch 035: 662 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=552903, ups=1.12, wpb=494976, bsz=16308.3, num_updates=58000, lr=0.000262613, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=53798 epoch 035: 662 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=552903, ups=1.12, wpb=494976, bsz=16308.3, num_updates=58000, lr=0.000262613, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=53798 epoch 035: 662 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=552903, ups=1.12, wpb=494976, bsz=16308.3, num_updates=58000, lr=0.000262613, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=53798 epoch 035: 662 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=552903, ups=1.12, wpb=494976, bsz=16308.3, num_updates=58000, lr=0.000262613, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=53798 epoch 035: 662 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=552903, ups=1.12, wpb=494976, bsz=16308.3, num_updates=58000, lr=0.000262613, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=53798 epoch 035: 662 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=552903, ups=1.12, wpb=494976, bsz=16308.3, num_updates=58000, lr=0.000262613, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=53798 epoch 035: 662 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=552903, ups=1.12, wpb=494976, bsz=16308.3, num_updates=58000, lr=0.000262613, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=53798 epoch 035: 662 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=552903, ups=1.12, wpb=494976, bsz=16308.3, num_updates=58000, lr=0.000262613, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=53798 epoch 035: 662 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=552903, ups=1.12, wpb=494976, bsz=16308.3, num_updates=58000, lr=0.000262613, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=53798 epoch 035: 662 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=552903, ups=1.12, wpb=494976, bsz=16308.3, num_updates=58000, lr=0.000262613, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=53798 epoch 035: 662 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=552903, ups=1.12, wpb=494976, bsz=16308.3, num_updates=58000, lr=0.000262613, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=53798 epoch 035: 662 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=552903, ups=1.12, wpb=494976, bsz=16308.3, num_updates=58000, lr=0.000262613, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=53798 epoch 035: 662 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=552903, ups=1.12, wpb=494976, bsz=16308.3, num_updates=58000, lr=0.000262613, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=53798 epoch 035: 662 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=552903, ups=1.12, wpb=494976, bsz=16308.3, num_updates=58000, lr=0.000262613, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=53798 epoch 035: 662 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=552903, ups=1.12, wpb=494976, bsz=16308.3, num_updates=58000, lr=0.000262613, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=53798 epoch 035: 662 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=552903, ups=1.12, wpb=494976, bsz=16308.3, num_updates=58000, lr=0.000262613, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=53798 epoch 035: 662 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=552903, ups=1.12, wpb=494976, bsz=16308.3, num_updates=58000, lr=0.000262613, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=53798 epoch 035: 662 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=552903, ups=1.12, wpb=494976, bsz=16308.3, num_updates=58000, lr=0.000262613, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=53798 epoch 035: 662 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=552903, ups=1.12, wpb=494976, bsz=16308.3, num_updates=58000, lr=0.000262613, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=53798 epoch 035: 662 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=552903, ups=1.12, wpb=494976, bsz=16308.3, num_updates=58000, lr=0.000262613, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=53798 epoch 035: 662 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=552903, ups=1.12, wpb=494976, bsz=16308.3, num_updates=58000, lr=0.000262613, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=53798 epoch 035: 662 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=552903, ups=1.12, wpb=494976, bsz=16308.3, num_updates=58000, lr=0.000262613, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=53798 epoch 035: 662 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=552903, ups=1.12, wpb=494976, bsz=16308.3, num_updates=58000, lr=0.000262613, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=53798 epoch 035: 662 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=552903, ups=1.12, wpb=494976, bsz=16308.3, num_updates=58000, lr=0.000262613, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=53798 epoch 035: 662 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=552903, ups=1.12, wpb=494976, bsz=16308.3, num_updates=58000, lr=0.000262613, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=53798 epoch 035: 662 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=552903, ups=1.12, wpb=494976, bsz=16308.3, num_updates=58000, lr=0.000262613, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=53798 epoch 035: 662 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=552903, ups=1.12, wpb=494976, bsz=16308.3, num_updates=58000, lr=0.000262613, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=53798 epoch 035: 662 / 1689 loss=3.665, nll_loss=2.127, ppl=4.37, wps=552903, ups=1.12, wpb=494976, bsz=16308.3, num_updates=58000, lr=0.000262613, gnorm=0.172, clip=0, loss_scale=2, train_wall=88, gb_free=22.2, wall=53798 begin validation on "valid" subset epoch 035 | valid on 'valid' subset | loss 3.701 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.701 epoch 035 | valid on 'valid' subset | loss 3.701 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.701 epoch 035 | valid on 'valid' subset | loss 3.701 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.701 epoch 035 | valid on 'valid' subset | loss 3.701 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.701 epoch 035 | valid on 'valid' subset | loss 3.701 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.701 epoch 035 | valid on 'valid' subset | loss 3.701 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.701 epoch 035 | valid on 'valid' subset | loss 3.701 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.701 epoch 035 | valid on 'valid' subset | loss 3.701 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.701 epoch 035 | valid on 'valid' subset | loss 3.701 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.701 epoch 035 | valid on 'valid' subset | loss 3.701 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.701 epoch 035 | valid on 'valid' subset | loss 3.701 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.701 epoch 035 | valid on 'valid' subset | loss 3.701 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.701 epoch 035 | valid on 'valid' subset | loss 3.701 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.701 epoch 035 | valid on 'valid' subset | loss 3.701 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.701 epoch 035 | valid on 'valid' subset | loss 3.701 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.701 epoch 035 | valid on 'valid' subset | loss 3.701 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.701 epoch 035 | valid on 'valid' subset | loss 3.701 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.701 epoch 035 | valid on 'valid' subset | loss 3.701 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.701 epoch 035 | valid on 'valid' subset | loss 3.701 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.701 epoch 035 | valid on 'valid' subset | loss 3.701 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.701 epoch 035 | valid on 'valid' subset | loss 3.701 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.701 epoch 035 | valid on 'valid' subset | loss 3.701 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.701 epoch 035 | valid on 'valid' subset | loss 3.701 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.701 epoch 035 | valid on 'valid' subset | loss 3.701 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.701 epoch 035 | valid on 'valid' subset | loss 3.701 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.701 epoch 035 | valid on 'valid' subset | loss 3.701 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.701 epoch 035 | valid on 'valid' subset | loss 3.701 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.701 epoch 035 | valid on 'valid' subset | loss 3.701 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.701 epoch 035 | valid on 'valid' subset | loss 3.701 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.701 epoch 035 | valid on 'valid' subset | loss 3.701 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.701 epoch 035 | valid on 'valid' subset | loss 3.701 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.701 epoch 035 | valid on 'valid' subset | loss 3.701 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.701 epoch 035 | valid on 'valid' subset | loss 3.701 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.701 epoch 035 | valid on 'valid' subset | loss 3.701 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.701 epoch 035 | valid on 'valid' subset | loss 3.701 | nll_loss 2.141 | ppl 4.41 | wps 0 | wpb 44526 | bsz 2008 | num_updates 58000 | best_loss 3.701 epoch 035: 762 / 1689 loss=3.652, nll_loss=2.113, ppl=4.33, wps=455999, ups=0.92, wpb=494805, bsz=16873, num_updates=58100, lr=0.000262387, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.6, wall=53906 epoch 035: 762 / 1689 loss=3.652, nll_loss=2.113, ppl=4.33, wps=455999, ups=0.92, wpb=494805, bsz=16873, num_updates=58100, lr=0.000262387, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.6, wall=53906 epoch 035: 762 / 1689 loss=3.652, nll_loss=2.113, ppl=4.33, wps=455999, ups=0.92, wpb=494805, bsz=16873, num_updates=58100, lr=0.000262387, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.6, wall=53906 epoch 035: 762 / 1689 loss=3.652, nll_loss=2.113, ppl=4.33, wps=455999, ups=0.92, wpb=494805, bsz=16873, num_updates=58100, lr=0.000262387, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.6, wall=53906 epoch 035: 762 / 1689 loss=3.652, nll_loss=2.113, ppl=4.33, wps=455999, ups=0.92, wpb=494805, bsz=16873, num_updates=58100, lr=0.000262387, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.6, wall=53906 epoch 035: 762 / 1689 loss=3.652, nll_loss=2.113, ppl=4.33, wps=455999, ups=0.92, wpb=494805, bsz=16873, num_updates=58100, lr=0.000262387, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.6, wall=53906 epoch 035: 762 / 1689 loss=3.652, nll_loss=2.113, ppl=4.33, wps=455999, ups=0.92, wpb=494805, bsz=16873, num_updates=58100, lr=0.000262387, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.6, wall=53906 epoch 035: 762 / 1689 loss=3.652, nll_loss=2.113, ppl=4.33, wps=455999, ups=0.92, wpb=494805, bsz=16873, num_updates=58100, lr=0.000262387, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.6, wall=53906 epoch 035: 762 / 1689 loss=3.652, nll_loss=2.113, ppl=4.33, wps=455999, ups=0.92, wpb=494805, bsz=16873, num_updates=58100, lr=0.000262387, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.6, wall=53906 epoch 035: 762 / 1689 loss=3.652, nll_loss=2.113, ppl=4.33, wps=455999, ups=0.92, wpb=494805, bsz=16873, num_updates=58100, lr=0.000262387, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.6, wall=53906 epoch 035: 762 / 1689 loss=3.652, nll_loss=2.113, ppl=4.33, wps=455999, ups=0.92, wpb=494805, bsz=16873, num_updates=58100, lr=0.000262387, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.6, wall=53906 epoch 035: 762 / 1689 loss=3.652, nll_loss=2.113, ppl=4.33, wps=455999, ups=0.92, wpb=494805, bsz=16873, num_updates=58100, lr=0.000262387, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.6, wall=53906 epoch 035: 762 / 1689 loss=3.652, nll_loss=2.113, ppl=4.33, wps=455999, ups=0.92, wpb=494805, bsz=16873, num_updates=58100, lr=0.000262387, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.6, wall=53906 epoch 035: 762 / 1689 loss=3.652, nll_loss=2.113, ppl=4.33, wps=455999, ups=0.92, wpb=494805, bsz=16873, num_updates=58100, lr=0.000262387, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.6, wall=53906 epoch 035: 762 / 1689 loss=3.652, nll_loss=2.113, ppl=4.33, wps=455999, ups=0.92, wpb=494805, bsz=16873, num_updates=58100, lr=0.000262387, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.6, wall=53906 epoch 035: 762 / 1689 loss=3.652, nll_loss=2.113, ppl=4.33, wps=455999, ups=0.92, wpb=494805, bsz=16873, num_updates=58100, lr=0.000262387, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.6, wall=53906 epoch 035: 762 / 1689 loss=3.652, nll_loss=2.113, ppl=4.33, wps=455999, ups=0.92, wpb=494805, bsz=16873, num_updates=58100, lr=0.000262387, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.6, wall=53906 epoch 035: 762 / 1689 loss=3.652, nll_loss=2.113, ppl=4.33, wps=455999, ups=0.92, wpb=494805, bsz=16873, num_updates=58100, lr=0.000262387, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.6, wall=53906 epoch 035: 762 / 1689 loss=3.652, nll_loss=2.113, ppl=4.33, wps=455999, ups=0.92, wpb=494805, bsz=16873, num_updates=58100, lr=0.000262387, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.6, wall=53906 epoch 035: 762 / 1689 loss=3.652, nll_loss=2.113, ppl=4.33, wps=455999, ups=0.92, wpb=494805, bsz=16873, num_updates=58100, lr=0.000262387, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.6, wall=53906 epoch 035: 762 / 1689 loss=3.652, nll_loss=2.113, ppl=4.33, wps=455999, ups=0.92, wpb=494805, bsz=16873, num_updates=58100, lr=0.000262387, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.6, wall=53906 epoch 035: 762 / 1689 loss=3.652, nll_loss=2.113, ppl=4.33, wps=455999, ups=0.92, wpb=494805, bsz=16873, num_updates=58100, lr=0.000262387, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.6, wall=53906 epoch 035: 762 / 1689 loss=3.652, nll_loss=2.113, ppl=4.33, wps=455999, ups=0.92, wpb=494805, bsz=16873, num_updates=58100, lr=0.000262387, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.6, wall=53906 epoch 035: 762 / 1689 loss=3.652, nll_loss=2.113, ppl=4.33, wps=455999, ups=0.92, wpb=494805, bsz=16873, num_updates=58100, lr=0.000262387, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.6, wall=53906 epoch 035: 762 / 1689 loss=3.652, nll_loss=2.113, ppl=4.33, wps=455999, ups=0.92, wpb=494805, bsz=16873, num_updates=58100, lr=0.000262387, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.6, wall=53906 epoch 035: 762 / 1689 loss=3.652, nll_loss=2.113, ppl=4.33, wps=455999, ups=0.92, wpb=494805, bsz=16873, num_updates=58100, lr=0.000262387, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.6, wall=53906 epoch 035: 762 / 1689 loss=3.652, nll_loss=2.113, ppl=4.33, wps=455999, ups=0.92, wpb=494805, bsz=16873, num_updates=58100, lr=0.000262387, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.6, wall=53906 epoch 035: 762 / 1689 loss=3.652, nll_loss=2.113, ppl=4.33, wps=455999, ups=0.92, wpb=494805, bsz=16873, num_updates=58100, lr=0.000262387, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.6, wall=53906 epoch 035: 762 / 1689 loss=3.652, nll_loss=2.113, ppl=4.33, wps=455999, ups=0.92, wpb=494805, bsz=16873, num_updates=58100, lr=0.000262387, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.6, wall=53906 epoch 035: 762 / 1689 loss=3.652, nll_loss=2.113, ppl=4.33, wps=455999, ups=0.92, wpb=494805, bsz=16873, num_updates=58100, lr=0.000262387, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.6, wall=53906 epoch 035: 762 / 1689 loss=3.652, nll_loss=2.113, ppl=4.33, wps=455999, ups=0.92, wpb=494805, bsz=16873, num_updates=58100, lr=0.000262387, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.6, wall=53906 epoch 035: 762 / 1689 loss=3.652, nll_loss=2.113, ppl=4.33, wps=455999, ups=0.92, wpb=494805, bsz=16873, num_updates=58100, lr=0.000262387, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.6, wall=53906 epoch 035: 762 / 1689 loss=3.652, nll_loss=2.113, ppl=4.33, wps=455999, ups=0.92, wpb=494805, bsz=16873, num_updates=58100, lr=0.000262387, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.6, wall=53906 epoch 035: 762 / 1689 loss=3.652, nll_loss=2.113, ppl=4.33, wps=455999, ups=0.92, wpb=494805, bsz=16873, num_updates=58100, lr=0.000262387, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.6, wall=53906 epoch 035: 762 / 1689 loss=3.652, nll_loss=2.113, ppl=4.33, wps=455999, ups=0.92, wpb=494805, bsz=16873, num_updates=58100, lr=0.000262387, gnorm=0.167, clip=0, loss_scale=2, train_wall=89, gb_free=22.6, wall=53906 epoch 035: 862 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=559494, ups=1.13, wpb=496372, bsz=16533.9, num_updates=58200, lr=0.000262161, gnorm=0.163, clip=0, loss_scale=2, train_wall=87, gb_free=20.5, wall=53995 epoch 035: 862 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=559494, ups=1.13, wpb=496372, bsz=16533.9, num_updates=58200, lr=0.000262161, gnorm=0.163, clip=0, loss_scale=2, train_wall=87, gb_free=20.5, wall=53995 epoch 035: 862 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=559494, ups=1.13, wpb=496372, bsz=16533.9, num_updates=58200, lr=0.000262161, gnorm=0.163, clip=0, loss_scale=2, train_wall=87, gb_free=20.5, wall=53995 epoch 035: 862 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=559494, ups=1.13, wpb=496372, bsz=16533.9, num_updates=58200, lr=0.000262161, gnorm=0.163, clip=0, loss_scale=2, train_wall=87, gb_free=20.5, wall=53995 epoch 035: 862 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=559494, ups=1.13, wpb=496372, bsz=16533.9, num_updates=58200, lr=0.000262161, gnorm=0.163, clip=0, loss_scale=2, train_wall=87, gb_free=20.5, wall=53995 epoch 035: 862 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=559494, ups=1.13, wpb=496372, bsz=16533.9, num_updates=58200, lr=0.000262161, gnorm=0.163, clip=0, loss_scale=2, train_wall=87, gb_free=20.5, wall=53995 epoch 035: 862 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=559494, ups=1.13, wpb=496372, bsz=16533.9, num_updates=58200, lr=0.000262161, gnorm=0.163, clip=0, loss_scale=2, train_wall=87, gb_free=20.5, wall=53995 epoch 035: 862 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=559494, ups=1.13, wpb=496372, bsz=16533.9, num_updates=58200, lr=0.000262161, gnorm=0.163, clip=0, loss_scale=2, train_wall=87, gb_free=20.5, wall=53995 epoch 035: 862 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=559494, ups=1.13, wpb=496372, bsz=16533.9, num_updates=58200, lr=0.000262161, gnorm=0.163, clip=0, loss_scale=2, train_wall=87, gb_free=20.5, wall=53995 epoch 035: 862 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=559494, ups=1.13, wpb=496372, bsz=16533.9, num_updates=58200, lr=0.000262161, gnorm=0.163, clip=0, loss_scale=2, train_wall=87, gb_free=20.5, wall=53995 epoch 035: 862 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=559494, ups=1.13, wpb=496372, bsz=16533.9, num_updates=58200, lr=0.000262161, gnorm=0.163, clip=0, loss_scale=2, train_wall=87, gb_free=20.5, wall=53995 epoch 035: 862 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=559494, ups=1.13, wpb=496372, bsz=16533.9, num_updates=58200, lr=0.000262161, gnorm=0.163, clip=0, loss_scale=2, train_wall=87, gb_free=20.5, wall=53995 epoch 035: 862 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=559494, ups=1.13, wpb=496372, bsz=16533.9, num_updates=58200, lr=0.000262161, gnorm=0.163, clip=0, loss_scale=2, train_wall=87, gb_free=20.5, wall=53995 epoch 035: 862 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=559494, ups=1.13, wpb=496372, bsz=16533.9, num_updates=58200, lr=0.000262161, gnorm=0.163, clip=0, loss_scale=2, train_wall=87, gb_free=20.5, wall=53995 epoch 035: 862 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=559494, ups=1.13, wpb=496372, bsz=16533.9, num_updates=58200, lr=0.000262161, gnorm=0.163, clip=0, loss_scale=2, train_wall=87, gb_free=20.5, wall=53995 epoch 035: 862 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=559494, ups=1.13, wpb=496372, bsz=16533.9, num_updates=58200, lr=0.000262161, gnorm=0.163, clip=0, loss_scale=2, train_wall=87, gb_free=20.5, wall=53995 epoch 035: 862 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=559494, ups=1.13, wpb=496372, bsz=16533.9, num_updates=58200, lr=0.000262161, gnorm=0.163, clip=0, loss_scale=2, train_wall=87, gb_free=20.5, wall=53995 epoch 035: 862 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=559494, ups=1.13, wpb=496372, bsz=16533.9, num_updates=58200, lr=0.000262161, gnorm=0.163, clip=0, loss_scale=2, train_wall=87, gb_free=20.5, wall=53995 epoch 035: 862 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=559494, ups=1.13, wpb=496372, bsz=16533.9, num_updates=58200, lr=0.000262161, gnorm=0.163, clip=0, loss_scale=2, train_wall=87, gb_free=20.5, wall=53995 epoch 035: 862 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=559494, ups=1.13, wpb=496372, bsz=16533.9, num_updates=58200, lr=0.000262161, gnorm=0.163, clip=0, loss_scale=2, train_wall=87, gb_free=20.5, wall=53995 epoch 035: 862 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=559494, ups=1.13, wpb=496372, bsz=16533.9, num_updates=58200, lr=0.000262161, gnorm=0.163, clip=0, loss_scale=2, train_wall=87, gb_free=20.5, wall=53995 epoch 035: 862 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=559494, ups=1.13, wpb=496372, bsz=16533.9, num_updates=58200, lr=0.000262161, gnorm=0.163, clip=0, loss_scale=2, train_wall=87, gb_free=20.5, wall=53995 epoch 035: 862 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=559494, ups=1.13, wpb=496372, bsz=16533.9, num_updates=58200, lr=0.000262161, gnorm=0.163, clip=0, loss_scale=2, train_wall=87, gb_free=20.5, wall=53995 epoch 035: 862 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=559494, ups=1.13, wpb=496372, bsz=16533.9, num_updates=58200, lr=0.000262161, gnorm=0.163, clip=0, loss_scale=2, train_wall=87, gb_free=20.5, wall=53995 epoch 035: 862 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=559494, ups=1.13, wpb=496372, bsz=16533.9, num_updates=58200, lr=0.000262161, gnorm=0.163, clip=0, loss_scale=2, train_wall=87, gb_free=20.5, wall=53995 epoch 035: 862 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=559494, ups=1.13, wpb=496372, bsz=16533.9, num_updates=58200, lr=0.000262161, gnorm=0.163, clip=0, loss_scale=2, train_wall=87, gb_free=20.5, wall=53995 epoch 035: 862 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=559494, ups=1.13, wpb=496372, bsz=16533.9, num_updates=58200, lr=0.000262161, gnorm=0.163, clip=0, loss_scale=2, train_wall=87, gb_free=20.5, wall=53995 epoch 035: 862 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=559494, ups=1.13, wpb=496372, bsz=16533.9, num_updates=58200, lr=0.000262161, gnorm=0.163, clip=0, loss_scale=2, train_wall=87, gb_free=20.5, wall=53995 epoch 035: 862 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=559494, ups=1.13, wpb=496372, bsz=16533.9, num_updates=58200, lr=0.000262161, gnorm=0.163, clip=0, loss_scale=2, train_wall=87, gb_free=20.5, wall=53995 epoch 035: 862 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=559494, ups=1.13, wpb=496372, bsz=16533.9, num_updates=58200, lr=0.000262161, gnorm=0.163, clip=0, loss_scale=2, train_wall=87, gb_free=20.5, wall=53995 epoch 035: 862 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=559494, ups=1.13, wpb=496372, bsz=16533.9, num_updates=58200, lr=0.000262161, gnorm=0.163, clip=0, loss_scale=2, train_wall=87, gb_free=20.5, wall=53995 epoch 035: 862 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=559494, ups=1.13, wpb=496372, bsz=16533.9, num_updates=58200, lr=0.000262161, gnorm=0.163, clip=0, loss_scale=2, train_wall=87, gb_free=20.5, wall=53995 epoch 035: 862 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=559494, ups=1.13, wpb=496372, bsz=16533.9, num_updates=58200, lr=0.000262161, gnorm=0.163, clip=0, loss_scale=2, train_wall=87, gb_free=20.5, wall=53995 epoch 035: 862 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=559494, ups=1.13, wpb=496372, bsz=16533.9, num_updates=58200, lr=0.000262161, gnorm=0.163, clip=0, loss_scale=2, train_wall=87, gb_free=20.5, wall=53995 epoch 035: 862 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=559494, ups=1.13, wpb=496372, bsz=16533.9, num_updates=58200, lr=0.000262161, gnorm=0.163, clip=0, loss_scale=2, train_wall=87, gb_free=20.5, wall=53995 epoch 035: 962 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=555459, ups=1.12, wpb=495200, bsz=16623.3, num_updates=58300, lr=0.000261936, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=54084 epoch 035: 962 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=555459, ups=1.12, wpb=495200, bsz=16623.3, num_updates=58300, lr=0.000261936, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=54084 epoch 035: 962 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=555459, ups=1.12, wpb=495200, bsz=16623.3, num_updates=58300, lr=0.000261936, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=54084 epoch 035: 962 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=555459, ups=1.12, wpb=495200, bsz=16623.3, num_updates=58300, lr=0.000261936, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=54084 epoch 035: 962 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=555459, ups=1.12, wpb=495200, bsz=16623.3, num_updates=58300, lr=0.000261936, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=54084 epoch 035: 962 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=555459, ups=1.12, wpb=495200, bsz=16623.3, num_updates=58300, lr=0.000261936, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=54084 epoch 035: 962 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=555459, ups=1.12, wpb=495200, bsz=16623.3, num_updates=58300, lr=0.000261936, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=54084 epoch 035: 962 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=555459, ups=1.12, wpb=495200, bsz=16623.3, num_updates=58300, lr=0.000261936, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=54084 epoch 035: 962 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=555459, ups=1.12, wpb=495200, bsz=16623.3, num_updates=58300, lr=0.000261936, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=54084 epoch 035: 962 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=555459, ups=1.12, wpb=495200, bsz=16623.3, num_updates=58300, lr=0.000261936, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=54084 epoch 035: 962 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=555459, ups=1.12, wpb=495200, bsz=16623.3, num_updates=58300, lr=0.000261936, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=54084 epoch 035: 962 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=555459, ups=1.12, wpb=495200, bsz=16623.3, num_updates=58300, lr=0.000261936, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=54084 epoch 035: 962 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=555459, ups=1.12, wpb=495200, bsz=16623.3, num_updates=58300, lr=0.000261936, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=54084 epoch 035: 962 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=555459, ups=1.12, wpb=495200, bsz=16623.3, num_updates=58300, lr=0.000261936, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=54084 epoch 035: 962 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=555459, ups=1.12, wpb=495200, bsz=16623.3, num_updates=58300, lr=0.000261936, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=54084 epoch 035: 962 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=555459, ups=1.12, wpb=495200, bsz=16623.3, num_updates=58300, lr=0.000261936, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=54084 epoch 035: 962 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=555459, ups=1.12, wpb=495200, bsz=16623.3, num_updates=58300, lr=0.000261936, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=54084 epoch 035: 962 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=555459, ups=1.12, wpb=495200, bsz=16623.3, num_updates=58300, lr=0.000261936, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=54084 epoch 035: 962 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=555459, ups=1.12, wpb=495200, bsz=16623.3, num_updates=58300, lr=0.000261936, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=54084 epoch 035: 962 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=555459, ups=1.12, wpb=495200, bsz=16623.3, num_updates=58300, lr=0.000261936, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=54084 epoch 035: 962 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=555459, ups=1.12, wpb=495200, bsz=16623.3, num_updates=58300, lr=0.000261936, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=54084 epoch 035: 962 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=555459, ups=1.12, wpb=495200, bsz=16623.3, num_updates=58300, lr=0.000261936, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=54084 epoch 035: 962 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=555459, ups=1.12, wpb=495200, bsz=16623.3, num_updates=58300, lr=0.000261936, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=54084 epoch 035: 962 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=555459, ups=1.12, wpb=495200, bsz=16623.3, num_updates=58300, lr=0.000261936, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=54084 epoch 035: 962 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=555459, ups=1.12, wpb=495200, bsz=16623.3, num_updates=58300, lr=0.000261936, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=54084 epoch 035: 962 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=555459, ups=1.12, wpb=495200, bsz=16623.3, num_updates=58300, lr=0.000261936, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=54084 epoch 035: 962 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=555459, ups=1.12, wpb=495200, bsz=16623.3, num_updates=58300, lr=0.000261936, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=54084 epoch 035: 962 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=555459, ups=1.12, wpb=495200, bsz=16623.3, num_updates=58300, lr=0.000261936, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=54084 epoch 035: 962 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=555459, ups=1.12, wpb=495200, bsz=16623.3, num_updates=58300, lr=0.000261936, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=54084 epoch 035: 962 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=555459, ups=1.12, wpb=495200, bsz=16623.3, num_updates=58300, lr=0.000261936, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=54084 epoch 035: 962 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=555459, ups=1.12, wpb=495200, bsz=16623.3, num_updates=58300, lr=0.000261936, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=54084 epoch 035: 962 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=555459, ups=1.12, wpb=495200, bsz=16623.3, num_updates=58300, lr=0.000261936, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=54084 epoch 035: 962 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=555459, ups=1.12, wpb=495200, bsz=16623.3, num_updates=58300, lr=0.000261936, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=54084 epoch 035: 962 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=555459, ups=1.12, wpb=495200, bsz=16623.3, num_updates=58300, lr=0.000261936, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=54084 epoch 035: 962 / 1689 loss=3.662, nll_loss=2.125, ppl=4.36, wps=555459, ups=1.12, wpb=495200, bsz=16623.3, num_updates=58300, lr=0.000261936, gnorm=0.164, clip=0, loss_scale=2, train_wall=88, gb_free=21.4, wall=54084 epoch 035: 1062 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=554174, ups=1.12, wpb=496145, bsz=16429.4, num_updates=58400, lr=0.000261712, gnorm=0.163, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=54174 epoch 035: 1062 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=554174, ups=1.12, wpb=496145, bsz=16429.4, num_updates=58400, lr=0.000261712, gnorm=0.163, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=54174 epoch 035: 1062 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=554174, ups=1.12, wpb=496145, bsz=16429.4, num_updates=58400, lr=0.000261712, gnorm=0.163, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=54174 epoch 035: 1062 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=554174, ups=1.12, wpb=496145, bsz=16429.4, num_updates=58400, lr=0.000261712, gnorm=0.163, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=54174 epoch 035: 1062 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=554174, ups=1.12, wpb=496145, bsz=16429.4, num_updates=58400, lr=0.000261712, gnorm=0.163, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=54174 epoch 035: 1062 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=554174, ups=1.12, wpb=496145, bsz=16429.4, num_updates=58400, lr=0.000261712, gnorm=0.163, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=54174 epoch 035: 1062 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=554174, ups=1.12, wpb=496145, bsz=16429.4, num_updates=58400, lr=0.000261712, gnorm=0.163, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=54174 epoch 035: 1062 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=554174, ups=1.12, wpb=496145, bsz=16429.4, num_updates=58400, lr=0.000261712, gnorm=0.163, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=54174 epoch 035: 1062 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=554174, ups=1.12, wpb=496145, bsz=16429.4, num_updates=58400, lr=0.000261712, gnorm=0.163, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=54174 epoch 035: 1062 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=554174, ups=1.12, wpb=496145, bsz=16429.4, num_updates=58400, lr=0.000261712, gnorm=0.163, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=54174 epoch 035: 1062 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=554174, ups=1.12, wpb=496145, bsz=16429.4, num_updates=58400, lr=0.000261712, gnorm=0.163, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=54174 epoch 035: 1062 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=554174, ups=1.12, wpb=496145, bsz=16429.4, num_updates=58400, lr=0.000261712, gnorm=0.163, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=54174 epoch 035: 1062 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=554174, ups=1.12, wpb=496145, bsz=16429.4, num_updates=58400, lr=0.000261712, gnorm=0.163, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=54174 epoch 035: 1062 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=554174, ups=1.12, wpb=496145, bsz=16429.4, num_updates=58400, lr=0.000261712, gnorm=0.163, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=54174 epoch 035: 1062 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=554174, ups=1.12, wpb=496145, bsz=16429.4, num_updates=58400, lr=0.000261712, gnorm=0.163, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=54174 epoch 035: 1062 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=554174, ups=1.12, wpb=496145, bsz=16429.4, num_updates=58400, lr=0.000261712, gnorm=0.163, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=54174 epoch 035: 1062 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=554174, ups=1.12, wpb=496145, bsz=16429.4, num_updates=58400, lr=0.000261712, gnorm=0.163, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=54174 epoch 035: 1062 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=554174, ups=1.12, wpb=496145, bsz=16429.4, num_updates=58400, lr=0.000261712, gnorm=0.163, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=54174 epoch 035: 1062 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=554174, ups=1.12, wpb=496145, bsz=16429.4, num_updates=58400, lr=0.000261712, gnorm=0.163, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=54174 epoch 035: 1062 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=554174, ups=1.12, wpb=496145, bsz=16429.4, num_updates=58400, lr=0.000261712, gnorm=0.163, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=54174 epoch 035: 1062 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=554174, ups=1.12, wpb=496145, bsz=16429.4, num_updates=58400, lr=0.000261712, gnorm=0.163, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=54174 epoch 035: 1062 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=554174, ups=1.12, wpb=496145, bsz=16429.4, num_updates=58400, lr=0.000261712, gnorm=0.163, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=54174 epoch 035: 1062 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=554174, ups=1.12, wpb=496145, bsz=16429.4, num_updates=58400, lr=0.000261712, gnorm=0.163, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=54174 epoch 035: 1062 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=554174, ups=1.12, wpb=496145, bsz=16429.4, num_updates=58400, lr=0.000261712, gnorm=0.163, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=54174 epoch 035: 1062 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=554174, ups=1.12, wpb=496145, bsz=16429.4, num_updates=58400, lr=0.000261712, gnorm=0.163, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=54174 epoch 035: 1062 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=554174, ups=1.12, wpb=496145, bsz=16429.4, num_updates=58400, lr=0.000261712, gnorm=0.163, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=54174 epoch 035: 1062 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=554174, ups=1.12, wpb=496145, bsz=16429.4, num_updates=58400, lr=0.000261712, gnorm=0.163, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=54174 epoch 035: 1062 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=554174, ups=1.12, wpb=496145, bsz=16429.4, num_updates=58400, lr=0.000261712, gnorm=0.163, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=54174 epoch 035: 1062 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=554174, ups=1.12, wpb=496145, bsz=16429.4, num_updates=58400, lr=0.000261712, gnorm=0.163, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=54174 epoch 035: 1062 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=554174, ups=1.12, wpb=496145, bsz=16429.4, num_updates=58400, lr=0.000261712, gnorm=0.163, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=54174 epoch 035: 1062 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=554174, ups=1.12, wpb=496145, bsz=16429.4, num_updates=58400, lr=0.000261712, gnorm=0.163, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=54174 epoch 035: 1062 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=554174, ups=1.12, wpb=496145, bsz=16429.4, num_updates=58400, lr=0.000261712, gnorm=0.163, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=54174 epoch 035: 1062 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=554174, ups=1.12, wpb=496145, bsz=16429.4, num_updates=58400, lr=0.000261712, gnorm=0.163, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=54174 epoch 035: 1062 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=554174, ups=1.12, wpb=496145, bsz=16429.4, num_updates=58400, lr=0.000261712, gnorm=0.163, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=54174 epoch 035: 1062 / 1689 loss=3.657, nll_loss=2.119, ppl=4.34, wps=554174, ups=1.12, wpb=496145, bsz=16429.4, num_updates=58400, lr=0.000261712, gnorm=0.163, clip=0, loss_scale=4, train_wall=89, gb_free=21.4, wall=54174 epoch 035: 1163 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=552338, ups=1.11, wpb=496264, bsz=16356.6, num_updates=58500, lr=0.000261488, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=54264 epoch 035: 1163 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=552338, ups=1.11, wpb=496264, bsz=16356.6, num_updates=58500, lr=0.000261488, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=54264 epoch 035: 1163 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=552338, ups=1.11, wpb=496264, bsz=16356.6, num_updates=58500, lr=0.000261488, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=54264 epoch 035: 1163 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=552338, ups=1.11, wpb=496264, bsz=16356.6, num_updates=58500, lr=0.000261488, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=54264 epoch 035: 1163 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=552338, ups=1.11, wpb=496264, bsz=16356.6, num_updates=58500, lr=0.000261488, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=54264 epoch 035: 1163 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=552338, ups=1.11, wpb=496264, bsz=16356.6, num_updates=58500, lr=0.000261488, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=54264 epoch 035: 1163 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=552338, ups=1.11, wpb=496264, bsz=16356.6, num_updates=58500, lr=0.000261488, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=54264 epoch 035: 1163 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=552338, ups=1.11, wpb=496264, bsz=16356.6, num_updates=58500, lr=0.000261488, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=54264 epoch 035: 1163 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=552338, ups=1.11, wpb=496264, bsz=16356.6, num_updates=58500, lr=0.000261488, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=54264 epoch 035: 1163 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=552338, ups=1.11, wpb=496264, bsz=16356.6, num_updates=58500, lr=0.000261488, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=54264 epoch 035: 1163 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=552338, ups=1.11, wpb=496264, bsz=16356.6, num_updates=58500, lr=0.000261488, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=54264 epoch 035: 1163 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=552338, ups=1.11, wpb=496264, bsz=16356.6, num_updates=58500, lr=0.000261488, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=54264 epoch 035: 1163 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=552338, ups=1.11, wpb=496264, bsz=16356.6, num_updates=58500, lr=0.000261488, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=54264 epoch 035: 1163 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=552338, ups=1.11, wpb=496264, bsz=16356.6, num_updates=58500, lr=0.000261488, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=54264 epoch 035: 1163 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=552338, ups=1.11, wpb=496264, bsz=16356.6, num_updates=58500, lr=0.000261488, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=54264 epoch 035: 1163 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=552338, ups=1.11, wpb=496264, bsz=16356.6, num_updates=58500, lr=0.000261488, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=54264 epoch 035: 1163 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=552338, ups=1.11, wpb=496264, bsz=16356.6, num_updates=58500, lr=0.000261488, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=54264 epoch 035: 1163 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=552338, ups=1.11, wpb=496264, bsz=16356.6, num_updates=58500, lr=0.000261488, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=54264 epoch 035: 1163 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=552338, ups=1.11, wpb=496264, bsz=16356.6, num_updates=58500, lr=0.000261488, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=54264 epoch 035: 1163 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=552338, ups=1.11, wpb=496264, bsz=16356.6, num_updates=58500, lr=0.000261488, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=54264 epoch 035: 1163 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=552338, ups=1.11, wpb=496264, bsz=16356.6, num_updates=58500, lr=0.000261488, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=54264 epoch 035: 1163 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=552338, ups=1.11, wpb=496264, bsz=16356.6, num_updates=58500, lr=0.000261488, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=54264 epoch 035: 1163 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=552338, ups=1.11, wpb=496264, bsz=16356.6, num_updates=58500, lr=0.000261488, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=54264 epoch 035: 1163 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=552338, ups=1.11, wpb=496264, bsz=16356.6, num_updates=58500, lr=0.000261488, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=54264 epoch 035: 1163 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=552338, ups=1.11, wpb=496264, bsz=16356.6, num_updates=58500, lr=0.000261488, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=54264 epoch 035: 1163 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=552338, ups=1.11, wpb=496264, bsz=16356.6, num_updates=58500, lr=0.000261488, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=54264 epoch 035: 1163 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=552338, ups=1.11, wpb=496264, bsz=16356.6, num_updates=58500, lr=0.000261488, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=54264 epoch 035: 1163 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=552338, ups=1.11, wpb=496264, bsz=16356.6, num_updates=58500, lr=0.000261488, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=54264 epoch 035: 1163 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=552338, ups=1.11, wpb=496264, bsz=16356.6, num_updates=58500, lr=0.000261488, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=54264 epoch 035: 1163 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=552338, ups=1.11, wpb=496264, bsz=16356.6, num_updates=58500, lr=0.000261488, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=54264 epoch 035: 1163 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=552338, ups=1.11, wpb=496264, bsz=16356.6, num_updates=58500, lr=0.000261488, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=54264 epoch 035: 1163 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=552338, ups=1.11, wpb=496264, bsz=16356.6, num_updates=58500, lr=0.000261488, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=54264 epoch 035: 1163 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=552338, ups=1.11, wpb=496264, bsz=16356.6, num_updates=58500, lr=0.000261488, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=54264 epoch 035: 1163 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=552338, ups=1.11, wpb=496264, bsz=16356.6, num_updates=58500, lr=0.000261488, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=54264 epoch 035: 1163 / 1689 loss=3.664, nll_loss=2.127, ppl=4.37, wps=552338, ups=1.11, wpb=496264, bsz=16356.6, num_updates=58500, lr=0.000261488, gnorm=0.168, clip=0, loss_scale=2, train_wall=89, gb_free=22.2, wall=54264 epoch 035: 1263 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=557116, ups=1.13, wpb=494446, bsz=16306.8, num_updates=58600, lr=0.000261265, gnorm=0.178, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=54352 epoch 035: 1263 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=557116, ups=1.13, wpb=494446, bsz=16306.8, num_updates=58600, lr=0.000261265, gnorm=0.178, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=54352 epoch 035: 1263 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=557116, ups=1.13, wpb=494446, bsz=16306.8, num_updates=58600, lr=0.000261265, gnorm=0.178, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=54352 epoch 035: 1263 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=557116, ups=1.13, wpb=494446, bsz=16306.8, num_updates=58600, lr=0.000261265, gnorm=0.178, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=54352 epoch 035: 1263 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=557116, ups=1.13, wpb=494446, bsz=16306.8, num_updates=58600, lr=0.000261265, gnorm=0.178, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=54352 epoch 035: 1263 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=557116, ups=1.13, wpb=494446, bsz=16306.8, num_updates=58600, lr=0.000261265, gnorm=0.178, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=54352 epoch 035: 1263 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=557116, ups=1.13, wpb=494446, bsz=16306.8, num_updates=58600, lr=0.000261265, gnorm=0.178, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=54352 epoch 035: 1263 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=557116, ups=1.13, wpb=494446, bsz=16306.8, num_updates=58600, lr=0.000261265, gnorm=0.178, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=54352 epoch 035: 1263 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=557116, ups=1.13, wpb=494446, bsz=16306.8, num_updates=58600, lr=0.000261265, gnorm=0.178, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=54352 epoch 035: 1263 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=557116, ups=1.13, wpb=494446, bsz=16306.8, num_updates=58600, lr=0.000261265, gnorm=0.178, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=54352 epoch 035: 1263 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=557116, ups=1.13, wpb=494446, bsz=16306.8, num_updates=58600, lr=0.000261265, gnorm=0.178, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=54352 epoch 035: 1263 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=557116, ups=1.13, wpb=494446, bsz=16306.8, num_updates=58600, lr=0.000261265, gnorm=0.178, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=54352 epoch 035: 1263 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=557116, ups=1.13, wpb=494446, bsz=16306.8, num_updates=58600, lr=0.000261265, gnorm=0.178, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=54352 epoch 035: 1263 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=557116, ups=1.13, wpb=494446, bsz=16306.8, num_updates=58600, lr=0.000261265, gnorm=0.178, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=54352 epoch 035: 1263 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=557116, ups=1.13, wpb=494446, bsz=16306.8, num_updates=58600, lr=0.000261265, gnorm=0.178, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=54352 epoch 035: 1263 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=557116, ups=1.13, wpb=494446, bsz=16306.8, num_updates=58600, lr=0.000261265, gnorm=0.178, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=54352 epoch 035: 1263 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=557116, ups=1.13, wpb=494446, bsz=16306.8, num_updates=58600, lr=0.000261265, gnorm=0.178, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=54352 epoch 035: 1263 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=557116, ups=1.13, wpb=494446, bsz=16306.8, num_updates=58600, lr=0.000261265, gnorm=0.178, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=54352 epoch 035: 1263 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=557116, ups=1.13, wpb=494446, bsz=16306.8, num_updates=58600, lr=0.000261265, gnorm=0.178, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=54352 epoch 035: 1263 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=557116, ups=1.13, wpb=494446, bsz=16306.8, num_updates=58600, lr=0.000261265, gnorm=0.178, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=54352 epoch 035: 1263 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=557116, ups=1.13, wpb=494446, bsz=16306.8, num_updates=58600, lr=0.000261265, gnorm=0.178, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=54352 epoch 035: 1263 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=557116, ups=1.13, wpb=494446, bsz=16306.8, num_updates=58600, lr=0.000261265, gnorm=0.178, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=54352 epoch 035: 1263 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=557116, ups=1.13, wpb=494446, bsz=16306.8, num_updates=58600, lr=0.000261265, gnorm=0.178, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=54352 epoch 035: 1263 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=557116, ups=1.13, wpb=494446, bsz=16306.8, num_updates=58600, lr=0.000261265, gnorm=0.178, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=54352 epoch 035: 1263 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=557116, ups=1.13, wpb=494446, bsz=16306.8, num_updates=58600, lr=0.000261265, gnorm=0.178, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=54352 epoch 035: 1263 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=557116, ups=1.13, wpb=494446, bsz=16306.8, num_updates=58600, lr=0.000261265, gnorm=0.178, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=54352 epoch 035: 1263 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=557116, ups=1.13, wpb=494446, bsz=16306.8, num_updates=58600, lr=0.000261265, gnorm=0.178, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=54352 epoch 035: 1263 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=557116, ups=1.13, wpb=494446, bsz=16306.8, num_updates=58600, lr=0.000261265, gnorm=0.178, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=54352 epoch 035: 1263 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=557116, ups=1.13, wpb=494446, bsz=16306.8, num_updates=58600, lr=0.000261265, gnorm=0.178, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=54352 epoch 035: 1263 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=557116, ups=1.13, wpb=494446, bsz=16306.8, num_updates=58600, lr=0.000261265, gnorm=0.178, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=54352 epoch 035: 1263 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=557116, ups=1.13, wpb=494446, bsz=16306.8, num_updates=58600, lr=0.000261265, gnorm=0.178, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=54352 epoch 035: 1263 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=557116, ups=1.13, wpb=494446, bsz=16306.8, num_updates=58600, lr=0.000261265, gnorm=0.178, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=54352 epoch 035: 1263 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=557116, ups=1.13, wpb=494446, bsz=16306.8, num_updates=58600, lr=0.000261265, gnorm=0.178, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=54352 epoch 035: 1263 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=557116, ups=1.13, wpb=494446, bsz=16306.8, num_updates=58600, lr=0.000261265, gnorm=0.178, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=54352 epoch 035: 1263 / 1689 loss=3.653, nll_loss=2.114, ppl=4.33, wps=557116, ups=1.13, wpb=494446, bsz=16306.8, num_updates=58600, lr=0.000261265, gnorm=0.178, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=54352 epoch 035: 1363 / 1689 loss=3.663, nll_loss=2.126, ppl=4.36, wps=551187, ups=1.12, wpb=493758, bsz=16426.3, num_updates=58700, lr=0.000261042, gnorm=0.157, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=54442 epoch 035: 1363 / 1689 loss=3.663, nll_loss=2.126, ppl=4.36, wps=551187, ups=1.12, wpb=493758, bsz=16426.3, num_updates=58700, lr=0.000261042, gnorm=0.157, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=54442 epoch 035: 1363 / 1689 loss=3.663, nll_loss=2.126, ppl=4.36, wps=551187, ups=1.12, wpb=493758, bsz=16426.3, num_updates=58700, lr=0.000261042, gnorm=0.157, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=54442 epoch 035: 1363 / 1689 loss=3.663, nll_loss=2.126, ppl=4.36, wps=551187, ups=1.12, wpb=493758, bsz=16426.3, num_updates=58700, lr=0.000261042, gnorm=0.157, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=54442 epoch 035: 1363 / 1689 loss=3.663, nll_loss=2.126, ppl=4.36, wps=551187, ups=1.12, wpb=493758, bsz=16426.3, num_updates=58700, lr=0.000261042, gnorm=0.157, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=54442 epoch 035: 1363 / 1689 loss=3.663, nll_loss=2.126, ppl=4.36, wps=551187, ups=1.12, wpb=493758, bsz=16426.3, num_updates=58700, lr=0.000261042, gnorm=0.157, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=54442 epoch 035: 1363 / 1689 loss=3.663, nll_loss=2.126, ppl=4.36, wps=551187, ups=1.12, wpb=493758, bsz=16426.3, num_updates=58700, lr=0.000261042, gnorm=0.157, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=54442 epoch 035: 1363 / 1689 loss=3.663, nll_loss=2.126, ppl=4.36, wps=551187, ups=1.12, wpb=493758, bsz=16426.3, num_updates=58700, lr=0.000261042, gnorm=0.157, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=54442 epoch 035: 1363 / 1689 loss=3.663, nll_loss=2.126, ppl=4.36, wps=551187, ups=1.12, wpb=493758, bsz=16426.3, num_updates=58700, lr=0.000261042, gnorm=0.157, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=54442 epoch 035: 1363 / 1689 loss=3.663, nll_loss=2.126, ppl=4.36, wps=551187, ups=1.12, wpb=493758, bsz=16426.3, num_updates=58700, lr=0.000261042, gnorm=0.157, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=54442 epoch 035: 1363 / 1689 loss=3.663, nll_loss=2.126, ppl=4.36, wps=551187, ups=1.12, wpb=493758, bsz=16426.3, num_updates=58700, lr=0.000261042, gnorm=0.157, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=54442 epoch 035: 1363 / 1689 loss=3.663, nll_loss=2.126, ppl=4.36, wps=551187, ups=1.12, wpb=493758, bsz=16426.3, num_updates=58700, lr=0.000261042, gnorm=0.157, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=54442 epoch 035: 1363 / 1689 loss=3.663, nll_loss=2.126, ppl=4.36, wps=551187, ups=1.12, wpb=493758, bsz=16426.3, num_updates=58700, lr=0.000261042, gnorm=0.157, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=54442 epoch 035: 1363 / 1689 loss=3.663, nll_loss=2.126, ppl=4.36, wps=551187, ups=1.12, wpb=493758, bsz=16426.3, num_updates=58700, lr=0.000261042, gnorm=0.157, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=54442 epoch 035: 1363 / 1689 loss=3.663, nll_loss=2.126, ppl=4.36, wps=551187, ups=1.12, wpb=493758, bsz=16426.3, num_updates=58700, lr=0.000261042, gnorm=0.157, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=54442 epoch 035: 1363 / 1689 loss=3.663, nll_loss=2.126, ppl=4.36, wps=551187, ups=1.12, wpb=493758, bsz=16426.3, num_updates=58700, lr=0.000261042, gnorm=0.157, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=54442 epoch 035: 1363 / 1689 loss=3.663, nll_loss=2.126, ppl=4.36, wps=551187, ups=1.12, wpb=493758, bsz=16426.3, num_updates=58700, lr=0.000261042, gnorm=0.157, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=54442 epoch 035: 1363 / 1689 loss=3.663, nll_loss=2.126, ppl=4.36, wps=551187, ups=1.12, wpb=493758, bsz=16426.3, num_updates=58700, lr=0.000261042, gnorm=0.157, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=54442 epoch 035: 1363 / 1689 loss=3.663, nll_loss=2.126, ppl=4.36, wps=551187, ups=1.12, wpb=493758, bsz=16426.3, num_updates=58700, lr=0.000261042, gnorm=0.157, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=54442 epoch 035: 1363 / 1689 loss=3.663, nll_loss=2.126, ppl=4.36, wps=551187, ups=1.12, wpb=493758, bsz=16426.3, num_updates=58700, lr=0.000261042, gnorm=0.157, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=54442 epoch 035: 1363 / 1689 loss=3.663, nll_loss=2.126, ppl=4.36, wps=551187, ups=1.12, wpb=493758, bsz=16426.3, num_updates=58700, lr=0.000261042, gnorm=0.157, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=54442 epoch 035: 1363 / 1689 loss=3.663, nll_loss=2.126, ppl=4.36, wps=551187, ups=1.12, wpb=493758, bsz=16426.3, num_updates=58700, lr=0.000261042, gnorm=0.157, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=54442 epoch 035: 1363 / 1689 loss=3.663, nll_loss=2.126, ppl=4.36, wps=551187, ups=1.12, wpb=493758, bsz=16426.3, num_updates=58700, lr=0.000261042, gnorm=0.157, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=54442 epoch 035: 1363 / 1689 loss=3.663, nll_loss=2.126, ppl=4.36, wps=551187, ups=1.12, wpb=493758, bsz=16426.3, num_updates=58700, lr=0.000261042, gnorm=0.157, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=54442 epoch 035: 1363 / 1689 loss=3.663, nll_loss=2.126, ppl=4.36, wps=551187, ups=1.12, wpb=493758, bsz=16426.3, num_updates=58700, lr=0.000261042, gnorm=0.157, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=54442 epoch 035: 1363 / 1689 loss=3.663, nll_loss=2.126, ppl=4.36, wps=551187, ups=1.12, wpb=493758, bsz=16426.3, num_updates=58700, lr=0.000261042, gnorm=0.157, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=54442 epoch 035: 1363 / 1689 loss=3.663, nll_loss=2.126, ppl=4.36, wps=551187, ups=1.12, wpb=493758, bsz=16426.3, num_updates=58700, lr=0.000261042, gnorm=0.157, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=54442 epoch 035: 1363 / 1689 loss=3.663, nll_loss=2.126, ppl=4.36, wps=551187, ups=1.12, wpb=493758, bsz=16426.3, num_updates=58700, lr=0.000261042, gnorm=0.157, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=54442 epoch 035: 1363 / 1689 loss=3.663, nll_loss=2.126, ppl=4.36, wps=551187, ups=1.12, wpb=493758, bsz=16426.3, num_updates=58700, lr=0.000261042, gnorm=0.157, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=54442 epoch 035: 1363 / 1689 loss=3.663, nll_loss=2.126, ppl=4.36, wps=551187, ups=1.12, wpb=493758, bsz=16426.3, num_updates=58700, lr=0.000261042, gnorm=0.157, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=54442 epoch 035: 1363 / 1689 loss=3.663, nll_loss=2.126, ppl=4.36, wps=551187, ups=1.12, wpb=493758, bsz=16426.3, num_updates=58700, lr=0.000261042, gnorm=0.157, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=54442 epoch 035: 1363 / 1689 loss=3.663, nll_loss=2.126, ppl=4.36, wps=551187, ups=1.12, wpb=493758, bsz=16426.3, num_updates=58700, lr=0.000261042, gnorm=0.157, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=54442 epoch 035: 1363 / 1689 loss=3.663, nll_loss=2.126, ppl=4.36, wps=551187, ups=1.12, wpb=493758, bsz=16426.3, num_updates=58700, lr=0.000261042, gnorm=0.157, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=54442 epoch 035: 1363 / 1689 loss=3.663, nll_loss=2.126, ppl=4.36, wps=551187, ups=1.12, wpb=493758, bsz=16426.3, num_updates=58700, lr=0.000261042, gnorm=0.157, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=54442 epoch 035: 1363 / 1689 loss=3.663, nll_loss=2.126, ppl=4.36, wps=551187, ups=1.12, wpb=493758, bsz=16426.3, num_updates=58700, lr=0.000261042, gnorm=0.157, clip=0, loss_scale=2, train_wall=89, gb_free=21.9, wall=54442 epoch 035: 1463 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=555366, ups=1.12, wpb=496211, bsz=16632.7, num_updates=58800, lr=0.00026082, gnorm=0.154, clip=0, loss_scale=2, train_wall=88, gb_free=18.8, wall=54531 epoch 035: 1463 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=555366, ups=1.12, wpb=496211, bsz=16632.7, num_updates=58800, lr=0.00026082, gnorm=0.154, clip=0, loss_scale=2, train_wall=88, gb_free=18.8, wall=54531 epoch 035: 1463 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=555366, ups=1.12, wpb=496211, bsz=16632.7, num_updates=58800, lr=0.00026082, gnorm=0.154, clip=0, loss_scale=2, train_wall=88, gb_free=18.8, wall=54531 epoch 035: 1463 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=555366, ups=1.12, wpb=496211, bsz=16632.7, num_updates=58800, lr=0.00026082, gnorm=0.154, clip=0, loss_scale=2, train_wall=88, gb_free=18.8, wall=54531 epoch 035: 1463 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=555366, ups=1.12, wpb=496211, bsz=16632.7, num_updates=58800, lr=0.00026082, gnorm=0.154, clip=0, loss_scale=2, train_wall=88, gb_free=18.8, wall=54531 epoch 035: 1463 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=555366, ups=1.12, wpb=496211, bsz=16632.7, num_updates=58800, lr=0.00026082, gnorm=0.154, clip=0, loss_scale=2, train_wall=88, gb_free=18.8, wall=54531 epoch 035: 1463 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=555366, ups=1.12, wpb=496211, bsz=16632.7, num_updates=58800, lr=0.00026082, gnorm=0.154, clip=0, loss_scale=2, train_wall=88, gb_free=18.8, wall=54531 epoch 035: 1463 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=555366, ups=1.12, wpb=496211, bsz=16632.7, num_updates=58800, lr=0.00026082, gnorm=0.154, clip=0, loss_scale=2, train_wall=88, gb_free=18.8, wall=54531 epoch 035: 1463 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=555366, ups=1.12, wpb=496211, bsz=16632.7, num_updates=58800, lr=0.00026082, gnorm=0.154, clip=0, loss_scale=2, train_wall=88, gb_free=18.8, wall=54531 epoch 035: 1463 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=555366, ups=1.12, wpb=496211, bsz=16632.7, num_updates=58800, lr=0.00026082, gnorm=0.154, clip=0, loss_scale=2, train_wall=88, gb_free=18.8, wall=54531 epoch 035: 1463 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=555366, ups=1.12, wpb=496211, bsz=16632.7, num_updates=58800, lr=0.00026082, gnorm=0.154, clip=0, loss_scale=2, train_wall=88, gb_free=18.8, wall=54531 epoch 035: 1463 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=555366, ups=1.12, wpb=496211, bsz=16632.7, num_updates=58800, lr=0.00026082, gnorm=0.154, clip=0, loss_scale=2, train_wall=88, gb_free=18.8, wall=54531 epoch 035: 1463 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=555366, ups=1.12, wpb=496211, bsz=16632.7, num_updates=58800, lr=0.00026082, gnorm=0.154, clip=0, loss_scale=2, train_wall=88, gb_free=18.8, wall=54531 epoch 035: 1463 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=555366, ups=1.12, wpb=496211, bsz=16632.7, num_updates=58800, lr=0.00026082, gnorm=0.154, clip=0, loss_scale=2, train_wall=88, gb_free=18.8, wall=54531 epoch 035: 1463 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=555366, ups=1.12, wpb=496211, bsz=16632.7, num_updates=58800, lr=0.00026082, gnorm=0.154, clip=0, loss_scale=2, train_wall=88, gb_free=18.8, wall=54531 epoch 035: 1463 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=555366, ups=1.12, wpb=496211, bsz=16632.7, num_updates=58800, lr=0.00026082, gnorm=0.154, clip=0, loss_scale=2, train_wall=88, gb_free=18.8, wall=54531 epoch 035: 1463 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=555366, ups=1.12, wpb=496211, bsz=16632.7, num_updates=58800, lr=0.00026082, gnorm=0.154, clip=0, loss_scale=2, train_wall=88, gb_free=18.8, wall=54531 epoch 035: 1463 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=555366, ups=1.12, wpb=496211, bsz=16632.7, num_updates=58800, lr=0.00026082, gnorm=0.154, clip=0, loss_scale=2, train_wall=88, gb_free=18.8, wall=54531 epoch 035: 1463 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=555366, ups=1.12, wpb=496211, bsz=16632.7, num_updates=58800, lr=0.00026082, gnorm=0.154, clip=0, loss_scale=2, train_wall=88, gb_free=18.8, wall=54531 epoch 035: 1463 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=555366, ups=1.12, wpb=496211, bsz=16632.7, num_updates=58800, lr=0.00026082, gnorm=0.154, clip=0, loss_scale=2, train_wall=88, gb_free=18.8, wall=54531 epoch 035: 1463 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=555366, ups=1.12, wpb=496211, bsz=16632.7, num_updates=58800, lr=0.00026082, gnorm=0.154, clip=0, loss_scale=2, train_wall=88, gb_free=18.8, wall=54531 epoch 035: 1463 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=555366, ups=1.12, wpb=496211, bsz=16632.7, num_updates=58800, lr=0.00026082, gnorm=0.154, clip=0, loss_scale=2, train_wall=88, gb_free=18.8, wall=54531 epoch 035: 1463 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=555366, ups=1.12, wpb=496211, bsz=16632.7, num_updates=58800, lr=0.00026082, gnorm=0.154, clip=0, loss_scale=2, train_wall=88, gb_free=18.8, wall=54531 epoch 035: 1463 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=555366, ups=1.12, wpb=496211, bsz=16632.7, num_updates=58800, lr=0.00026082, gnorm=0.154, clip=0, loss_scale=2, train_wall=88, gb_free=18.8, wall=54531 epoch 035: 1463 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=555366, ups=1.12, wpb=496211, bsz=16632.7, num_updates=58800, lr=0.00026082, gnorm=0.154, clip=0, loss_scale=2, train_wall=88, gb_free=18.8, wall=54531 epoch 035: 1463 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=555366, ups=1.12, wpb=496211, bsz=16632.7, num_updates=58800, lr=0.00026082, gnorm=0.154, clip=0, loss_scale=2, train_wall=88, gb_free=18.8, wall=54531 epoch 035: 1463 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=555366, ups=1.12, wpb=496211, bsz=16632.7, num_updates=58800, lr=0.00026082, gnorm=0.154, clip=0, loss_scale=2, train_wall=88, gb_free=18.8, wall=54531 epoch 035: 1463 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=555366, ups=1.12, wpb=496211, bsz=16632.7, num_updates=58800, lr=0.00026082, gnorm=0.154, clip=0, loss_scale=2, train_wall=88, gb_free=18.8, wall=54531 epoch 035: 1463 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=555366, ups=1.12, wpb=496211, bsz=16632.7, num_updates=58800, lr=0.00026082, gnorm=0.154, clip=0, loss_scale=2, train_wall=88, gb_free=18.8, wall=54531 epoch 035: 1463 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=555366, ups=1.12, wpb=496211, bsz=16632.7, num_updates=58800, lr=0.00026082, gnorm=0.154, clip=0, loss_scale=2, train_wall=88, gb_free=18.8, wall=54531 epoch 035: 1463 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=555366, ups=1.12, wpb=496211, bsz=16632.7, num_updates=58800, lr=0.00026082, gnorm=0.154, clip=0, loss_scale=2, train_wall=88, gb_free=18.8, wall=54531 epoch 035: 1463 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=555366, ups=1.12, wpb=496211, bsz=16632.7, num_updates=58800, lr=0.00026082, gnorm=0.154, clip=0, loss_scale=2, train_wall=88, gb_free=18.8, wall=54531 epoch 035: 1463 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=555366, ups=1.12, wpb=496211, bsz=16632.7, num_updates=58800, lr=0.00026082, gnorm=0.154, clip=0, loss_scale=2, train_wall=88, gb_free=18.8, wall=54531 epoch 035: 1463 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=555366, ups=1.12, wpb=496211, bsz=16632.7, num_updates=58800, lr=0.00026082, gnorm=0.154, clip=0, loss_scale=2, train_wall=88, gb_free=18.8, wall=54531 epoch 035: 1463 / 1689 loss=3.66, nll_loss=2.122, ppl=4.35, wps=555366, ups=1.12, wpb=496211, bsz=16632.7, num_updates=58800, lr=0.00026082, gnorm=0.154, clip=0, loss_scale=2, train_wall=88, gb_free=18.8, wall=54531 epoch 035: 1563 / 1689 loss=3.663, nll_loss=2.126, ppl=4.37, wps=550879, ups=1.11, wpb=495221, bsz=16807.2, num_updates=58900, lr=0.000260599, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=54621 epoch 035: 1563 / 1689 loss=3.663, nll_loss=2.126, ppl=4.37, wps=550879, ups=1.11, wpb=495221, bsz=16807.2, num_updates=58900, lr=0.000260599, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=54621 epoch 035: 1563 / 1689 loss=3.663, nll_loss=2.126, ppl=4.37, wps=550879, ups=1.11, wpb=495221, bsz=16807.2, num_updates=58900, lr=0.000260599, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=54621 epoch 035: 1563 / 1689 loss=3.663, nll_loss=2.126, ppl=4.37, wps=550879, ups=1.11, wpb=495221, bsz=16807.2, num_updates=58900, lr=0.000260599, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=54621 epoch 035: 1563 / 1689 loss=3.663, nll_loss=2.126, ppl=4.37, wps=550879, ups=1.11, wpb=495221, bsz=16807.2, num_updates=58900, lr=0.000260599, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=54621 epoch 035: 1563 / 1689 loss=3.663, nll_loss=2.126, ppl=4.37, wps=550879, ups=1.11, wpb=495221, bsz=16807.2, num_updates=58900, lr=0.000260599, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=54621 epoch 035: 1563 / 1689 loss=3.663, nll_loss=2.126, ppl=4.37, wps=550879, ups=1.11, wpb=495221, bsz=16807.2, num_updates=58900, lr=0.000260599, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=54621 epoch 035: 1563 / 1689 loss=3.663, nll_loss=2.126, ppl=4.37, wps=550879, ups=1.11, wpb=495221, bsz=16807.2, num_updates=58900, lr=0.000260599, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=54621 epoch 035: 1563 / 1689 loss=3.663, nll_loss=2.126, ppl=4.37, wps=550879, ups=1.11, wpb=495221, bsz=16807.2, num_updates=58900, lr=0.000260599, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=54621 epoch 035: 1563 / 1689 loss=3.663, nll_loss=2.126, ppl=4.37, wps=550879, ups=1.11, wpb=495221, bsz=16807.2, num_updates=58900, lr=0.000260599, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=54621 epoch 035: 1563 / 1689 loss=3.663, nll_loss=2.126, ppl=4.37, wps=550879, ups=1.11, wpb=495221, bsz=16807.2, num_updates=58900, lr=0.000260599, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=54621 epoch 035: 1563 / 1689 loss=3.663, nll_loss=2.126, ppl=4.37, wps=550879, ups=1.11, wpb=495221, bsz=16807.2, num_updates=58900, lr=0.000260599, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=54621 epoch 035: 1563 / 1689 loss=3.663, nll_loss=2.126, ppl=4.37, wps=550879, ups=1.11, wpb=495221, bsz=16807.2, num_updates=58900, lr=0.000260599, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=54621 epoch 035: 1563 / 1689 loss=3.663, nll_loss=2.126, ppl=4.37, wps=550879, ups=1.11, wpb=495221, bsz=16807.2, num_updates=58900, lr=0.000260599, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=54621 epoch 035: 1563 / 1689 loss=3.663, nll_loss=2.126, ppl=4.37, wps=550879, ups=1.11, wpb=495221, bsz=16807.2, num_updates=58900, lr=0.000260599, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=54621 epoch 035: 1563 / 1689 loss=3.663, nll_loss=2.126, ppl=4.37, wps=550879, ups=1.11, wpb=495221, bsz=16807.2, num_updates=58900, lr=0.000260599, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=54621 epoch 035: 1563 / 1689 loss=3.663, nll_loss=2.126, ppl=4.37, wps=550879, ups=1.11, wpb=495221, bsz=16807.2, num_updates=58900, lr=0.000260599, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=54621 epoch 035: 1563 / 1689 loss=3.663, nll_loss=2.126, ppl=4.37, wps=550879, ups=1.11, wpb=495221, bsz=16807.2, num_updates=58900, lr=0.000260599, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=54621 epoch 035: 1563 / 1689 loss=3.663, nll_loss=2.126, ppl=4.37, wps=550879, ups=1.11, wpb=495221, bsz=16807.2, num_updates=58900, lr=0.000260599, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=54621 epoch 035: 1563 / 1689 loss=3.663, nll_loss=2.126, ppl=4.37, wps=550879, ups=1.11, wpb=495221, bsz=16807.2, num_updates=58900, lr=0.000260599, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=54621 epoch 035: 1563 / 1689 loss=3.663, nll_loss=2.126, ppl=4.37, wps=550879, ups=1.11, wpb=495221, bsz=16807.2, num_updates=58900, lr=0.000260599, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=54621 epoch 035: 1563 / 1689 loss=3.663, nll_loss=2.126, ppl=4.37, wps=550879, ups=1.11, wpb=495221, bsz=16807.2, num_updates=58900, lr=0.000260599, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=54621 epoch 035: 1563 / 1689 loss=3.663, nll_loss=2.126, ppl=4.37, wps=550879, ups=1.11, wpb=495221, bsz=16807.2, num_updates=58900, lr=0.000260599, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=54621 epoch 035: 1563 / 1689 loss=3.663, nll_loss=2.126, ppl=4.37, wps=550879, ups=1.11, wpb=495221, bsz=16807.2, num_updates=58900, lr=0.000260599, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=54621 epoch 035: 1563 / 1689 loss=3.663, nll_loss=2.126, ppl=4.37, wps=550879, ups=1.11, wpb=495221, bsz=16807.2, num_updates=58900, lr=0.000260599, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=54621 epoch 035: 1563 / 1689 loss=3.663, nll_loss=2.126, ppl=4.37, wps=550879, ups=1.11, wpb=495221, bsz=16807.2, num_updates=58900, lr=0.000260599, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=54621 epoch 035: 1563 / 1689 loss=3.663, nll_loss=2.126, ppl=4.37, wps=550879, ups=1.11, wpb=495221, bsz=16807.2, num_updates=58900, lr=0.000260599, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=54621 epoch 035: 1563 / 1689 loss=3.663, nll_loss=2.126, ppl=4.37, wps=550879, ups=1.11, wpb=495221, bsz=16807.2, num_updates=58900, lr=0.000260599, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=54621 epoch 035: 1563 / 1689 loss=3.663, nll_loss=2.126, ppl=4.37, wps=550879, ups=1.11, wpb=495221, bsz=16807.2, num_updates=58900, lr=0.000260599, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=54621 epoch 035: 1563 / 1689 loss=3.663, nll_loss=2.126, ppl=4.37, wps=550879, ups=1.11, wpb=495221, bsz=16807.2, num_updates=58900, lr=0.000260599, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=54621 epoch 035: 1563 / 1689 loss=3.663, nll_loss=2.126, ppl=4.37, wps=550879, ups=1.11, wpb=495221, bsz=16807.2, num_updates=58900, lr=0.000260599, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=54621 epoch 035: 1563 / 1689 loss=3.663, nll_loss=2.126, ppl=4.37, wps=550879, ups=1.11, wpb=495221, bsz=16807.2, num_updates=58900, lr=0.000260599, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=54621 epoch 035: 1563 / 1689 loss=3.663, nll_loss=2.126, ppl=4.37, wps=550879, ups=1.11, wpb=495221, bsz=16807.2, num_updates=58900, lr=0.000260599, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=54621 epoch 035: 1563 / 1689 loss=3.663, nll_loss=2.126, ppl=4.37, wps=550879, ups=1.11, wpb=495221, bsz=16807.2, num_updates=58900, lr=0.000260599, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=54621 epoch 035: 1563 / 1689 loss=3.663, nll_loss=2.126, ppl=4.37, wps=550879, ups=1.11, wpb=495221, bsz=16807.2, num_updates=58900, lr=0.000260599, gnorm=0.162, clip=0, loss_scale=2, train_wall=88, gb_free=20.8, wall=54621 epoch 035: 1663 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=548940, ups=1.11, wpb=495126, bsz=16793.3, num_updates=59000, lr=0.000260378, gnorm=0.157, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=54711 epoch 035: 1663 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=548940, ups=1.11, wpb=495126, bsz=16793.3, num_updates=59000, lr=0.000260378, gnorm=0.157, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=54711 epoch 035: 1663 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=548940, ups=1.11, wpb=495126, bsz=16793.3, num_updates=59000, lr=0.000260378, gnorm=0.157, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=54711 epoch 035: 1663 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=548940, ups=1.11, wpb=495126, bsz=16793.3, num_updates=59000, lr=0.000260378, gnorm=0.157, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=54711 epoch 035: 1663 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=548940, ups=1.11, wpb=495126, bsz=16793.3, num_updates=59000, lr=0.000260378, gnorm=0.157, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=54711 epoch 035: 1663 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=548940, ups=1.11, wpb=495126, bsz=16793.3, num_updates=59000, lr=0.000260378, gnorm=0.157, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=54711 epoch 035: 1663 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=548940, ups=1.11, wpb=495126, bsz=16793.3, num_updates=59000, lr=0.000260378, gnorm=0.157, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=54711 epoch 035: 1663 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=548940, ups=1.11, wpb=495126, bsz=16793.3, num_updates=59000, lr=0.000260378, gnorm=0.157, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=54711 epoch 035: 1663 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=548940, ups=1.11, wpb=495126, bsz=16793.3, num_updates=59000, lr=0.000260378, gnorm=0.157, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=54711 epoch 035: 1663 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=548940, ups=1.11, wpb=495126, bsz=16793.3, num_updates=59000, lr=0.000260378, gnorm=0.157, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=54711 epoch 035: 1663 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=548940, ups=1.11, wpb=495126, bsz=16793.3, num_updates=59000, lr=0.000260378, gnorm=0.157, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=54711 epoch 035: 1663 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=548940, ups=1.11, wpb=495126, bsz=16793.3, num_updates=59000, lr=0.000260378, gnorm=0.157, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=54711 epoch 035: 1663 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=548940, ups=1.11, wpb=495126, bsz=16793.3, num_updates=59000, lr=0.000260378, gnorm=0.157, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=54711 epoch 035: 1663 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=548940, ups=1.11, wpb=495126, bsz=16793.3, num_updates=59000, lr=0.000260378, gnorm=0.157, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=54711 epoch 035: 1663 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=548940, ups=1.11, wpb=495126, bsz=16793.3, num_updates=59000, lr=0.000260378, gnorm=0.157, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=54711 epoch 035: 1663 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=548940, ups=1.11, wpb=495126, bsz=16793.3, num_updates=59000, lr=0.000260378, gnorm=0.157, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=54711 epoch 035: 1663 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=548940, ups=1.11, wpb=495126, bsz=16793.3, num_updates=59000, lr=0.000260378, gnorm=0.157, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=54711 epoch 035: 1663 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=548940, ups=1.11, wpb=495126, bsz=16793.3, num_updates=59000, lr=0.000260378, gnorm=0.157, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=54711 epoch 035: 1663 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=548940, ups=1.11, wpb=495126, bsz=16793.3, num_updates=59000, lr=0.000260378, gnorm=0.157, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=54711 epoch 035: 1663 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=548940, ups=1.11, wpb=495126, bsz=16793.3, num_updates=59000, lr=0.000260378, gnorm=0.157, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=54711 epoch 035: 1663 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=548940, ups=1.11, wpb=495126, bsz=16793.3, num_updates=59000, lr=0.000260378, gnorm=0.157, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=54711 epoch 035: 1663 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=548940, ups=1.11, wpb=495126, bsz=16793.3, num_updates=59000, lr=0.000260378, gnorm=0.157, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=54711 epoch 035: 1663 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=548940, ups=1.11, wpb=495126, bsz=16793.3, num_updates=59000, lr=0.000260378, gnorm=0.157, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=54711 epoch 035: 1663 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=548940, ups=1.11, wpb=495126, bsz=16793.3, num_updates=59000, lr=0.000260378, gnorm=0.157, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=54711 epoch 035: 1663 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=548940, ups=1.11, wpb=495126, bsz=16793.3, num_updates=59000, lr=0.000260378, gnorm=0.157, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=54711 epoch 035: 1663 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=548940, ups=1.11, wpb=495126, bsz=16793.3, num_updates=59000, lr=0.000260378, gnorm=0.157, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=54711 epoch 035: 1663 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=548940, ups=1.11, wpb=495126, bsz=16793.3, num_updates=59000, lr=0.000260378, gnorm=0.157, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=54711 epoch 035: 1663 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=548940, ups=1.11, wpb=495126, bsz=16793.3, num_updates=59000, lr=0.000260378, gnorm=0.157, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=54711 epoch 035: 1663 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=548940, ups=1.11, wpb=495126, bsz=16793.3, num_updates=59000, lr=0.000260378, gnorm=0.157, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=54711 epoch 035: 1663 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=548940, ups=1.11, wpb=495126, bsz=16793.3, num_updates=59000, lr=0.000260378, gnorm=0.157, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=54711 epoch 035: 1663 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=548940, ups=1.11, wpb=495126, bsz=16793.3, num_updates=59000, lr=0.000260378, gnorm=0.157, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=54711 epoch 035: 1663 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=548940, ups=1.11, wpb=495126, bsz=16793.3, num_updates=59000, lr=0.000260378, gnorm=0.157, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=54711 epoch 035: 1663 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=548940, ups=1.11, wpb=495126, bsz=16793.3, num_updates=59000, lr=0.000260378, gnorm=0.157, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=54711 epoch 035: 1663 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=548940, ups=1.11, wpb=495126, bsz=16793.3, num_updates=59000, lr=0.000260378, gnorm=0.157, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=54711 epoch 035: 1663 / 1689 loss=3.666, nll_loss=2.129, ppl=4.37, wps=548940, ups=1.11, wpb=495126, bsz=16793.3, num_updates=59000, lr=0.000260378, gnorm=0.157, clip=0, loss_scale=4, train_wall=89, gb_free=21.6, wall=54711 begin validation on "valid" subset epoch 035 | valid on 'valid' subset | loss 3.708 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.701 epoch 035 | valid on 'valid' subset | loss 3.708 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.701 epoch 035 | valid on 'valid' subset | loss 3.708 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.701 epoch 035 | valid on 'valid' subset | loss 3.708 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.701 epoch 035 | valid on 'valid' subset | loss 3.708 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.701 epoch 035 | valid on 'valid' subset | loss 3.708 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.701 epoch 035 | valid on 'valid' subset | loss 3.708 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.701 epoch 035 | valid on 'valid' subset | loss 3.708 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.701 epoch 035 | valid on 'valid' subset | loss 3.708 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.701 epoch 035 | valid on 'valid' subset | loss 3.708 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.701 epoch 035 | valid on 'valid' subset | loss 3.708 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.701 epoch 035 | valid on 'valid' subset | loss 3.708 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.701 epoch 035 | valid on 'valid' subset | loss 3.708 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.701 epoch 035 | valid on 'valid' subset | loss 3.708 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.701 epoch 035 | valid on 'valid' subset | loss 3.708 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.701 epoch 035 | valid on 'valid' subset | loss 3.708 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.701 epoch 035 | valid on 'valid' subset | loss 3.708 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.701 epoch 035 | valid on 'valid' subset | loss 3.708 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.701 epoch 035 | valid on 'valid' subset | loss 3.708 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.701 epoch 035 | valid on 'valid' subset | loss 3.708 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.701 epoch 035 | valid on 'valid' subset | loss 3.708 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.701 epoch 035 | valid on 'valid' subset | loss 3.708 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.701 epoch 035 | valid on 'valid' subset | loss 3.708 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.701 epoch 035 | valid on 'valid' subset | loss 3.708 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.701 epoch 035 | valid on 'valid' subset | loss 3.708 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.701 epoch 035 | valid on 'valid' subset | loss 3.708 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.701 epoch 035 | valid on 'valid' subset | loss 3.708 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.701 epoch 035 | valid on 'valid' subset | loss 3.708 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.701 epoch 035 | valid on 'valid' subset | loss 3.708 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.701 epoch 035 | valid on 'valid' subset | loss 3.708 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.701 epoch 035 | valid on 'valid' subset | loss 3.708 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.701 epoch 035 | valid on 'valid' subset | loss 3.708 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.701 epoch 035 | valid on 'valid' subset | loss 3.708 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.701 epoch 035 | valid on 'valid' subset | loss 3.708 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.701 epoch 035 | valid on 'valid' subset | loss 3.708 | nll_loss 2.149 | ppl 4.43 | wps 0 | wpb 44526 | bsz 2008 | num_updates 59000 | best_loss 3.701 end of epoch 35 (average epoch stats below) epoch 035 | loss 3.658 | nll_loss 2.12 | ppl 4.35 | wps 528439 | ups 1.07 | wpb 495127 | bsz 16506 | num_updates 59026 | lr 0.00026032 | gnorm 0.165 | clip 0 | loss_scale 4 | train_wall 1523 | gb_free 22.8 | wall 54781 epoch 035 | loss 3.658 | nll_loss 2.12 | ppl 4.35 | wps 528439 | ups 1.07 | wpb 495127 | bsz 16506 | num_updates 59026 | lr 0.00026032 | gnorm 0.165 | clip 0 | loss_scale 4 | train_wall 1523 | gb_free 22.8 | wall 54781 epoch 035 | loss 3.658 | nll_loss 2.12 | ppl 4.35 | wps 528439 | ups 1.07 | wpb 495127 | bsz 16506 | num_updates 59026 | lr 0.00026032 | gnorm 0.165 | clip 0 | loss_scale 4 | train_wall 1523 | gb_free 22.8 | wall 54781 epoch 035 | loss 3.658 | nll_loss 2.12 | ppl 4.35 | wps 528439 | ups 1.07 | wpb 495127 | bsz 16506 | num_updates 59026 | lr 0.00026032 | gnorm 0.165 | clip 0 | loss_scale 4 | train_wall 1523 | gb_free 22.8 | wall 54781 epoch 035 | loss 3.658 | nll_loss 2.12 | ppl 4.35 | wps 528439 | ups 1.07 | wpb 495127 | bsz 16506 | num_updates 59026 | lr 0.00026032 | gnorm 0.165 | clip 0 | loss_scale 4 | train_wall 1523 | gb_free 22.8 | wall 54781 epoch 035 | loss 3.658 | nll_loss 2.12 | ppl 4.35 | wps 528439 | ups 1.07 | wpb 495127 | bsz 16506 | num_updates 59026 | lr 0.00026032 | gnorm 0.165 | clip 0 | loss_scale 4 | train_wall 1523 | gb_free 22.8 | wall 54781 epoch 035 | loss 3.658 | nll_loss 2.12 | ppl 4.35 | wps 528439 | ups 1.07 | wpb 495127 | bsz 16506 | num_updates 59026 | lr 0.00026032 | gnorm 0.165 | clip 0 | loss_scale 4 | train_wall 1523 | gb_free 22.8 | wall 54781 epoch 035 | loss 3.658 | nll_loss 2.12 | ppl 4.35 | wps 528439 | ups 1.07 | wpb 495127 | bsz 16506 | num_updates 59026 | lr 0.00026032 | gnorm 0.165 | clip 0 | loss_scale 4 | train_wall 1523 | gb_free 22.8 | wall 54781 epoch 035 | loss 3.658 | nll_loss 2.12 | ppl 4.35 | wps 528439 | ups 1.07 | wpb 495127 | bsz 16506 | num_updates 59026 | lr 0.00026032 | gnorm 0.165 | clip 0 | loss_scale 4 | train_wall 1523 | gb_free 22.8 | wall 54781 epoch 035 | loss 3.658 | nll_loss 2.12 | ppl 4.35 | wps 528439 | ups 1.07 | wpb 495127 | bsz 16506 | num_updates 59026 | lr 0.00026032 | gnorm 0.165 | clip 0 | loss_scale 4 | train_wall 1523 | gb_free 22.8 | wall 54781 epoch 035 | loss 3.658 | nll_loss 2.12 | ppl 4.35 | wps 528439 | ups 1.07 | wpb 495127 | bsz 16506 | num_updates 59026 | lr 0.00026032 | gnorm 0.165 | clip 0 | loss_scale 4 | train_wall 1523 | gb_free 22.8 | wall 54781 epoch 035 | loss 3.658 | nll_loss 2.12 | ppl 4.35 | wps 528439 | ups 1.07 | wpb 495127 | bsz 16506 | num_updates 59026 | lr 0.00026032 | gnorm 0.165 | clip 0 | loss_scale 4 | train_wall 1523 | gb_free 22.8 | wall 54781 epoch 035 | loss 3.658 | nll_loss 2.12 | ppl 4.35 | wps 528439 | ups 1.07 | wpb 495127 | bsz 16506 | num_updates 59026 | lr 0.00026032 | gnorm 0.165 | clip 0 | loss_scale 4 | train_wall 1523 | gb_free 22.8 | wall 54781 epoch 035 | loss 3.658 | nll_loss 2.12 | ppl 4.35 | wps 528439 | ups 1.07 | wpb 495127 | bsz 16506 | num_updates 59026 | lr 0.00026032 | gnorm 0.165 | clip 0 | loss_scale 4 | train_wall 1523 | gb_free 22.8 | wall 54781 epoch 035 | loss 3.658 | nll_loss 2.12 | ppl 4.35 | wps 528439 | ups 1.07 | wpb 495127 | bsz 16506 | num_updates 59026 | lr 0.00026032 | gnorm 0.165 | clip 0 | loss_scale 4 | train_wall 1523 | gb_free 22.8 | wall 54781 epoch 035 | loss 3.658 | nll_loss 2.12 | ppl 4.35 | wps 528439 | ups 1.07 | wpb 495127 | bsz 16506 | num_updates 59026 | lr 0.00026032 | gnorm 0.165 | clip 0 | loss_scale 4 | train_wall 1523 | gb_free 22.8 | wall 54781 epoch 035 | loss 3.658 | nll_loss 2.12 | ppl 4.35 | wps 528439 | ups 1.07 | wpb 495127 | bsz 16506 | num_updates 59026 | lr 0.00026032 | gnorm 0.165 | clip 0 | loss_scale 4 | train_wall 1523 | gb_free 22.8 | wall 54781 epoch 035 | loss 3.658 | nll_loss 2.12 | ppl 4.35 | wps 528439 | ups 1.07 | wpb 495127 | bsz 16506 | num_updates 59026 | lr 0.00026032 | gnorm 0.165 | clip 0 | loss_scale 4 | train_wall 1523 | gb_free 22.8 | wall 54781 epoch 035 | loss 3.658 | nll_loss 2.12 | ppl 4.35 | wps 528439 | ups 1.07 | wpb 495127 | bsz 16506 | num_updates 59026 | lr 0.00026032 | gnorm 0.165 | clip 0 | loss_scale 4 | train_wall 1523 | gb_free 22.8 | wall 54781 epoch 035 | loss 3.658 | nll_loss 2.12 | ppl 4.35 | wps 528439 | ups 1.07 | wpb 495127 | bsz 16506 | num_updates 59026 | lr 0.00026032 | gnorm 0.165 | clip 0 | loss_scale 4 | train_wall 1523 | gb_free 22.8 | wall 54781 epoch 035 | loss 3.658 | nll_loss 2.12 | ppl 4.35 | wps 528439 | ups 1.07 | wpb 495127 | bsz 16506 | num_updates 59026 | lr 0.00026032 | gnorm 0.165 | clip 0 | loss_scale 4 | train_wall 1523 | gb_free 22.8 | wall 54781 epoch 035 | loss 3.658 | nll_loss 2.12 | ppl 4.35 | wps 528439 | ups 1.07 | wpb 495127 | bsz 16506 | num_updates 59026 | lr 0.00026032 | gnorm 0.165 | clip 0 | loss_scale 4 | train_wall 1523 | gb_free 22.8 | wall 54781 epoch 035 | loss 3.658 | nll_loss 2.12 | ppl 4.35 | wps 528439 | ups 1.07 | wpb 495127 | bsz 16506 | num_updates 59026 | lr 0.00026032 | gnorm 0.165 | clip 0 | loss_scale 4 | train_wall 1523 | gb_free 22.8 | wall 54781 epoch 035 | loss 3.658 | nll_loss 2.12 | ppl 4.35 | wps 528439 | ups 1.07 | wpb 495127 | bsz 16506 | num_updates 59026 | lr 0.00026032 | gnorm 0.165 | clip 0 | loss_scale 4 | train_wall 1523 | gb_free 22.8 | wall 54781 epoch 035 | loss 3.658 | nll_loss 2.12 | ppl 4.35 | wps 528439 | ups 1.07 | wpb 495127 | bsz 16506 | num_updates 59026 | lr 0.00026032 | gnorm 0.165 | clip 0 | loss_scale 4 | train_wall 1523 | gb_free 22.8 | wall 54781 epoch 035 | loss 3.658 | nll_loss 2.12 | ppl 4.35 | wps 528439 | ups 1.07 | wpb 495127 | bsz 16506 | num_updates 59026 | lr 0.00026032 | gnorm 0.165 | clip 0 | loss_scale 4 | train_wall 1523 | gb_free 22.8 | wall 54781 epoch 035 | loss 3.658 | nll_loss 2.12 | ppl 4.35 | wps 528439 | ups 1.07 | wpb 495127 | bsz 16506 | num_updates 59026 | lr 0.00026032 | gnorm 0.165 | clip 0 | loss_scale 4 | train_wall 1523 | gb_free 22.8 | wall 54781 epoch 035 | loss 3.658 | nll_loss 2.12 | ppl 4.35 | wps 528439 | ups 1.07 | wpb 495127 | bsz 16506 | num_updates 59026 | lr 0.00026032 | gnorm 0.165 | clip 0 | loss_scale 4 | train_wall 1523 | gb_free 22.8 | wall 54781 epoch 035 | loss 3.658 | nll_loss 2.12 | ppl 4.35 | wps 528439 | ups 1.07 | wpb 495127 | bsz 16506 | num_updates 59026 | lr 0.00026032 | gnorm 0.165 | clip 0 | loss_scale 4 | train_wall 1523 | gb_free 22.8 | wall 54781 epoch 035 | loss 3.658 | nll_loss 2.12 | ppl 4.35 | wps 528439 | ups 1.07 | wpb 495127 | bsz 16506 | num_updates 59026 | lr 0.00026032 | gnorm 0.165 | clip 0 | loss_scale 4 | train_wall 1523 | gb_free 22.8 | wall 54781 epoch 035 | loss 3.658 | nll_loss 2.12 | ppl 4.35 | wps 528439 | ups 1.07 | wpb 495127 | bsz 16506 | num_updates 59026 | lr 0.00026032 | gnorm 0.165 | clip 0 | loss_scale 4 | train_wall 1523 | gb_free 22.8 | wall 54781 epoch 035 | loss 3.658 | nll_loss 2.12 | ppl 4.35 | wps 528439 | ups 1.07 | wpb 495127 | bsz 16506 | num_updates 59026 | lr 0.00026032 | gnorm 0.165 | clip 0 | loss_scale 4 | train_wall 1523 | gb_free 22.8 | wall 54781 epoch 035 | loss 3.658 | nll_loss 2.12 | ppl 4.35 | wps 528439 | ups 1.07 | wpb 495127 | bsz 16506 | num_updates 59026 | lr 0.00026032 | gnorm 0.165 | clip 0 | loss_scale 4 | train_wall 1523 | gb_free 22.8 | wall 54781 epoch 035 | loss 3.658 | nll_loss 2.12 | ppl 4.35 | wps 528439 | ups 1.07 | wpb 495127 | bsz 16506 | num_updates 59026 | lr 0.00026032 | gnorm 0.165 | clip 0 | loss_scale 4 | train_wall 1523 | gb_free 22.8 | wall 54781 epoch 035 | loss 3.658 | nll_loss 2.12 | ppl 4.35 | wps 528439 | ups 1.07 | wpb 495127 | bsz 16506 | num_updates 59026 | lr 0.00026032 | gnorm 0.165 | clip 0 | loss_scale 4 | train_wall 1523 | gb_free 22.8 | wall 54781 Start iterating over samples epoch 036: 74 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=357916, ups=0.73, wpb=491497, bsz=16776.2, num_updates=59100, lr=0.000260157, gnorm=0.16, clip=0, loss_scale=4, train_wall=118, gb_free=21.6, wall=54849 epoch 036: 74 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=357916, ups=0.73, wpb=491497, bsz=16776.2, num_updates=59100, lr=0.000260157, gnorm=0.16, clip=0, loss_scale=4, train_wall=118, gb_free=21.6, wall=54849 epoch 036: 74 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=357916, ups=0.73, wpb=491497, bsz=16776.2, num_updates=59100, lr=0.000260157, gnorm=0.16, clip=0, loss_scale=4, train_wall=118, gb_free=21.6, wall=54849 epoch 036: 74 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=357916, ups=0.73, wpb=491497, bsz=16776.2, num_updates=59100, lr=0.000260157, gnorm=0.16, clip=0, loss_scale=4, train_wall=118, gb_free=21.6, wall=54849 epoch 036: 74 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=357916, ups=0.73, wpb=491497, bsz=16776.2, num_updates=59100, lr=0.000260157, gnorm=0.16, clip=0, loss_scale=4, train_wall=118, gb_free=21.6, wall=54849 epoch 036: 74 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=357916, ups=0.73, wpb=491497, bsz=16776.2, num_updates=59100, lr=0.000260157, gnorm=0.16, clip=0, loss_scale=4, train_wall=118, gb_free=21.6, wall=54849 epoch 036: 74 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=357916, ups=0.73, wpb=491497, bsz=16776.2, num_updates=59100, lr=0.000260157, gnorm=0.16, clip=0, loss_scale=4, train_wall=118, gb_free=21.6, wall=54849 epoch 036: 74 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=357916, ups=0.73, wpb=491497, bsz=16776.2, num_updates=59100, lr=0.000260157, gnorm=0.16, clip=0, loss_scale=4, train_wall=118, gb_free=21.6, wall=54849 epoch 036: 74 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=357916, ups=0.73, wpb=491497, bsz=16776.2, num_updates=59100, lr=0.000260157, gnorm=0.16, clip=0, loss_scale=4, train_wall=118, gb_free=21.6, wall=54849 epoch 036: 74 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=357916, ups=0.73, wpb=491497, bsz=16776.2, num_updates=59100, lr=0.000260157, gnorm=0.16, clip=0, loss_scale=4, train_wall=118, gb_free=21.6, wall=54849 epoch 036: 74 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=357916, ups=0.73, wpb=491497, bsz=16776.2, num_updates=59100, lr=0.000260157, gnorm=0.16, clip=0, loss_scale=4, train_wall=118, gb_free=21.6, wall=54849 epoch 036: 74 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=357916, ups=0.73, wpb=491497, bsz=16776.2, num_updates=59100, lr=0.000260157, gnorm=0.16, clip=0, loss_scale=4, train_wall=118, gb_free=21.6, wall=54849 epoch 036: 74 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=357916, ups=0.73, wpb=491497, bsz=16776.2, num_updates=59100, lr=0.000260157, gnorm=0.16, clip=0, loss_scale=4, train_wall=118, gb_free=21.6, wall=54849 epoch 036: 74 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=357916, ups=0.73, wpb=491497, bsz=16776.2, num_updates=59100, lr=0.000260157, gnorm=0.16, clip=0, loss_scale=4, train_wall=118, gb_free=21.6, wall=54849 epoch 036: 74 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=357916, ups=0.73, wpb=491497, bsz=16776.2, num_updates=59100, lr=0.000260157, gnorm=0.16, clip=0, loss_scale=4, train_wall=118, gb_free=21.6, wall=54849 epoch 036: 74 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=357916, ups=0.73, wpb=491497, bsz=16776.2, num_updates=59100, lr=0.000260157, gnorm=0.16, clip=0, loss_scale=4, train_wall=118, gb_free=21.6, wall=54849 epoch 036: 74 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=357916, ups=0.73, wpb=491497, bsz=16776.2, num_updates=59100, lr=0.000260157, gnorm=0.16, clip=0, loss_scale=4, train_wall=118, gb_free=21.6, wall=54849 epoch 036: 74 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=357916, ups=0.73, wpb=491497, bsz=16776.2, num_updates=59100, lr=0.000260157, gnorm=0.16, clip=0, loss_scale=4, train_wall=118, gb_free=21.6, wall=54849 epoch 036: 74 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=357916, ups=0.73, wpb=491497, bsz=16776.2, num_updates=59100, lr=0.000260157, gnorm=0.16, clip=0, loss_scale=4, train_wall=118, gb_free=21.6, wall=54849 epoch 036: 74 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=357916, ups=0.73, wpb=491497, bsz=16776.2, num_updates=59100, lr=0.000260157, gnorm=0.16, clip=0, loss_scale=4, train_wall=118, gb_free=21.6, wall=54849 epoch 036: 74 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=357916, ups=0.73, wpb=491497, bsz=16776.2, num_updates=59100, lr=0.000260157, gnorm=0.16, clip=0, loss_scale=4, train_wall=118, gb_free=21.6, wall=54849 epoch 036: 74 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=357916, ups=0.73, wpb=491497, bsz=16776.2, num_updates=59100, lr=0.000260157, gnorm=0.16, clip=0, loss_scale=4, train_wall=118, gb_free=21.6, wall=54849 epoch 036: 74 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=357916, ups=0.73, wpb=491497, bsz=16776.2, num_updates=59100, lr=0.000260157, gnorm=0.16, clip=0, loss_scale=4, train_wall=118, gb_free=21.6, wall=54849 epoch 036: 74 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=357916, ups=0.73, wpb=491497, bsz=16776.2, num_updates=59100, lr=0.000260157, gnorm=0.16, clip=0, loss_scale=4, train_wall=118, gb_free=21.6, wall=54849 epoch 036: 74 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=357916, ups=0.73, wpb=491497, bsz=16776.2, num_updates=59100, lr=0.000260157, gnorm=0.16, clip=0, loss_scale=4, train_wall=118, gb_free=21.6, wall=54849 epoch 036: 74 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=357916, ups=0.73, wpb=491497, bsz=16776.2, num_updates=59100, lr=0.000260157, gnorm=0.16, clip=0, loss_scale=4, train_wall=118, gb_free=21.6, wall=54849 epoch 036: 74 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=357916, ups=0.73, wpb=491497, bsz=16776.2, num_updates=59100, lr=0.000260157, gnorm=0.16, clip=0, loss_scale=4, train_wall=118, gb_free=21.6, wall=54849 epoch 036: 74 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=357916, ups=0.73, wpb=491497, bsz=16776.2, num_updates=59100, lr=0.000260157, gnorm=0.16, clip=0, loss_scale=4, train_wall=118, gb_free=21.6, wall=54849 epoch 036: 74 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=357916, ups=0.73, wpb=491497, bsz=16776.2, num_updates=59100, lr=0.000260157, gnorm=0.16, clip=0, loss_scale=4, train_wall=118, gb_free=21.6, wall=54849 epoch 036: 74 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=357916, ups=0.73, wpb=491497, bsz=16776.2, num_updates=59100, lr=0.000260157, gnorm=0.16, clip=0, loss_scale=4, train_wall=118, gb_free=21.6, wall=54849 epoch 036: 74 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=357916, ups=0.73, wpb=491497, bsz=16776.2, num_updates=59100, lr=0.000260157, gnorm=0.16, clip=0, loss_scale=4, train_wall=118, gb_free=21.6, wall=54849 epoch 036: 74 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=357916, ups=0.73, wpb=491497, bsz=16776.2, num_updates=59100, lr=0.000260157, gnorm=0.16, clip=0, loss_scale=4, train_wall=118, gb_free=21.6, wall=54849 epoch 036: 74 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=357916, ups=0.73, wpb=491497, bsz=16776.2, num_updates=59100, lr=0.000260157, gnorm=0.16, clip=0, loss_scale=4, train_wall=118, gb_free=21.6, wall=54849 epoch 036: 74 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=357916, ups=0.73, wpb=491497, bsz=16776.2, num_updates=59100, lr=0.000260157, gnorm=0.16, clip=0, loss_scale=4, train_wall=118, gb_free=21.6, wall=54849 epoch 036: 74 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=357916, ups=0.73, wpb=491497, bsz=16776.2, num_updates=59100, lr=0.000260157, gnorm=0.16, clip=0, loss_scale=4, train_wall=118, gb_free=21.6, wall=54849 epoch 036: 74 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=357916, ups=0.73, wpb=491497, bsz=16776.2, num_updates=59100, lr=0.000260157, gnorm=0.16, clip=0, loss_scale=4, train_wall=118, gb_free=21.6, wall=54849 epoch 036: 175 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=543475, ups=1.1, wpb=493779, bsz=16316.4, num_updates=59200, lr=0.000259938, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=54940 epoch 036: 175 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=543475, ups=1.1, wpb=493779, bsz=16316.4, num_updates=59200, lr=0.000259938, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=54940 epoch 036: 175 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=543475, ups=1.1, wpb=493779, bsz=16316.4, num_updates=59200, lr=0.000259938, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=54940 epoch 036: 175 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=543475, ups=1.1, wpb=493779, bsz=16316.4, num_updates=59200, lr=0.000259938, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=54940 epoch 036: 175 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=543475, ups=1.1, wpb=493779, bsz=16316.4, num_updates=59200, lr=0.000259938, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=54940 epoch 036: 175 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=543475, ups=1.1, wpb=493779, bsz=16316.4, num_updates=59200, lr=0.000259938, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=54940 epoch 036: 175 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=543475, ups=1.1, wpb=493779, bsz=16316.4, num_updates=59200, lr=0.000259938, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=54940 epoch 036: 175 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=543475, ups=1.1, wpb=493779, bsz=16316.4, num_updates=59200, lr=0.000259938, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=54940 epoch 036: 175 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=543475, ups=1.1, wpb=493779, bsz=16316.4, num_updates=59200, lr=0.000259938, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=54940 epoch 036: 175 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=543475, ups=1.1, wpb=493779, bsz=16316.4, num_updates=59200, lr=0.000259938, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=54940 epoch 036: 175 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=543475, ups=1.1, wpb=493779, bsz=16316.4, num_updates=59200, lr=0.000259938, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=54940 epoch 036: 175 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=543475, ups=1.1, wpb=493779, bsz=16316.4, num_updates=59200, lr=0.000259938, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=54940 epoch 036: 175 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=543475, ups=1.1, wpb=493779, bsz=16316.4, num_updates=59200, lr=0.000259938, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=54940 epoch 036: 175 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=543475, ups=1.1, wpb=493779, bsz=16316.4, num_updates=59200, lr=0.000259938, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=54940 epoch 036: 175 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=543475, ups=1.1, wpb=493779, bsz=16316.4, num_updates=59200, lr=0.000259938, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=54940 epoch 036: 175 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=543475, ups=1.1, wpb=493779, bsz=16316.4, num_updates=59200, lr=0.000259938, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=54940 epoch 036: 175 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=543475, ups=1.1, wpb=493779, bsz=16316.4, num_updates=59200, lr=0.000259938, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=54940 epoch 036: 175 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=543475, ups=1.1, wpb=493779, bsz=16316.4, num_updates=59200, lr=0.000259938, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=54940 epoch 036: 175 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=543475, ups=1.1, wpb=493779, bsz=16316.4, num_updates=59200, lr=0.000259938, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=54940 epoch 036: 175 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=543475, ups=1.1, wpb=493779, bsz=16316.4, num_updates=59200, lr=0.000259938, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=54940 epoch 036: 175 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=543475, ups=1.1, wpb=493779, bsz=16316.4, num_updates=59200, lr=0.000259938, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=54940 epoch 036: 175 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=543475, ups=1.1, wpb=493779, bsz=16316.4, num_updates=59200, lr=0.000259938, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=54940 epoch 036: 175 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=543475, ups=1.1, wpb=493779, bsz=16316.4, num_updates=59200, lr=0.000259938, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=54940 epoch 036: 175 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=543475, ups=1.1, wpb=493779, bsz=16316.4, num_updates=59200, lr=0.000259938, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=54940 epoch 036: 175 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=543475, ups=1.1, wpb=493779, bsz=16316.4, num_updates=59200, lr=0.000259938, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=54940 epoch 036: 175 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=543475, ups=1.1, wpb=493779, bsz=16316.4, num_updates=59200, lr=0.000259938, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=54940 epoch 036: 175 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=543475, ups=1.1, wpb=493779, bsz=16316.4, num_updates=59200, lr=0.000259938, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=54940 epoch 036: 175 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=543475, ups=1.1, wpb=493779, bsz=16316.4, num_updates=59200, lr=0.000259938, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=54940 epoch 036: 175 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=543475, ups=1.1, wpb=493779, bsz=16316.4, num_updates=59200, lr=0.000259938, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=54940 epoch 036: 175 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=543475, ups=1.1, wpb=493779, bsz=16316.4, num_updates=59200, lr=0.000259938, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=54940 epoch 036: 175 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=543475, ups=1.1, wpb=493779, bsz=16316.4, num_updates=59200, lr=0.000259938, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=54940 epoch 036: 175 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=543475, ups=1.1, wpb=493779, bsz=16316.4, num_updates=59200, lr=0.000259938, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=54940 epoch 036: 175 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=543475, ups=1.1, wpb=493779, bsz=16316.4, num_updates=59200, lr=0.000259938, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=54940 epoch 036: 175 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=543475, ups=1.1, wpb=493779, bsz=16316.4, num_updates=59200, lr=0.000259938, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=54940 epoch 036: 175 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=543475, ups=1.1, wpb=493779, bsz=16316.4, num_updates=59200, lr=0.000259938, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=54940 epoch 036: 175 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=543475, ups=1.1, wpb=493779, bsz=16316.4, num_updates=59200, lr=0.000259938, gnorm=0.173, clip=0, loss_scale=2, train_wall=89, gb_free=21.5, wall=54940 epoch 036: 276 / 1689 loss=3.649, nll_loss=2.11, ppl=4.32, wps=550406, ups=1.11, wpb=494648, bsz=16223.8, num_updates=59300, lr=0.000259718, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=55029 epoch 036: 276 / 1689 loss=3.649, nll_loss=2.11, ppl=4.32, wps=550406, ups=1.11, wpb=494648, bsz=16223.8, num_updates=59300, lr=0.000259718, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=55029 epoch 036: 276 / 1689 loss=3.649, nll_loss=2.11, ppl=4.32, wps=550406, ups=1.11, wpb=494648, bsz=16223.8, num_updates=59300, lr=0.000259718, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=55029 epoch 036: 276 / 1689 loss=3.649, nll_loss=2.11, ppl=4.32, wps=550406, ups=1.11, wpb=494648, bsz=16223.8, num_updates=59300, lr=0.000259718, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=55029 epoch 036: 276 / 1689 loss=3.649, nll_loss=2.11, ppl=4.32, wps=550406, ups=1.11, wpb=494648, bsz=16223.8, num_updates=59300, lr=0.000259718, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=55029 epoch 036: 276 / 1689 loss=3.649, nll_loss=2.11, ppl=4.32, wps=550406, ups=1.11, wpb=494648, bsz=16223.8, num_updates=59300, lr=0.000259718, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=55029 epoch 036: 276 / 1689 loss=3.649, nll_loss=2.11, ppl=4.32, wps=550406, ups=1.11, wpb=494648, bsz=16223.8, num_updates=59300, lr=0.000259718, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=55029 epoch 036: 276 / 1689 loss=3.649, nll_loss=2.11, ppl=4.32, wps=550406, ups=1.11, wpb=494648, bsz=16223.8, num_updates=59300, lr=0.000259718, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=55029 epoch 036: 276 / 1689 loss=3.649, nll_loss=2.11, ppl=4.32, wps=550406, ups=1.11, wpb=494648, bsz=16223.8, num_updates=59300, lr=0.000259718, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=55029 epoch 036: 276 / 1689 loss=3.649, nll_loss=2.11, ppl=4.32, wps=550406, ups=1.11, wpb=494648, bsz=16223.8, num_updates=59300, lr=0.000259718, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=55029 epoch 036: 276 / 1689 loss=3.649, nll_loss=2.11, ppl=4.32, wps=550406, ups=1.11, wpb=494648, bsz=16223.8, num_updates=59300, lr=0.000259718, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=55029 epoch 036: 276 / 1689 loss=3.649, nll_loss=2.11, ppl=4.32, wps=550406, ups=1.11, wpb=494648, bsz=16223.8, num_updates=59300, lr=0.000259718, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=55029 epoch 036: 276 / 1689 loss=3.649, nll_loss=2.11, ppl=4.32, wps=550406, ups=1.11, wpb=494648, bsz=16223.8, num_updates=59300, lr=0.000259718, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=55029 epoch 036: 276 / 1689 loss=3.649, nll_loss=2.11, ppl=4.32, wps=550406, ups=1.11, wpb=494648, bsz=16223.8, num_updates=59300, lr=0.000259718, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=55029 epoch 036: 276 / 1689 loss=3.649, nll_loss=2.11, ppl=4.32, wps=550406, ups=1.11, wpb=494648, bsz=16223.8, num_updates=59300, lr=0.000259718, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=55029 epoch 036: 276 / 1689 loss=3.649, nll_loss=2.11, ppl=4.32, wps=550406, ups=1.11, wpb=494648, bsz=16223.8, num_updates=59300, lr=0.000259718, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=55029 epoch 036: 276 / 1689 loss=3.649, nll_loss=2.11, ppl=4.32, wps=550406, ups=1.11, wpb=494648, bsz=16223.8, num_updates=59300, lr=0.000259718, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=55029 epoch 036: 276 / 1689 loss=3.649, nll_loss=2.11, ppl=4.32, wps=550406, ups=1.11, wpb=494648, bsz=16223.8, num_updates=59300, lr=0.000259718, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=55029 epoch 036: 276 / 1689 loss=3.649, nll_loss=2.11, ppl=4.32, wps=550406, ups=1.11, wpb=494648, bsz=16223.8, num_updates=59300, lr=0.000259718, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=55029 epoch 036: 276 / 1689 loss=3.649, nll_loss=2.11, ppl=4.32, wps=550406, ups=1.11, wpb=494648, bsz=16223.8, num_updates=59300, lr=0.000259718, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=55029 epoch 036: 276 / 1689 loss=3.649, nll_loss=2.11, ppl=4.32, wps=550406, ups=1.11, wpb=494648, bsz=16223.8, num_updates=59300, lr=0.000259718, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=55029 epoch 036: 276 / 1689 loss=3.649, nll_loss=2.11, ppl=4.32, wps=550406, ups=1.11, wpb=494648, bsz=16223.8, num_updates=59300, lr=0.000259718, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=55029 epoch 036: 276 / 1689 loss=3.649, nll_loss=2.11, ppl=4.32, wps=550406, ups=1.11, wpb=494648, bsz=16223.8, num_updates=59300, lr=0.000259718, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=55029 epoch 036: 276 / 1689 loss=3.649, nll_loss=2.11, ppl=4.32, wps=550406, ups=1.11, wpb=494648, bsz=16223.8, num_updates=59300, lr=0.000259718, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=55029 epoch 036: 276 / 1689 loss=3.649, nll_loss=2.11, ppl=4.32, wps=550406, ups=1.11, wpb=494648, bsz=16223.8, num_updates=59300, lr=0.000259718, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=55029 epoch 036: 276 / 1689 loss=3.649, nll_loss=2.11, ppl=4.32, wps=550406, ups=1.11, wpb=494648, bsz=16223.8, num_updates=59300, lr=0.000259718, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=55029 epoch 036: 276 / 1689 loss=3.649, nll_loss=2.11, ppl=4.32, wps=550406, ups=1.11, wpb=494648, bsz=16223.8, num_updates=59300, lr=0.000259718, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=55029 epoch 036: 276 / 1689 loss=3.649, nll_loss=2.11, ppl=4.32, wps=550406, ups=1.11, wpb=494648, bsz=16223.8, num_updates=59300, lr=0.000259718, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=55029 epoch 036: 276 / 1689 loss=3.649, nll_loss=2.11, ppl=4.32, wps=550406, ups=1.11, wpb=494648, bsz=16223.8, num_updates=59300, lr=0.000259718, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=55029 epoch 036: 276 / 1689 loss=3.649, nll_loss=2.11, ppl=4.32, wps=550406, ups=1.11, wpb=494648, bsz=16223.8, num_updates=59300, lr=0.000259718, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=55029 epoch 036: 276 / 1689 loss=3.649, nll_loss=2.11, ppl=4.32, wps=550406, ups=1.11, wpb=494648, bsz=16223.8, num_updates=59300, lr=0.000259718, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=55029 epoch 036: 276 / 1689 loss=3.649, nll_loss=2.11, ppl=4.32, wps=550406, ups=1.11, wpb=494648, bsz=16223.8, num_updates=59300, lr=0.000259718, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=55029 epoch 036: 276 / 1689 loss=3.649, nll_loss=2.11, ppl=4.32, wps=550406, ups=1.11, wpb=494648, bsz=16223.8, num_updates=59300, lr=0.000259718, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=55029 epoch 036: 276 / 1689 loss=3.649, nll_loss=2.11, ppl=4.32, wps=550406, ups=1.11, wpb=494648, bsz=16223.8, num_updates=59300, lr=0.000259718, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=55029 epoch 036: 276 / 1689 loss=3.649, nll_loss=2.11, ppl=4.32, wps=550406, ups=1.11, wpb=494648, bsz=16223.8, num_updates=59300, lr=0.000259718, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=55029 epoch 036: 276 / 1689 loss=3.649, nll_loss=2.11, ppl=4.32, wps=550406, ups=1.11, wpb=494648, bsz=16223.8, num_updates=59300, lr=0.000259718, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.8, wall=55029 epoch 036: 376 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=555566, ups=1.12, wpb=496408, bsz=16745.4, num_updates=59400, lr=0.0002595, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=55119 epoch 036: 376 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=555566, ups=1.12, wpb=496408, bsz=16745.4, num_updates=59400, lr=0.0002595, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=55119 epoch 036: 376 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=555566, ups=1.12, wpb=496408, bsz=16745.4, num_updates=59400, lr=0.0002595, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=55119 epoch 036: 376 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=555566, ups=1.12, wpb=496408, bsz=16745.4, num_updates=59400, lr=0.0002595, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=55119 epoch 036: 376 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=555566, ups=1.12, wpb=496408, bsz=16745.4, num_updates=59400, lr=0.0002595, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=55119 epoch 036: 376 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=555566, ups=1.12, wpb=496408, bsz=16745.4, num_updates=59400, lr=0.0002595, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=55119 epoch 036: 376 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=555566, ups=1.12, wpb=496408, bsz=16745.4, num_updates=59400, lr=0.0002595, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=55119 epoch 036: 376 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=555566, ups=1.12, wpb=496408, bsz=16745.4, num_updates=59400, lr=0.0002595, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=55119 epoch 036: 376 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=555566, ups=1.12, wpb=496408, bsz=16745.4, num_updates=59400, lr=0.0002595, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=55119 epoch 036: 376 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=555566, ups=1.12, wpb=496408, bsz=16745.4, num_updates=59400, lr=0.0002595, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=55119 epoch 036: 376 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=555566, ups=1.12, wpb=496408, bsz=16745.4, num_updates=59400, lr=0.0002595, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=55119 epoch 036: 376 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=555566, ups=1.12, wpb=496408, bsz=16745.4, num_updates=59400, lr=0.0002595, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=55119 epoch 036: 376 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=555566, ups=1.12, wpb=496408, bsz=16745.4, num_updates=59400, lr=0.0002595, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=55119 epoch 036: 376 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=555566, ups=1.12, wpb=496408, bsz=16745.4, num_updates=59400, lr=0.0002595, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=55119 epoch 036: 376 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=555566, ups=1.12, wpb=496408, bsz=16745.4, num_updates=59400, lr=0.0002595, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=55119 epoch 036: 376 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=555566, ups=1.12, wpb=496408, bsz=16745.4, num_updates=59400, lr=0.0002595, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=55119 epoch 036: 376 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=555566, ups=1.12, wpb=496408, bsz=16745.4, num_updates=59400, lr=0.0002595, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=55119 epoch 036: 376 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=555566, ups=1.12, wpb=496408, bsz=16745.4, num_updates=59400, lr=0.0002595, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=55119 epoch 036: 376 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=555566, ups=1.12, wpb=496408, bsz=16745.4, num_updates=59400, lr=0.0002595, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=55119 epoch 036: 376 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=555566, ups=1.12, wpb=496408, bsz=16745.4, num_updates=59400, lr=0.0002595, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=55119 epoch 036: 376 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=555566, ups=1.12, wpb=496408, bsz=16745.4, num_updates=59400, lr=0.0002595, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=55119 epoch 036: 376 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=555566, ups=1.12, wpb=496408, bsz=16745.4, num_updates=59400, lr=0.0002595, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=55119 epoch 036: 376 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=555566, ups=1.12, wpb=496408, bsz=16745.4, num_updates=59400, lr=0.0002595, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=55119 epoch 036: 376 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=555566, ups=1.12, wpb=496408, bsz=16745.4, num_updates=59400, lr=0.0002595, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=55119 epoch 036: 376 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=555566, ups=1.12, wpb=496408, bsz=16745.4, num_updates=59400, lr=0.0002595, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=55119 epoch 036: 376 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=555566, ups=1.12, wpb=496408, bsz=16745.4, num_updates=59400, lr=0.0002595, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=55119 epoch 036: 376 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=555566, ups=1.12, wpb=496408, bsz=16745.4, num_updates=59400, lr=0.0002595, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=55119 epoch 036: 376 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=555566, ups=1.12, wpb=496408, bsz=16745.4, num_updates=59400, lr=0.0002595, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=55119 epoch 036: 376 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=555566, ups=1.12, wpb=496408, bsz=16745.4, num_updates=59400, lr=0.0002595, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=55119 epoch 036: 376 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=555566, ups=1.12, wpb=496408, bsz=16745.4, num_updates=59400, lr=0.0002595, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=55119 epoch 036: 376 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=555566, ups=1.12, wpb=496408, bsz=16745.4, num_updates=59400, lr=0.0002595, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=55119 epoch 036: 376 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=555566, ups=1.12, wpb=496408, bsz=16745.4, num_updates=59400, lr=0.0002595, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=55119 epoch 036: 376 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=555566, ups=1.12, wpb=496408, bsz=16745.4, num_updates=59400, lr=0.0002595, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=55119 epoch 036: 376 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=555566, ups=1.12, wpb=496408, bsz=16745.4, num_updates=59400, lr=0.0002595, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=55119 epoch 036: 376 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=555566, ups=1.12, wpb=496408, bsz=16745.4, num_updates=59400, lr=0.0002595, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=55119 epoch 036: 376 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=555566, ups=1.12, wpb=496408, bsz=16745.4, num_updates=59400, lr=0.0002595, gnorm=0.165, clip=0, loss_scale=1, train_wall=89, gb_free=22.1, wall=55119 epoch 036: 476 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=554890, ups=1.12, wpb=495985, bsz=16627.1, num_updates=59500, lr=0.000259281, gnorm=0.156, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=55208 epoch 036: 476 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=554890, ups=1.12, wpb=495985, bsz=16627.1, num_updates=59500, lr=0.000259281, gnorm=0.156, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=55208 epoch 036: 476 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=554890, ups=1.12, wpb=495985, bsz=16627.1, num_updates=59500, lr=0.000259281, gnorm=0.156, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=55208 epoch 036: 476 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=554890, ups=1.12, wpb=495985, bsz=16627.1, num_updates=59500, lr=0.000259281, gnorm=0.156, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=55208 epoch 036: 476 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=554890, ups=1.12, wpb=495985, bsz=16627.1, num_updates=59500, lr=0.000259281, gnorm=0.156, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=55208 epoch 036: 476 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=554890, ups=1.12, wpb=495985, bsz=16627.1, num_updates=59500, lr=0.000259281, gnorm=0.156, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=55208 epoch 036: 476 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=554890, ups=1.12, wpb=495985, bsz=16627.1, num_updates=59500, lr=0.000259281, gnorm=0.156, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=55208 epoch 036: 476 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=554890, ups=1.12, wpb=495985, bsz=16627.1, num_updates=59500, lr=0.000259281, gnorm=0.156, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=55208 epoch 036: 476 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=554890, ups=1.12, wpb=495985, bsz=16627.1, num_updates=59500, lr=0.000259281, gnorm=0.156, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=55208 epoch 036: 476 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=554890, ups=1.12, wpb=495985, bsz=16627.1, num_updates=59500, lr=0.000259281, gnorm=0.156, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=55208 epoch 036: 476 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=554890, ups=1.12, wpb=495985, bsz=16627.1, num_updates=59500, lr=0.000259281, gnorm=0.156, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=55208 epoch 036: 476 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=554890, ups=1.12, wpb=495985, bsz=16627.1, num_updates=59500, lr=0.000259281, gnorm=0.156, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=55208 epoch 036: 476 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=554890, ups=1.12, wpb=495985, bsz=16627.1, num_updates=59500, lr=0.000259281, gnorm=0.156, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=55208 epoch 036: 476 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=554890, ups=1.12, wpb=495985, bsz=16627.1, num_updates=59500, lr=0.000259281, gnorm=0.156, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=55208 epoch 036: 476 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=554890, ups=1.12, wpb=495985, bsz=16627.1, num_updates=59500, lr=0.000259281, gnorm=0.156, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=55208 epoch 036: 476 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=554890, ups=1.12, wpb=495985, bsz=16627.1, num_updates=59500, lr=0.000259281, gnorm=0.156, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=55208 epoch 036: 476 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=554890, ups=1.12, wpb=495985, bsz=16627.1, num_updates=59500, lr=0.000259281, gnorm=0.156, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=55208 epoch 036: 476 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=554890, ups=1.12, wpb=495985, bsz=16627.1, num_updates=59500, lr=0.000259281, gnorm=0.156, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=55208 epoch 036: 476 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=554890, ups=1.12, wpb=495985, bsz=16627.1, num_updates=59500, lr=0.000259281, gnorm=0.156, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=55208 epoch 036: 476 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=554890, ups=1.12, wpb=495985, bsz=16627.1, num_updates=59500, lr=0.000259281, gnorm=0.156, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=55208 epoch 036: 476 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=554890, ups=1.12, wpb=495985, bsz=16627.1, num_updates=59500, lr=0.000259281, gnorm=0.156, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=55208 epoch 036: 476 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=554890, ups=1.12, wpb=495985, bsz=16627.1, num_updates=59500, lr=0.000259281, gnorm=0.156, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=55208 epoch 036: 476 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=554890, ups=1.12, wpb=495985, bsz=16627.1, num_updates=59500, lr=0.000259281, gnorm=0.156, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=55208 epoch 036: 476 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=554890, ups=1.12, wpb=495985, bsz=16627.1, num_updates=59500, lr=0.000259281, gnorm=0.156, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=55208 epoch 036: 476 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=554890, ups=1.12, wpb=495985, bsz=16627.1, num_updates=59500, lr=0.000259281, gnorm=0.156, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=55208 epoch 036: 476 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=554890, ups=1.12, wpb=495985, bsz=16627.1, num_updates=59500, lr=0.000259281, gnorm=0.156, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=55208 epoch 036: 476 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=554890, ups=1.12, wpb=495985, bsz=16627.1, num_updates=59500, lr=0.000259281, gnorm=0.156, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=55208 epoch 036: 476 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=554890, ups=1.12, wpb=495985, bsz=16627.1, num_updates=59500, lr=0.000259281, gnorm=0.156, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=55208 epoch 036: 476 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=554890, ups=1.12, wpb=495985, bsz=16627.1, num_updates=59500, lr=0.000259281, gnorm=0.156, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=55208 epoch 036: 476 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=554890, ups=1.12, wpb=495985, bsz=16627.1, num_updates=59500, lr=0.000259281, gnorm=0.156, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=55208 epoch 036: 476 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=554890, ups=1.12, wpb=495985, bsz=16627.1, num_updates=59500, lr=0.000259281, gnorm=0.156, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=55208 epoch 036: 476 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=554890, ups=1.12, wpb=495985, bsz=16627.1, num_updates=59500, lr=0.000259281, gnorm=0.156, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=55208 epoch 036: 476 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=554890, ups=1.12, wpb=495985, bsz=16627.1, num_updates=59500, lr=0.000259281, gnorm=0.156, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=55208 epoch 036: 476 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=554890, ups=1.12, wpb=495985, bsz=16627.1, num_updates=59500, lr=0.000259281, gnorm=0.156, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=55208 epoch 036: 476 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=554890, ups=1.12, wpb=495985, bsz=16627.1, num_updates=59500, lr=0.000259281, gnorm=0.156, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=55208 epoch 036: 476 / 1689 loss=3.644, nll_loss=2.104, ppl=4.3, wps=554890, ups=1.12, wpb=495985, bsz=16627.1, num_updates=59500, lr=0.000259281, gnorm=0.156, clip=0, loss_scale=1, train_wall=89, gb_free=21.9, wall=55208 epoch 036: 576 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=554476, ups=1.12, wpb=495190, bsz=16375, num_updates=59600, lr=0.000259064, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=55297 epoch 036: 576 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=554476, ups=1.12, wpb=495190, bsz=16375, num_updates=59600, lr=0.000259064, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=55297 epoch 036: 576 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=554476, ups=1.12, wpb=495190, bsz=16375, num_updates=59600, lr=0.000259064, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=55297 epoch 036: 576 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=554476, ups=1.12, wpb=495190, bsz=16375, num_updates=59600, lr=0.000259064, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=55297 epoch 036: 576 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=554476, ups=1.12, wpb=495190, bsz=16375, num_updates=59600, lr=0.000259064, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=55297 epoch 036: 576 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=554476, ups=1.12, wpb=495190, bsz=16375, num_updates=59600, lr=0.000259064, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=55297 epoch 036: 576 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=554476, ups=1.12, wpb=495190, bsz=16375, num_updates=59600, lr=0.000259064, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=55297 epoch 036: 576 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=554476, ups=1.12, wpb=495190, bsz=16375, num_updates=59600, lr=0.000259064, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=55297 epoch 036: 576 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=554476, ups=1.12, wpb=495190, bsz=16375, num_updates=59600, lr=0.000259064, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=55297 epoch 036: 576 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=554476, ups=1.12, wpb=495190, bsz=16375, num_updates=59600, lr=0.000259064, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=55297 epoch 036: 576 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=554476, ups=1.12, wpb=495190, bsz=16375, num_updates=59600, lr=0.000259064, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=55297 epoch 036: 576 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=554476, ups=1.12, wpb=495190, bsz=16375, num_updates=59600, lr=0.000259064, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=55297 epoch 036: 576 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=554476, ups=1.12, wpb=495190, bsz=16375, num_updates=59600, lr=0.000259064, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=55297 epoch 036: 576 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=554476, ups=1.12, wpb=495190, bsz=16375, num_updates=59600, lr=0.000259064, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=55297 epoch 036: 576 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=554476, ups=1.12, wpb=495190, bsz=16375, num_updates=59600, lr=0.000259064, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=55297 epoch 036: 576 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=554476, ups=1.12, wpb=495190, bsz=16375, num_updates=59600, lr=0.000259064, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=55297 epoch 036: 576 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=554476, ups=1.12, wpb=495190, bsz=16375, num_updates=59600, lr=0.000259064, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=55297 epoch 036: 576 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=554476, ups=1.12, wpb=495190, bsz=16375, num_updates=59600, lr=0.000259064, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=55297 epoch 036: 576 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=554476, ups=1.12, wpb=495190, bsz=16375, num_updates=59600, lr=0.000259064, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=55297 epoch 036: 576 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=554476, ups=1.12, wpb=495190, bsz=16375, num_updates=59600, lr=0.000259064, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=55297 epoch 036: 576 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=554476, ups=1.12, wpb=495190, bsz=16375, num_updates=59600, lr=0.000259064, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=55297 epoch 036: 576 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=554476, ups=1.12, wpb=495190, bsz=16375, num_updates=59600, lr=0.000259064, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=55297 epoch 036: 576 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=554476, ups=1.12, wpb=495190, bsz=16375, num_updates=59600, lr=0.000259064, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=55297 epoch 036: 576 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=554476, ups=1.12, wpb=495190, bsz=16375, num_updates=59600, lr=0.000259064, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=55297 epoch 036: 576 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=554476, ups=1.12, wpb=495190, bsz=16375, num_updates=59600, lr=0.000259064, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=55297 epoch 036: 576 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=554476, ups=1.12, wpb=495190, bsz=16375, num_updates=59600, lr=0.000259064, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=55297 epoch 036: 576 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=554476, ups=1.12, wpb=495190, bsz=16375, num_updates=59600, lr=0.000259064, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=55297 epoch 036: 576 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=554476, ups=1.12, wpb=495190, bsz=16375, num_updates=59600, lr=0.000259064, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=55297 epoch 036: 576 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=554476, ups=1.12, wpb=495190, bsz=16375, num_updates=59600, lr=0.000259064, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=55297 epoch 036: 576 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=554476, ups=1.12, wpb=495190, bsz=16375, num_updates=59600, lr=0.000259064, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=55297 epoch 036: 576 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=554476, ups=1.12, wpb=495190, bsz=16375, num_updates=59600, lr=0.000259064, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=55297 epoch 036: 576 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=554476, ups=1.12, wpb=495190, bsz=16375, num_updates=59600, lr=0.000259064, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=55297 epoch 036: 576 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=554476, ups=1.12, wpb=495190, bsz=16375, num_updates=59600, lr=0.000259064, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=55297 epoch 036: 576 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=554476, ups=1.12, wpb=495190, bsz=16375, num_updates=59600, lr=0.000259064, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=55297 epoch 036: 576 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=554476, ups=1.12, wpb=495190, bsz=16375, num_updates=59600, lr=0.000259064, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=55297 epoch 036: 576 / 1689 loss=3.654, nll_loss=2.115, ppl=4.33, wps=554476, ups=1.12, wpb=495190, bsz=16375, num_updates=59600, lr=0.000259064, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=20.9, wall=55297 epoch 036: 676 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=546716, ups=1.11, wpb=494026, bsz=16181, num_updates=59700, lr=0.000258847, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=55388 epoch 036: 676 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=546716, ups=1.11, wpb=494026, bsz=16181, num_updates=59700, lr=0.000258847, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=55388 epoch 036: 676 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=546716, ups=1.11, wpb=494026, bsz=16181, num_updates=59700, lr=0.000258847, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=55388 epoch 036: 676 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=546716, ups=1.11, wpb=494026, bsz=16181, num_updates=59700, lr=0.000258847, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=55388 epoch 036: 676 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=546716, ups=1.11, wpb=494026, bsz=16181, num_updates=59700, lr=0.000258847, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=55388 epoch 036: 676 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=546716, ups=1.11, wpb=494026, bsz=16181, num_updates=59700, lr=0.000258847, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=55388 epoch 036: 676 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=546716, ups=1.11, wpb=494026, bsz=16181, num_updates=59700, lr=0.000258847, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=55388 epoch 036: 676 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=546716, ups=1.11, wpb=494026, bsz=16181, num_updates=59700, lr=0.000258847, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=55388 epoch 036: 676 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=546716, ups=1.11, wpb=494026, bsz=16181, num_updates=59700, lr=0.000258847, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=55388 epoch 036: 676 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=546716, ups=1.11, wpb=494026, bsz=16181, num_updates=59700, lr=0.000258847, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=55388 epoch 036: 676 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=546716, ups=1.11, wpb=494026, bsz=16181, num_updates=59700, lr=0.000258847, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=55388 epoch 036: 676 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=546716, ups=1.11, wpb=494026, bsz=16181, num_updates=59700, lr=0.000258847, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=55388 epoch 036: 676 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=546716, ups=1.11, wpb=494026, bsz=16181, num_updates=59700, lr=0.000258847, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=55388 epoch 036: 676 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=546716, ups=1.11, wpb=494026, bsz=16181, num_updates=59700, lr=0.000258847, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=55388 epoch 036: 676 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=546716, ups=1.11, wpb=494026, bsz=16181, num_updates=59700, lr=0.000258847, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=55388 epoch 036: 676 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=546716, ups=1.11, wpb=494026, bsz=16181, num_updates=59700, lr=0.000258847, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=55388 epoch 036: 676 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=546716, ups=1.11, wpb=494026, bsz=16181, num_updates=59700, lr=0.000258847, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=55388 epoch 036: 676 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=546716, ups=1.11, wpb=494026, bsz=16181, num_updates=59700, lr=0.000258847, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=55388 epoch 036: 676 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=546716, ups=1.11, wpb=494026, bsz=16181, num_updates=59700, lr=0.000258847, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=55388 epoch 036: 676 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=546716, ups=1.11, wpb=494026, bsz=16181, num_updates=59700, lr=0.000258847, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=55388 epoch 036: 676 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=546716, ups=1.11, wpb=494026, bsz=16181, num_updates=59700, lr=0.000258847, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=55388 epoch 036: 676 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=546716, ups=1.11, wpb=494026, bsz=16181, num_updates=59700, lr=0.000258847, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=55388 epoch 036: 676 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=546716, ups=1.11, wpb=494026, bsz=16181, num_updates=59700, lr=0.000258847, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=55388 epoch 036: 676 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=546716, ups=1.11, wpb=494026, bsz=16181, num_updates=59700, lr=0.000258847, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=55388 epoch 036: 676 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=546716, ups=1.11, wpb=494026, bsz=16181, num_updates=59700, lr=0.000258847, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=55388 epoch 036: 676 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=546716, ups=1.11, wpb=494026, bsz=16181, num_updates=59700, lr=0.000258847, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=55388 epoch 036: 676 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=546716, ups=1.11, wpb=494026, bsz=16181, num_updates=59700, lr=0.000258847, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=55388 epoch 036: 676 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=546716, ups=1.11, wpb=494026, bsz=16181, num_updates=59700, lr=0.000258847, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=55388 epoch 036: 676 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=546716, ups=1.11, wpb=494026, bsz=16181, num_updates=59700, lr=0.000258847, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=55388 epoch 036: 676 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=546716, ups=1.11, wpb=494026, bsz=16181, num_updates=59700, lr=0.000258847, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=55388 epoch 036: 676 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=546716, ups=1.11, wpb=494026, bsz=16181, num_updates=59700, lr=0.000258847, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=55388 epoch 036: 676 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=546716, ups=1.11, wpb=494026, bsz=16181, num_updates=59700, lr=0.000258847, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=55388 epoch 036: 676 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=546716, ups=1.11, wpb=494026, bsz=16181, num_updates=59700, lr=0.000258847, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=55388 epoch 036: 676 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=546716, ups=1.11, wpb=494026, bsz=16181, num_updates=59700, lr=0.000258847, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=55388 epoch 036: 676 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=546716, ups=1.11, wpb=494026, bsz=16181, num_updates=59700, lr=0.000258847, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=55388 epoch 036: 676 / 1689 loss=3.655, nll_loss=2.117, ppl=4.34, wps=546716, ups=1.11, wpb=494026, bsz=16181, num_updates=59700, lr=0.000258847, gnorm=0.166, clip=0, loss_scale=1, train_wall=89, gb_free=21.5, wall=55388 epoch 036: 776 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=549301, ups=1.11, wpb=494093, bsz=16347.4, num_updates=59800, lr=0.00025863, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=55478 epoch 036: 776 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=549301, ups=1.11, wpb=494093, bsz=16347.4, num_updates=59800, lr=0.00025863, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=55478 epoch 036: 776 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=549301, ups=1.11, wpb=494093, bsz=16347.4, num_updates=59800, lr=0.00025863, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=55478 epoch 036: 776 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=549301, ups=1.11, wpb=494093, bsz=16347.4, num_updates=59800, lr=0.00025863, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=55478 epoch 036: 776 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=549301, ups=1.11, wpb=494093, bsz=16347.4, num_updates=59800, lr=0.00025863, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=55478 epoch 036: 776 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=549301, ups=1.11, wpb=494093, bsz=16347.4, num_updates=59800, lr=0.00025863, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=55478 epoch 036: 776 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=549301, ups=1.11, wpb=494093, bsz=16347.4, num_updates=59800, lr=0.00025863, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=55478 epoch 036: 776 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=549301, ups=1.11, wpb=494093, bsz=16347.4, num_updates=59800, lr=0.00025863, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=55478 epoch 036: 776 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=549301, ups=1.11, wpb=494093, bsz=16347.4, num_updates=59800, lr=0.00025863, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=55478 epoch 036: 776 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=549301, ups=1.11, wpb=494093, bsz=16347.4, num_updates=59800, lr=0.00025863, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=55478 epoch 036: 776 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=549301, ups=1.11, wpb=494093, bsz=16347.4, num_updates=59800, lr=0.00025863, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=55478 epoch 036: 776 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=549301, ups=1.11, wpb=494093, bsz=16347.4, num_updates=59800, lr=0.00025863, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=55478 epoch 036: 776 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=549301, ups=1.11, wpb=494093, bsz=16347.4, num_updates=59800, lr=0.00025863, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=55478 epoch 036: 776 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=549301, ups=1.11, wpb=494093, bsz=16347.4, num_updates=59800, lr=0.00025863, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=55478 epoch 036: 776 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=549301, ups=1.11, wpb=494093, bsz=16347.4, num_updates=59800, lr=0.00025863, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=55478 epoch 036: 776 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=549301, ups=1.11, wpb=494093, bsz=16347.4, num_updates=59800, lr=0.00025863, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=55478 epoch 036: 776 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=549301, ups=1.11, wpb=494093, bsz=16347.4, num_updates=59800, lr=0.00025863, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=55478 epoch 036: 776 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=549301, ups=1.11, wpb=494093, bsz=16347.4, num_updates=59800, lr=0.00025863, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=55478 epoch 036: 776 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=549301, ups=1.11, wpb=494093, bsz=16347.4, num_updates=59800, lr=0.00025863, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=55478 epoch 036: 776 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=549301, ups=1.11, wpb=494093, bsz=16347.4, num_updates=59800, lr=0.00025863, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=55478 epoch 036: 776 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=549301, ups=1.11, wpb=494093, bsz=16347.4, num_updates=59800, lr=0.00025863, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=55478 epoch 036: 776 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=549301, ups=1.11, wpb=494093, bsz=16347.4, num_updates=59800, lr=0.00025863, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=55478 epoch 036: 776 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=549301, ups=1.11, wpb=494093, bsz=16347.4, num_updates=59800, lr=0.00025863, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=55478 epoch 036: 776 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=549301, ups=1.11, wpb=494093, bsz=16347.4, num_updates=59800, lr=0.00025863, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=55478 epoch 036: 776 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=549301, ups=1.11, wpb=494093, bsz=16347.4, num_updates=59800, lr=0.00025863, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=55478 epoch 036: 776 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=549301, ups=1.11, wpb=494093, bsz=16347.4, num_updates=59800, lr=0.00025863, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=55478 epoch 036: 776 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=549301, ups=1.11, wpb=494093, bsz=16347.4, num_updates=59800, lr=0.00025863, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=55478 epoch 036: 776 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=549301, ups=1.11, wpb=494093, bsz=16347.4, num_updates=59800, lr=0.00025863, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=55478 epoch 036: 776 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=549301, ups=1.11, wpb=494093, bsz=16347.4, num_updates=59800, lr=0.00025863, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=55478 epoch 036: 776 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=549301, ups=1.11, wpb=494093, bsz=16347.4, num_updates=59800, lr=0.00025863, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=55478 epoch 036: 776 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=549301, ups=1.11, wpb=494093, bsz=16347.4, num_updates=59800, lr=0.00025863, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=55478 epoch 036: 776 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=549301, ups=1.11, wpb=494093, bsz=16347.4, num_updates=59800, lr=0.00025863, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=55478 epoch 036: 776 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=549301, ups=1.11, wpb=494093, bsz=16347.4, num_updates=59800, lr=0.00025863, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=55478 epoch 036: 776 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=549301, ups=1.11, wpb=494093, bsz=16347.4, num_updates=59800, lr=0.00025863, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=55478 epoch 036: 776 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=549301, ups=1.11, wpb=494093, bsz=16347.4, num_updates=59800, lr=0.00025863, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=55478 epoch 036: 776 / 1689 loss=3.662, nll_loss=2.124, ppl=4.36, wps=549301, ups=1.11, wpb=494093, bsz=16347.4, num_updates=59800, lr=0.00025863, gnorm=0.162, clip=0, loss_scale=2, train_wall=89, gb_free=21.3, wall=55478 epoch 036: 876 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548411, ups=1.11, wpb=494105, bsz=16375.2, num_updates=59900, lr=0.000258414, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=55568 epoch 036: 876 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548411, ups=1.11, wpb=494105, bsz=16375.2, num_updates=59900, lr=0.000258414, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=55568 epoch 036: 876 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548411, ups=1.11, wpb=494105, bsz=16375.2, num_updates=59900, lr=0.000258414, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=55568 epoch 036: 876 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548411, ups=1.11, wpb=494105, bsz=16375.2, num_updates=59900, lr=0.000258414, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=55568 epoch 036: 876 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548411, ups=1.11, wpb=494105, bsz=16375.2, num_updates=59900, lr=0.000258414, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=55568 epoch 036: 876 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548411, ups=1.11, wpb=494105, bsz=16375.2, num_updates=59900, lr=0.000258414, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=55568 epoch 036: 876 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548411, ups=1.11, wpb=494105, bsz=16375.2, num_updates=59900, lr=0.000258414, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=55568 epoch 036: 876 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548411, ups=1.11, wpb=494105, bsz=16375.2, num_updates=59900, lr=0.000258414, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=55568 epoch 036: 876 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548411, ups=1.11, wpb=494105, bsz=16375.2, num_updates=59900, lr=0.000258414, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=55568 epoch 036: 876 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548411, ups=1.11, wpb=494105, bsz=16375.2, num_updates=59900, lr=0.000258414, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=55568 epoch 036: 876 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548411, ups=1.11, wpb=494105, bsz=16375.2, num_updates=59900, lr=0.000258414, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=55568 epoch 036: 876 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548411, ups=1.11, wpb=494105, bsz=16375.2, num_updates=59900, lr=0.000258414, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=55568 epoch 036: 876 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548411, ups=1.11, wpb=494105, bsz=16375.2, num_updates=59900, lr=0.000258414, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=55568 epoch 036: 876 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548411, ups=1.11, wpb=494105, bsz=16375.2, num_updates=59900, lr=0.000258414, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=55568 epoch 036: 876 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548411, ups=1.11, wpb=494105, bsz=16375.2, num_updates=59900, lr=0.000258414, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=55568 epoch 036: 876 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548411, ups=1.11, wpb=494105, bsz=16375.2, num_updates=59900, lr=0.000258414, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=55568 epoch 036: 876 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548411, ups=1.11, wpb=494105, bsz=16375.2, num_updates=59900, lr=0.000258414, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=55568 epoch 036: 876 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548411, ups=1.11, wpb=494105, bsz=16375.2, num_updates=59900, lr=0.000258414, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=55568 epoch 036: 876 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548411, ups=1.11, wpb=494105, bsz=16375.2, num_updates=59900, lr=0.000258414, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=55568 epoch 036: 876 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548411, ups=1.11, wpb=494105, bsz=16375.2, num_updates=59900, lr=0.000258414, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=55568 epoch 036: 876 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548411, ups=1.11, wpb=494105, bsz=16375.2, num_updates=59900, lr=0.000258414, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=55568 epoch 036: 876 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548411, ups=1.11, wpb=494105, bsz=16375.2, num_updates=59900, lr=0.000258414, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=55568 epoch 036: 876 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548411, ups=1.11, wpb=494105, bsz=16375.2, num_updates=59900, lr=0.000258414, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=55568 epoch 036: 876 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548411, ups=1.11, wpb=494105, bsz=16375.2, num_updates=59900, lr=0.000258414, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=55568 epoch 036: 876 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548411, ups=1.11, wpb=494105, bsz=16375.2, num_updates=59900, lr=0.000258414, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=55568 epoch 036: 876 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548411, ups=1.11, wpb=494105, bsz=16375.2, num_updates=59900, lr=0.000258414, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=55568 epoch 036: 876 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548411, ups=1.11, wpb=494105, bsz=16375.2, num_updates=59900, lr=0.000258414, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=55568 epoch 036: 876 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548411, ups=1.11, wpb=494105, bsz=16375.2, num_updates=59900, lr=0.000258414, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=55568 epoch 036: 876 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548411, ups=1.11, wpb=494105, bsz=16375.2, num_updates=59900, lr=0.000258414, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=55568 epoch 036: 876 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548411, ups=1.11, wpb=494105, bsz=16375.2, num_updates=59900, lr=0.000258414, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=55568 epoch 036: 876 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548411, ups=1.11, wpb=494105, bsz=16375.2, num_updates=59900, lr=0.000258414, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=55568 epoch 036: 876 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548411, ups=1.11, wpb=494105, bsz=16375.2, num_updates=59900, lr=0.000258414, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=55568 epoch 036: 876 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548411, ups=1.11, wpb=494105, bsz=16375.2, num_updates=59900, lr=0.000258414, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=55568 epoch 036: 876 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548411, ups=1.11, wpb=494105, bsz=16375.2, num_updates=59900, lr=0.000258414, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=55568 epoch 036: 876 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548411, ups=1.11, wpb=494105, bsz=16375.2, num_updates=59900, lr=0.000258414, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=55568 epoch 036: 876 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548411, ups=1.11, wpb=494105, bsz=16375.2, num_updates=59900, lr=0.000258414, gnorm=0.164, clip=0, loss_scale=2, train_wall=89, gb_free=22.3, wall=55568 epoch 036: 976 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548613, ups=1.11, wpb=495296, bsz=16499.8, num_updates=60000, lr=0.000258199, gnorm=0.16, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=55658 epoch 036: 976 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548613, ups=1.11, wpb=495296, bsz=16499.8, num_updates=60000, lr=0.000258199, gnorm=0.16, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=55658 epoch 036: 976 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548613, ups=1.11, wpb=495296, bsz=16499.8, num_updates=60000, lr=0.000258199, gnorm=0.16, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=55658 epoch 036: 976 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548613, ups=1.11, wpb=495296, bsz=16499.8, num_updates=60000, lr=0.000258199, gnorm=0.16, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=55658 epoch 036: 976 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548613, ups=1.11, wpb=495296, bsz=16499.8, num_updates=60000, lr=0.000258199, gnorm=0.16, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=55658 epoch 036: 976 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548613, ups=1.11, wpb=495296, bsz=16499.8, num_updates=60000, lr=0.000258199, gnorm=0.16, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=55658 epoch 036: 976 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548613, ups=1.11, wpb=495296, bsz=16499.8, num_updates=60000, lr=0.000258199, gnorm=0.16, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=55658 epoch 036: 976 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548613, ups=1.11, wpb=495296, bsz=16499.8, num_updates=60000, lr=0.000258199, gnorm=0.16, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=55658 epoch 036: 976 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548613, ups=1.11, wpb=495296, bsz=16499.8, num_updates=60000, lr=0.000258199, gnorm=0.16, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=55658 epoch 036: 976 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548613, ups=1.11, wpb=495296, bsz=16499.8, num_updates=60000, lr=0.000258199, gnorm=0.16, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=55658 epoch 036: 976 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548613, ups=1.11, wpb=495296, bsz=16499.8, num_updates=60000, lr=0.000258199, gnorm=0.16, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=55658 epoch 036: 976 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548613, ups=1.11, wpb=495296, bsz=16499.8, num_updates=60000, lr=0.000258199, gnorm=0.16, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=55658 epoch 036: 976 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548613, ups=1.11, wpb=495296, bsz=16499.8, num_updates=60000, lr=0.000258199, gnorm=0.16, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=55658 epoch 036: 976 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548613, ups=1.11, wpb=495296, bsz=16499.8, num_updates=60000, lr=0.000258199, gnorm=0.16, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=55658 epoch 036: 976 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548613, ups=1.11, wpb=495296, bsz=16499.8, num_updates=60000, lr=0.000258199, gnorm=0.16, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=55658 epoch 036: 976 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548613, ups=1.11, wpb=495296, bsz=16499.8, num_updates=60000, lr=0.000258199, gnorm=0.16, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=55658 epoch 036: 976 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548613, ups=1.11, wpb=495296, bsz=16499.8, num_updates=60000, lr=0.000258199, gnorm=0.16, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=55658 epoch 036: 976 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548613, ups=1.11, wpb=495296, bsz=16499.8, num_updates=60000, lr=0.000258199, gnorm=0.16, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=55658 epoch 036: 976 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548613, ups=1.11, wpb=495296, bsz=16499.8, num_updates=60000, lr=0.000258199, gnorm=0.16, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=55658 epoch 036: 976 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548613, ups=1.11, wpb=495296, bsz=16499.8, num_updates=60000, lr=0.000258199, gnorm=0.16, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=55658 epoch 036: 976 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548613, ups=1.11, wpb=495296, bsz=16499.8, num_updates=60000, lr=0.000258199, gnorm=0.16, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=55658 epoch 036: 976 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548613, ups=1.11, wpb=495296, bsz=16499.8, num_updates=60000, lr=0.000258199, gnorm=0.16, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=55658 epoch 036: 976 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548613, ups=1.11, wpb=495296, bsz=16499.8, num_updates=60000, lr=0.000258199, gnorm=0.16, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=55658 epoch 036: 976 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548613, ups=1.11, wpb=495296, bsz=16499.8, num_updates=60000, lr=0.000258199, gnorm=0.16, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=55658 epoch 036: 976 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548613, ups=1.11, wpb=495296, bsz=16499.8, num_updates=60000, lr=0.000258199, gnorm=0.16, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=55658 epoch 036: 976 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548613, ups=1.11, wpb=495296, bsz=16499.8, num_updates=60000, lr=0.000258199, gnorm=0.16, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=55658 epoch 036: 976 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548613, ups=1.11, wpb=495296, bsz=16499.8, num_updates=60000, lr=0.000258199, gnorm=0.16, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=55658 epoch 036: 976 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548613, ups=1.11, wpb=495296, bsz=16499.8, num_updates=60000, lr=0.000258199, gnorm=0.16, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=55658 epoch 036: 976 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548613, ups=1.11, wpb=495296, bsz=16499.8, num_updates=60000, lr=0.000258199, gnorm=0.16, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=55658 epoch 036: 976 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548613, ups=1.11, wpb=495296, bsz=16499.8, num_updates=60000, lr=0.000258199, gnorm=0.16, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=55658 epoch 036: 976 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548613, ups=1.11, wpb=495296, bsz=16499.8, num_updates=60000, lr=0.000258199, gnorm=0.16, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=55658 epoch 036: 976 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548613, ups=1.11, wpb=495296, bsz=16499.8, num_updates=60000, lr=0.000258199, gnorm=0.16, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=55658 epoch 036: 976 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548613, ups=1.11, wpb=495296, bsz=16499.8, num_updates=60000, lr=0.000258199, gnorm=0.16, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=55658 epoch 036: 976 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548613, ups=1.11, wpb=495296, bsz=16499.8, num_updates=60000, lr=0.000258199, gnorm=0.16, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=55658 epoch 036: 976 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548613, ups=1.11, wpb=495296, bsz=16499.8, num_updates=60000, lr=0.000258199, gnorm=0.16, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=55658 epoch 036: 976 / 1689 loss=3.655, nll_loss=2.116, ppl=4.34, wps=548613, ups=1.11, wpb=495296, bsz=16499.8, num_updates=60000, lr=0.000258199, gnorm=0.16, clip=0, loss_scale=2, train_wall=89, gb_free=21.7, wall=55658 Stopping training due to num_updates: 60000 >= max_update: 60000 begin validation on "valid" subset epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.696 epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.696 epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.696 epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.696 epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.696 epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.696 epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.696 epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.696 epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.696 epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.696 epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.696 epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.696 epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.696 epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.696 epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.696 epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.696 epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.696 epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.696 epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.696 epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.696 epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.696 epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.696 epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.696 epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.696 epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.696 epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.696 epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.696 epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.696 epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.696 epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.696 epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.696 epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.696 epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.696 epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.696 epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.696 epoch 036 | valid on 'valid' subset | loss 3.696 | nll_loss 2.137 | ppl 4.4 | wps 0 | wpb 44526 | bsz 2008 | num_updates 60000 | best_loss 3.696 end of epoch 36 (average epoch stats below) epoch 036 | loss 3.652 | nll_loss 2.113 | ppl 4.32 | wps 536894 | ups 1.09 | wpb 494792 | bsz 16453.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.164 | clip 0 | loss_scale 2 | train_wall 865 | gb_free 21.7 | wall 55679 epoch 036 | loss 3.652 | nll_loss 2.113 | ppl 4.32 | wps 536894 | ups 1.09 | wpb 494792 | bsz 16453.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.164 | clip 0 | loss_scale 2 | train_wall 865 | gb_free 21.7 | wall 55679 epoch 036 | loss 3.652 | nll_loss 2.113 | ppl 4.32 | wps 536894 | ups 1.09 | wpb 494792 | bsz 16453.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.164 | clip 0 | loss_scale 2 | train_wall 865 | gb_free 21.7 | wall 55679 epoch 036 | loss 3.652 | nll_loss 2.113 | ppl 4.32 | wps 536894 | ups 1.09 | wpb 494792 | bsz 16453.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.164 | clip 0 | loss_scale 2 | train_wall 865 | gb_free 21.7 | wall 55679 epoch 036 | loss 3.652 | nll_loss 2.113 | ppl 4.32 | wps 536894 | ups 1.09 | wpb 494792 | bsz 16453.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.164 | clip 0 | loss_scale 2 | train_wall 865 | gb_free 21.7 | wall 55679 epoch 036 | loss 3.652 | nll_loss 2.113 | ppl 4.32 | wps 536894 | ups 1.09 | wpb 494792 | bsz 16453.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.164 | clip 0 | loss_scale 2 | train_wall 865 | gb_free 21.7 | wall 55679 epoch 036 | loss 3.652 | nll_loss 2.113 | ppl 4.32 | wps 536894 | ups 1.09 | wpb 494792 | bsz 16453.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.164 | clip 0 | loss_scale 2 | train_wall 865 | gb_free 21.7 | wall 55679 epoch 036 | loss 3.652 | nll_loss 2.113 | ppl 4.32 | wps 536894 | ups 1.09 | wpb 494792 | bsz 16453.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.164 | clip 0 | loss_scale 2 | train_wall 865 | gb_free 21.7 | wall 55679 epoch 036 | loss 3.652 | nll_loss 2.113 | ppl 4.32 | wps 536894 | ups 1.09 | wpb 494792 | bsz 16453.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.164 | clip 0 | loss_scale 2 | train_wall 865 | gb_free 21.7 | wall 55679 epoch 036 | loss 3.652 | nll_loss 2.113 | ppl 4.32 | wps 536894 | ups 1.09 | wpb 494792 | bsz 16453.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.164 | clip 0 | loss_scale 2 | train_wall 865 | gb_free 21.7 | wall 55679 epoch 036 | loss 3.652 | nll_loss 2.113 | ppl 4.32 | wps 536894 | ups 1.09 | wpb 494792 | bsz 16453.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.164 | clip 0 | loss_scale 2 | train_wall 865 | gb_free 21.7 | wall 55679 epoch 036 | loss 3.652 | nll_loss 2.113 | ppl 4.32 | wps 536894 | ups 1.09 | wpb 494792 | bsz 16453.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.164 | clip 0 | loss_scale 2 | train_wall 865 | gb_free 21.7 | wall 55679 epoch 036 | loss 3.652 | nll_loss 2.113 | ppl 4.32 | wps 536894 | ups 1.09 | wpb 494792 | bsz 16453.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.164 | clip 0 | loss_scale 2 | train_wall 865 | gb_free 21.7 | wall 55679 epoch 036 | loss 3.652 | nll_loss 2.113 | ppl 4.32 | wps 536894 | ups 1.09 | wpb 494792 | bsz 16453.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.164 | clip 0 | loss_scale 2 | train_wall 865 | gb_free 21.7 | wall 55679 epoch 036 | loss 3.652 | nll_loss 2.113 | ppl 4.32 | wps 536894 | ups 1.09 | wpb 494792 | bsz 16453.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.164 | clip 0 | loss_scale 2 | train_wall 865 | gb_free 21.7 | wall 55679 epoch 036 | loss 3.652 | nll_loss 2.113 | ppl 4.32 | wps 536894 | ups 1.09 | wpb 494792 | bsz 16453.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.164 | clip 0 | loss_scale 2 | train_wall 865 | gb_free 21.7 | wall 55679 epoch 036 | loss 3.652 | nll_loss 2.113 | ppl 4.32 | wps 536894 | ups 1.09 | wpb 494792 | bsz 16453.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.164 | clip 0 | loss_scale 2 | train_wall 865 | gb_free 21.7 | wall 55679 epoch 036 | loss 3.652 | nll_loss 2.113 | ppl 4.32 | wps 536894 | ups 1.09 | wpb 494792 | bsz 16453.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.164 | clip 0 | loss_scale 2 | train_wall 865 | gb_free 21.7 | wall 55679 epoch 036 | loss 3.652 | nll_loss 2.113 | ppl 4.32 | wps 536894 | ups 1.09 | wpb 494792 | bsz 16453.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.164 | clip 0 | loss_scale 2 | train_wall 865 | gb_free 21.7 | wall 55679 epoch 036 | loss 3.652 | nll_loss 2.113 | ppl 4.32 | wps 536894 | ups 1.09 | wpb 494792 | bsz 16453.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.164 | clip 0 | loss_scale 2 | train_wall 865 | gb_free 21.7 | wall 55679 epoch 036 | loss 3.652 | nll_loss 2.113 | ppl 4.32 | wps 536894 | ups 1.09 | wpb 494792 | bsz 16453.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.164 | clip 0 | loss_scale 2 | train_wall 865 | gb_free 21.7 | wall 55679 epoch 036 | loss 3.652 | nll_loss 2.113 | ppl 4.32 | wps 536894 | ups 1.09 | wpb 494792 | bsz 16453.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.164 | clip 0 | loss_scale 2 | train_wall 865 | gb_free 21.7 | wall 55679 epoch 036 | loss 3.652 | nll_loss 2.113 | ppl 4.32 | wps 536894 | ups 1.09 | wpb 494792 | bsz 16453.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.164 | clip 0 | loss_scale 2 | train_wall 865 | gb_free 21.7 | wall 55679 epoch 036 | loss 3.652 | nll_loss 2.113 | ppl 4.32 | wps 536894 | ups 1.09 | wpb 494792 | bsz 16453.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.164 | clip 0 | loss_scale 2 | train_wall 865 | gb_free 21.7 | wall 55679 epoch 036 | loss 3.652 | nll_loss 2.113 | ppl 4.32 | wps 536894 | ups 1.09 | wpb 494792 | bsz 16453.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.164 | clip 0 | loss_scale 2 | train_wall 865 | gb_free 21.7 | wall 55679 epoch 036 | loss 3.652 | nll_loss 2.113 | ppl 4.32 | wps 536894 | ups 1.09 | wpb 494792 | bsz 16453.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.164 | clip 0 | loss_scale 2 | train_wall 865 | gb_free 21.7 | wall 55679 epoch 036 | loss 3.652 | nll_loss 2.113 | ppl 4.32 | wps 536894 | ups 1.09 | wpb 494792 | bsz 16453.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.164 | clip 0 | loss_scale 2 | train_wall 865 | gb_free 21.7 | wall 55679 epoch 036 | loss 3.652 | nll_loss 2.113 | ppl 4.32 | wps 536894 | ups 1.09 | wpb 494792 | bsz 16453.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.164 | clip 0 | loss_scale 2 | train_wall 865 | gb_free 21.7 | wall 55679 epoch 036 | loss 3.652 | nll_loss 2.113 | ppl 4.32 | wps 536894 | ups 1.09 | wpb 494792 | bsz 16453.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.164 | clip 0 | loss_scale 2 | train_wall 865 | gb_free 21.7 | wall 55679 epoch 036 | loss 3.652 | nll_loss 2.113 | ppl 4.32 | wps 536894 | ups 1.09 | wpb 494792 | bsz 16453.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.164 | clip 0 | loss_scale 2 | train_wall 865 | gb_free 21.7 | wall 55679 epoch 036 | loss 3.652 | nll_loss 2.113 | ppl 4.32 | wps 536894 | ups 1.09 | wpb 494792 | bsz 16453.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.164 | clip 0 | loss_scale 2 | train_wall 865 | gb_free 21.7 | wall 55679 epoch 036 | loss 3.652 | nll_loss 2.113 | ppl 4.32 | wps 536894 | ups 1.09 | wpb 494792 | bsz 16453.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.164 | clip 0 | loss_scale 2 | train_wall 865 | gb_free 21.7 | wall 55679 epoch 036 | loss 3.652 | nll_loss 2.113 | ppl 4.32 | wps 536894 | ups 1.09 | wpb 494792 | bsz 16453.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.164 | clip 0 | loss_scale 2 | train_wall 865 | gb_free 21.7 | wall 55679 epoch 036 | loss 3.652 | nll_loss 2.113 | ppl 4.32 | wps 536894 | ups 1.09 | wpb 494792 | bsz 16453.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.164 | clip 0 | loss_scale 2 | train_wall 865 | gb_free 21.7 | wall 55679 epoch 036 | loss 3.652 | nll_loss 2.113 | ppl 4.32 | wps 536894 | ups 1.09 | wpb 494792 | bsz 16453.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.164 | clip 0 | loss_scale 2 | train_wall 865 | gb_free 21.7 | wall 55679 epoch 036 | loss 3.652 | nll_loss 2.113 | ppl 4.32 | wps 536894 | ups 1.09 | wpb 494792 | bsz 16453.7 | num_updates 60000 | lr 0.000258199 | gnorm 0.164 | clip 0 | loss_scale 2 | train_wall 865 | gb_free 21.7 | wall 55679 done training in 55664.4 seconds